From 8bd356ab2077b5c1a90510e3e73b11698eca0331 Mon Sep 17 00:00:00 2001 From: Erik Rigtorp Date: Mon, 4 May 2020 20:19:10 -0700 Subject: [PATCH 1/5] Reduce number of TMDB lookups if filenames have punctuation chars Previously TMDB would be queried with the raw name and always fail, then retry with the cleaned name. Now non-word chars are always cleaned out first. If first query fails, retry with more aggressive cleaning. --- .../Tmdb/Movies/TmdbSearch.cs | 67 +++++++++++-------- 1 file changed, 40 insertions(+), 27 deletions(-) diff --git a/MediaBrowser.Providers/Tmdb/Movies/TmdbSearch.cs b/MediaBrowser.Providers/Tmdb/Movies/TmdbSearch.cs index 223cef086b..08c1afec28 100644 --- a/MediaBrowser.Providers/Tmdb/Movies/TmdbSearch.cs +++ b/MediaBrowser.Providers/Tmdb/Movies/TmdbSearch.cs @@ -5,6 +5,7 @@ using System.Linq; using System.Net; using System.Threading; using System.Threading.Tasks; +using System.Text.RegularExpressions; using MediaBrowser.Common.Net; using MediaBrowser.Controller.Library; using MediaBrowser.Controller.Providers; @@ -19,6 +20,20 @@ namespace MediaBrowser.Providers.Tmdb.Movies public class TmdbSearch { private static readonly CultureInfo EnUs = new CultureInfo("en-US"); + + private static readonly Regex cleanEnclosed = new Regex(@"\p{Ps}.*\p{Pe}", RegexOptions.Compiled); + private static readonly Regex cleanNonWord = new Regex(@"[\W_]+", RegexOptions.Compiled); + private static readonly Regex cleanStopWords = new Regex(@"\b( # Start at word boundary + 19[0-9]{2}|20[0-9]{2}| # 1900-2099 + S[0-9]{2}| # Season + E[0-9]{2}| # Episode + (2160|1080|720|576|480)[ip]?| # Resolution + [xh]?264| # Encoding + (web|dvd|bd|hdtv|hd)rip| # *Rip + web|hdtv|mp4|bluray|ktr|dl|single|imageset|internal|doku|dubbed|retail|xxx|flac + ).* # Match rest of string", + RegexOptions.Compiled | RegexOptions.IgnorePatternWhitespace | RegexOptions.IgnoreCase); + private const string Search3 = TmdbUtils.BaseTmdbApiUrl + @"3/search/{3}?api_key={1}&query={0}&language={2}"; private
readonly ILogger _logger; @@ -61,19 +76,18 @@ namespace MediaBrowser.Providers.Tmdb.Movies var tmdbImageUrl = tmdbSettings.images.GetImageUrl("original"); - if (!string.IsNullOrWhiteSpace(name)) - { - var parsedName = _libraryManager.ParseName(name); - var yearInName = parsedName.Year; - name = parsedName.Name; - year = year ?? yearInName; - } + // Does this mean we are reparsing already parsed ItemLookupInfo? + var parsedName = _libraryManager.ParseName(name); + var yearInName = parsedName.Year; + name = parsedName.Name; + year = year ?? yearInName; - _logger.LogInformation("MovieDbProvider: Finding id for item: " + name); + _logger.LogInformation("TmdbSearch: Finding id for item: {0} ({1})", name, year); var language = idInfo.MetadataLanguage.ToLowerInvariant(); - //nope - search for it - //var searchType = item is BoxSet ? "collection" : "movie"; + // Replace sequences of non-word characters with space + // TMDB expects a space separated list of words make sure that is the case + name = cleanNonWord.Replace(name, " ").Trim(); var results = await GetSearchResults(name, searchType, year, language, tmdbImageUrl, cancellationToken).ConfigureAwait(false); @@ -86,36 +100,35 @@ namespace MediaBrowser.Providers.Tmdb.Movies } } + // Ideally retrying alternatives should be done outside the search + // provider so that the retry logic can be common for all search + // providers if (results.Count == 0) { - // try with dot and _ turned to space - var originalName = name; - - name = name.Replace(",", " "); - name = name.Replace(".", " "); - name = name.Replace("_", " "); - name = name.Replace("-", " "); - name = name.Replace("!", " "); - name = name.Replace("?", " "); - - var parenthIndex = name.IndexOf('('); - if (parenthIndex != -1) - { - name = name.Substring(0, parenthIndex); - } + name = parsedName.Name; + + // Remove things enclosed in []{}() etc + name = cleanEnclosed.Replace(name, string.Empty); + // Replace sequences of non-word characters with space + name = 
cleanNonWord.Replace(name, " "); + + // Clean based on common stop words / tokens + name = cleanStopWords.Replace(name, string.Empty); + + // Trim whitespace name = name.Trim(); // Search again if the new name is different - if (!string.Equals(name, originalName)) + if (!string.Equals(name, parsedName.Name) && !string.IsNullOrWhiteSpace(name)) { + _logger.LogInformation("TmdbSearch: Finding id for item: {0} ({1})", name, year); results = await GetSearchResults(name, searchType, year, language, tmdbImageUrl, cancellationToken).ConfigureAwait(false); if (results.Count == 0 && !string.Equals(language, "en", StringComparison.OrdinalIgnoreCase)) { //one more time, in english results = await GetSearchResults(name, searchType, year, "en", tmdbImageUrl, cancellationToken).ConfigureAwait(false); - } } } From f7c44565fc8f7cdaa7e7c95f174227ab90dd4afe Mon Sep 17 00:00:00 2001 From: Erik Rigtorp Date: Thu, 7 May 2020 15:47:46 -0700 Subject: [PATCH 2/5] Rename member variables to conform to coding standard --- .../Tmdb/Movies/TmdbSearch.cs | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/MediaBrowser.Providers/Tmdb/Movies/TmdbSearch.cs b/MediaBrowser.Providers/Tmdb/Movies/TmdbSearch.cs index 08c1afec28..47d5012471 100644 --- a/MediaBrowser.Providers/Tmdb/Movies/TmdbSearch.cs +++ b/MediaBrowser.Providers/Tmdb/Movies/TmdbSearch.cs @@ -19,11 +19,11 @@ namespace MediaBrowser.Providers.Tmdb.Movies { public class TmdbSearch { - private static readonly CultureInfo EnUs = new CultureInfo("en-US"); + private static readonly CultureInfo _usCulture = new CultureInfo("en-US"); - private static readonly Regex cleanEnclosed = new Regex(@"\p{Ps}.*\p{Pe}", RegexOptions.Compiled); - private static readonly Regex cleanNonWord = new Regex(@"[\W_]+", RegexOptions.Compiled); - private static readonly Regex cleanStopWords = new Regex(@"\b( # Start at word boundary + private static readonly Regex _cleanEnclosed = new Regex(@"\p{Ps}.*\p{Pe}", 
RegexOptions.Compiled); + private static readonly Regex _cleanNonWord = new Regex(@"[\W_]+", RegexOptions.Compiled); + private static readonly Regex _cleanStopWords = new Regex(@"\b( # Start at word boundary 19[0-9]{2}|20[0-9]{2}| # 1900-2099 S[0-9]{2}| # Season E[0-9]{2}| # Episode @@ -34,7 +34,7 @@ namespace MediaBrowser.Providers.Tmdb.Movies ).* # Match rest of string", RegexOptions.Compiled | RegexOptions.IgnorePatternWhitespace | RegexOptions.IgnoreCase); - private const string Search3 = TmdbUtils.BaseTmdbApiUrl + @"3/search/{3}?api_key={1}&query={0}&language={2}"; + private const string _searchURL = TmdbUtils.BaseTmdbApiUrl + @"3/search/{3}?api_key={1}&query={0}&language={2}"; private readonly ILogger _logger; private readonly IJsonSerializer _json; @@ -87,7 +87,7 @@ namespace MediaBrowser.Providers.Tmdb.Movies // Replace sequences of non-word characters with space // TMDB expects a space separated list of words make sure that is the case - name = cleanNonWord.Replace(name, " ").Trim(); + name = _cleanNonWord.Replace(name, " ").Trim(); var results = await GetSearchResults(name, searchType, year, language, tmdbImageUrl, cancellationToken).ConfigureAwait(false); @@ -108,13 +108,13 @@ namespace MediaBrowser.Providers.Tmdb.Movies name = parsedName.Name; // Remove things enclosed in []{}() etc - name = cleanEnclosed.Replace(name, string.Empty); + name = _cleanEnclosed.Replace(name, string.Empty); // Replace sequences of non-word characters with space - name = cleanNonWord.Replace(name, " "); + name = _cleanNonWord.Replace(name, " "); // Clean based on common stop words / tokens - name = cleanStopWords.Replace(name, string.Empty); + name = _cleanStopWords.Replace(name, string.Empty); // Trim whitespace name = name.Trim(); @@ -163,7 +163,7 @@ namespace MediaBrowser.Providers.Tmdb.Movies throw new ArgumentException("name"); } - var url3 = string.Format(Search3, WebUtility.UrlEncode(name), TmdbUtils.ApiKey, language, type); + var url3 = string.Format(_searchURL, 
WebUtility.UrlEncode(name), TmdbUtils.ApiKey, language, type); using (var response = await TmdbMovieProvider.Current.GetMovieDbResponse(new HttpRequestOptions { @@ -192,14 +192,14 @@ namespace MediaBrowser.Providers.Tmdb.Movies if (!string.IsNullOrWhiteSpace(i.Release_Date)) { // These dates are always in this exact format - if (DateTime.TryParseExact(i.Release_Date, "yyyy-MM-dd", EnUs, DateTimeStyles.None, out var r)) + if (DateTime.TryParseExact(i.Release_Date, "yyyy-MM-dd", _usCulture, DateTimeStyles.None, out var r)) { remoteResult.PremiereDate = r.ToUniversalTime(); remoteResult.ProductionYear = remoteResult.PremiereDate.Value.Year; } } - remoteResult.SetProviderId(MetadataProviders.Tmdb, i.Id.ToString(EnUs)); + remoteResult.SetProviderId(MetadataProviders.Tmdb, i.Id.ToString(_usCulture)); return remoteResult; @@ -216,7 +216,7 @@ namespace MediaBrowser.Providers.Tmdb.Movies throw new ArgumentException("name"); } - var url3 = string.Format(Search3, WebUtility.UrlEncode(name), TmdbUtils.ApiKey, language, "tv"); + var url3 = string.Format(_searchURL, WebUtility.UrlEncode(name), TmdbUtils.ApiKey, language, "tv"); using (var response = await TmdbMovieProvider.Current.GetMovieDbResponse(new HttpRequestOptions { @@ -245,14 +245,14 @@ namespace MediaBrowser.Providers.Tmdb.Movies if (!string.IsNullOrWhiteSpace(i.First_Air_Date)) { // These dates are always in this exact format - if (DateTime.TryParseExact(i.First_Air_Date, "yyyy-MM-dd", EnUs, DateTimeStyles.None, out var r)) + if (DateTime.TryParseExact(i.First_Air_Date, "yyyy-MM-dd", _usCulture, DateTimeStyles.None, out var r)) { remoteResult.PremiereDate = r.ToUniversalTime(); remoteResult.ProductionYear = remoteResult.PremiereDate.Value.Year; } } - remoteResult.SetProviderId(MetadataProviders.Tmdb, i.Id.ToString(EnUs)); + remoteResult.SetProviderId(MetadataProviders.Tmdb, i.Id.ToString(_usCulture)); return remoteResult; From fa1fef109911c734657f854f259681000a75f13a Mon Sep 17 00:00:00 2001 From: Erik Rigtorp Date: 
Thu, 14 May 2020 11:56:25 -0700 Subject: [PATCH 3/5] Update MediaBrowser.Providers/Tmdb/Movies/TmdbSearch.cs Co-authored-by: Vasily --- MediaBrowser.Providers/Tmdb/Movies/TmdbSearch.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MediaBrowser.Providers/Tmdb/Movies/TmdbSearch.cs b/MediaBrowser.Providers/Tmdb/Movies/TmdbSearch.cs index 47d5012471..b6c7226597 100644 --- a/MediaBrowser.Providers/Tmdb/Movies/TmdbSearch.cs +++ b/MediaBrowser.Providers/Tmdb/Movies/TmdbSearch.cs @@ -80,7 +80,7 @@ namespace MediaBrowser.Providers.Tmdb.Movies var parsedName = _libraryManager.ParseName(name); var yearInName = parsedName.Year; name = parsedName.Name; - year = year ?? yearInName; + year ??= yearInName; _logger.LogInformation("TmdbSearch: Finding id for item: {0} ({1})", name, year); var language = idInfo.MetadataLanguage.ToLowerInvariant(); From de351839033815ad0e1ee15e3e0b5cc095065d25 Mon Sep 17 00:00:00 2001 From: Erik Rigtorp Date: Thu, 14 May 2020 11:56:31 -0700 Subject: [PATCH 4/5] Update MediaBrowser.Providers/Tmdb/Movies/TmdbSearch.cs Co-authored-by: Vasily --- MediaBrowser.Providers/Tmdb/Movies/TmdbSearch.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MediaBrowser.Providers/Tmdb/Movies/TmdbSearch.cs b/MediaBrowser.Providers/Tmdb/Movies/TmdbSearch.cs index b6c7226597..aa42fd81ab 100644 --- a/MediaBrowser.Providers/Tmdb/Movies/TmdbSearch.cs +++ b/MediaBrowser.Providers/Tmdb/Movies/TmdbSearch.cs @@ -100,7 +100,7 @@ namespace MediaBrowser.Providers.Tmdb.Movies } } - // Ideally retrying alternatives should be done outside the search + // TODO: retrying alternatives should be done outside the search // provider so that the retry logic can be common for all search // providers if (results.Count == 0) From e02e041b231dbe2b158fa1c75098bdd08e0abad1 Mon Sep 17 00:00:00 2001 From: Erik Rigtorp Date: Thu, 14 May 2020 16:55:55 -0700 Subject: [PATCH 5/5] If second cleaning results in same name skip lookup --- .../Tmdb/Movies/TmdbSearch.cs | 
20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/MediaBrowser.Providers/Tmdb/Movies/TmdbSearch.cs b/MediaBrowser.Providers/Tmdb/Movies/TmdbSearch.cs index aa42fd81ab..bf63946084 100644 --- a/MediaBrowser.Providers/Tmdb/Movies/TmdbSearch.cs +++ b/MediaBrowser.Providers/Tmdb/Movies/TmdbSearch.cs @@ -76,7 +76,7 @@ namespace MediaBrowser.Providers.Tmdb.Movies var tmdbImageUrl = tmdbSettings.images.GetImageUrl("original"); - // Does this mean we are reparsing already parsed ItemLookupInfo? + // TODO: Investigate: Does this mean we are reparsing already parsed ItemLookupInfo? var parsedName = _libraryManager.ParseName(name); var yearInName = parsedName.Year; name = parsedName.Name; @@ -105,30 +105,30 @@ namespace MediaBrowser.Providers.Tmdb.Movies // providers if (results.Count == 0) { - name = parsedName.Name; + var name2 = parsedName.Name; // Remove things enclosed in []{}() etc - name = _cleanEnclosed.Replace(name, string.Empty); + name2 = _cleanEnclosed.Replace(name2, string.Empty); // Replace sequences of non-word characters with space - name = _cleanNonWord.Replace(name, " "); + name2 = _cleanNonWord.Replace(name2, " "); // Clean based on common stop words / tokens - name = _cleanStopWords.Replace(name, string.Empty); + name2 = _cleanStopWords.Replace(name2, string.Empty); // Trim whitespace - name = name.Trim(); + name2 = name2.Trim(); // Search again if the new name is different - if (!string.Equals(name, parsedName.Name) && !string.IsNullOrWhiteSpace(name)) + if (!string.Equals(name2, name) && !string.IsNullOrWhiteSpace(name2)) { - _logger.LogInformation("TmdbSearch: Finding id for item: {0} ({1})", name, year); - results = await GetSearchResults(name, searchType, year, language, tmdbImageUrl, cancellationToken).ConfigureAwait(false); + _logger.LogInformation("TmdbSearch: Finding id for item: {0} ({1})", name2, year); + results = await GetSearchResults(name2, searchType, year, language, tmdbImageUrl, 
cancellationToken).ConfigureAwait(false); if (results.Count == 0 && !string.Equals(language, "en", StringComparison.OrdinalIgnoreCase)) { //one more time, in english - results = await GetSearchResults(name, searchType, year, "en", tmdbImageUrl, cancellationToken).ConfigureAwait(false); + results = await GetSearchResults(name2, searchType, year, "en", tmdbImageUrl, cancellationToken).ConfigureAwait(false); } } }