From 9f37b1c48445e88db4ddb8da3c63dbb319bb6440 Mon Sep 17 00:00:00 2001 From: ta264 Date: Tue, 30 Mar 2021 21:26:44 +0100 Subject: [PATCH] New: Get more candidates and include ISBN/ASIN in distance calculation --- .../Identification/CandidateService.cs | 203 ++++++++++++------ .../BookImport/Identification/Distance.cs | 2 + .../Identification/DistanceCalculator.cs | 14 ++ .../Identification/IdentificationService.cs | 27 ++- .../Goodreads/GoodreadsProxy.cs | 22 +- 5 files changed, 190 insertions(+), 78 deletions(-) diff --git a/src/NzbDrone.Core/MediaFiles/BookImport/Identification/CandidateService.cs b/src/NzbDrone.Core/MediaFiles/BookImport/Identification/CandidateService.cs index e0afa0b0b..99a980d7d 100644 --- a/src/NzbDrone.Core/MediaFiles/BookImport/Identification/CandidateService.cs +++ b/src/NzbDrone.Core/MediaFiles/BookImport/Identification/CandidateService.cs @@ -1,3 +1,4 @@ +using System; using System.Collections.Generic; using System.Linq; using NLog; @@ -12,7 +13,7 @@ namespace NzbDrone.Core.MediaFiles.BookImport.Identification public interface ICandidateService { List GetDbCandidatesFromTags(LocalEdition localEdition, IdentificationOverrides idOverrides, bool includeExisting); - List GetRemoteCandidates(LocalEdition localEdition); + IEnumerable GetRemoteCandidates(LocalEdition localEdition); } public class CandidateService : ICandidateService @@ -183,116 +184,194 @@ namespace NzbDrone.Core.MediaFiles.BookImport.Identification return candidateReleases; } - public List GetRemoteCandidates(LocalEdition localEdition) + public IEnumerable GetRemoteCandidates(LocalEdition localEdition) { // Gets candidate book releases from the metadata server. 
// Will eventually need adding locally if we find a match - var watch = System.Diagnostics.Stopwatch.StartNew(); - - List remoteBooks = null; - var candidates = new List(); + List remoteBooks; + var seenCandidates = new HashSet(); var isbns = localEdition.LocalBooks.Select(x => x.FileTrackInfo.Isbn).Distinct().ToList(); var asins = localEdition.LocalBooks.Select(x => x.FileTrackInfo.Asin).Distinct().ToList(); var goodreads = localEdition.LocalBooks.Select(x => x.FileTrackInfo.GoodreadsId).Distinct().ToList(); - try + // grab possibilities for all the IDs present + if (isbns.Count == 1 && isbns[0].IsNotNullOrWhiteSpace()) { - if (isbns.Count == 1 && isbns[0].IsNotNullOrWhiteSpace()) - { - _logger.Trace($"Searching by isbn {isbns[0]}"); + _logger.Trace($"Searching by isbn {isbns[0]}"); + try + { remoteBooks = _bookSearchService.SearchByIsbn(isbns[0]); } + catch (GoodreadsException e) + { + _logger.Info(e, "Skipping ISBN search due to Goodreads Error"); + remoteBooks = new List(); + } - // Calibre puts junk asins into books it creates so check for sensible length - if ((remoteBooks == null || !remoteBooks.Any()) && - asins.Count == 1 && - asins[0].IsNotNullOrWhiteSpace() && - asins[0].Length == 10) + foreach (var candidate in ToCandidates(remoteBooks, seenCandidates)) { - _logger.Trace($"Searching by asin {asins[0]}"); + yield return candidate; + } + } + if (asins.Count == 1 && + asins[0].IsNotNullOrWhiteSpace() && + asins[0].Length == 10) + { + _logger.Trace($"Searching by asin {asins[0]}"); + + try + { remoteBooks = _bookSearchService.SearchByAsin(asins[0]); } - - // if we don't have an independent ID, try a goodreads ID, but may have been matched to the wrong edition by calibre - if ((remoteBooks == null || !remoteBooks.Any()) && - goodreads.Count == 1 && - goodreads[0].IsNotNullOrWhiteSpace()) + catch (GoodreadsException e) { - if (int.TryParse(goodreads[0], out var id)) - { - _logger.Trace($"Searching by goodreads id {id}"); + _logger.Info(e, "Skipping ASIN 
search due to Goodreads Error"); + remoteBooks = new List(); + } - remoteBooks = _bookSearchService.SearchByGoodreadsId(id); - } + foreach (var candidate in ToCandidates(remoteBooks, seenCandidates)) + { + yield return candidate; } + } - // if no asin/isbn or no result, fall back to text search - if (remoteBooks == null || !remoteBooks.Any()) + if (goodreads.Count == 1 && + goodreads[0].IsNotNullOrWhiteSpace()) + { + if (int.TryParse(goodreads[0], out var id)) { - // fall back to author / book name search - List authorTags = new List(); + _logger.Trace($"Searching by goodreads id {id}"); - if (TrackGroupingService.IsVariousAuthors(localEdition.LocalBooks)) + try { - authorTags.Add("Various Authors"); + remoteBooks = _bookSearchService.SearchByGoodreadsId(id); } - else + catch (GoodreadsException e) { - authorTags.AddRange(localEdition.LocalBooks.MostCommon(x => x.FileTrackInfo.Authors)); + _logger.Info(e, "Skipping Goodreads ID search due to Goodreads Error"); + remoteBooks = new List(); } - var bookTag = localEdition.LocalBooks.MostCommon(x => x.FileTrackInfo.BookTitle) ?? 
""; - - if (!authorTags.Any() || bookTag.IsNullOrWhiteSpace()) + foreach (var candidate in ToCandidates(remoteBooks, seenCandidates)) { - return candidates; + yield return candidate; } + } + } - foreach (var authorTag in authorTags) - { - remoteBooks = _bookSearchService.SearchForNewBook(bookTag, authorTag); - if (remoteBooks.Any()) - { - break; - } - } + // If we got an id result, stop + if (seenCandidates.Any()) + { + yield break; + } - if (!remoteBooks.Any()) - { - var bookSearch = _bookSearchService.SearchForNewBook(bookTag, null); - var authorSearch = authorTags.SelectMany(a => _bookSearchService.SearchForNewBook(a, null)); + // fall back to author / book name search + var authorTags = new List(); - remoteBooks = bookSearch.Concat(authorSearch).DistinctBy(x => x.ForeignBookId).ToList(); - } + if (TrackGroupingService.IsVariousAuthors(localEdition.LocalBooks)) + { + authorTags.Add("Various Authors"); + } + else + { + authorTags.AddRange(localEdition.LocalBooks.MostCommon(x => x.FileTrackInfo.Authors)); + } + + var bookTag = localEdition.LocalBooks.MostCommon(x => x.FileTrackInfo.BookTitle) ?? 
""; + + // If no valid author or book tags, stop + if (!authorTags.Any() || bookTag.IsNullOrWhiteSpace()) + { + yield break; + } + + // Search by author+book + foreach (var authorTag in authorTags) + { + try + { + remoteBooks = _bookSearchService.SearchForNewBook(bookTag, authorTag); + } + catch (GoodreadsException e) + { + _logger.Info(e, "Skipping author/title search due to Goodreads Error"); + remoteBooks = new List(); + } + + foreach (var candidate in ToCandidates(remoteBooks, seenCandidates)) + { + yield return candidate; } } + + // If we got an author/book search result, stop + if (seenCandidates.Any()) + { + yield break; + } + + // Search by just book title + try + { + remoteBooks = _bookSearchService.SearchForNewBook(bookTag, null); + } catch (GoodreadsException e) { - _logger.Info(e, "Skipping book due to Goodreads error"); + _logger.Info(e, "Skipping book title search due to Goodreads Error"); remoteBooks = new List(); } - foreach (var book in remoteBooks) + foreach (var candidate in ToCandidates(remoteBooks, seenCandidates)) + { + yield return candidate; + } + + // Search by just author + foreach (var a in authorTags) + { + try + { + remoteBooks = _bookSearchService.SearchForNewBook(a, null); + } + catch (GoodreadsException e) + { + _logger.Info(e, "Skipping author search due to Goodreads Error"); + remoteBooks = new List(); + } + + foreach (var candidate in ToCandidates(remoteBooks, seenCandidates)) + { + yield return candidate; + } + } + } + + private List ToCandidates(IEnumerable books, HashSet seenCandidates) + { + var candidates = new List(); + + foreach (var book in books) { // We have to make sure various bits and pieces are populated that are normally handled // by a database lazy load foreach (var edition in book.Editions.Value) { - edition.Book = book; - candidates.Add(new CandidateEdition + if (!seenCandidates.Contains(edition.ForeignEditionId)) { - Edition = edition, - ExistingFiles = new List() - }); + 
seenCandidates.Add(edition.ForeignEditionId); + edition.Book = book; + candidates.Add(new CandidateEdition + { + Edition = edition, + ExistingFiles = new List() + }); + } } } - watch.Stop(); - _logger.Debug($"Getting {candidates.Count} remote candidates from tags for {localEdition.LocalBooks.Count} tracks took {watch.ElapsedMilliseconds}ms"); - return candidates; } } diff --git a/src/NzbDrone.Core/MediaFiles/BookImport/Identification/Distance.cs b/src/NzbDrone.Core/MediaFiles/BookImport/Identification/Distance.cs index 619ca1812..b2a13b6da 100644 --- a/src/NzbDrone.Core/MediaFiles/BookImport/Identification/Distance.cs +++ b/src/NzbDrone.Core/MediaFiles/BookImport/Identification/Distance.cs @@ -13,6 +13,8 @@ namespace NzbDrone.Core.MediaFiles.BookImport.Identification { "source", 2.0 }, { "author", 3.0 }, { "book", 3.0 }, + { "isbn", 10.0 }, + { "asin", 10.0 }, { "media_count", 1.0 }, { "media_format", 1.0 }, { "year", 1.0 }, diff --git a/src/NzbDrone.Core/MediaFiles/BookImport/Identification/DistanceCalculator.cs b/src/NzbDrone.Core/MediaFiles/BookImport/Identification/DistanceCalculator.cs index f6ecabb98..2448dfef5 100644 --- a/src/NzbDrone.Core/MediaFiles/BookImport/Identification/DistanceCalculator.cs +++ b/src/NzbDrone.Core/MediaFiles/BookImport/Identification/DistanceCalculator.cs @@ -64,6 +64,20 @@ namespace NzbDrone.Core.MediaFiles.BookImport.Identification dist.AddString("book", title, titleOptions); Logger.Trace("book: '{0}' vs '{1}'; {2}", title, titleOptions.ConcatToString("' or '"), dist.NormalizedDistance()); + var isbn = localTracks.MostCommon(x => x.FileTrackInfo.Isbn); + if (isbn.IsNotNullOrWhiteSpace() && edition.Isbn13.IsNotNullOrWhiteSpace()) + { + dist.AddBool("isbn", isbn != edition.Isbn13); + Logger.Trace("isbn: '{0}' vs '{1}'; {2}", isbn, edition.Isbn13, dist.NormalizedDistance()); + } + + var asin = localTracks.MostCommon(x => x.FileTrackInfo.Asin); + if (asin.IsNotNullOrWhiteSpace() && edition.Asin.IsNotNullOrWhiteSpace()) + { + 
dist.AddBool("asin", asin != edition.Asin); + Logger.Trace("asin: '{0}' vs '{1}'; {2}", asin, edition.Asin, dist.NormalizedDistance()); + } + // Year var localYear = localTracks.MostCommon(x => x.FileTrackInfo.Year); if (localYear > 0 && edition.ReleaseDate.HasValue) diff --git a/src/NzbDrone.Core/MediaFiles/BookImport/Identification/IdentificationService.cs b/src/NzbDrone.Core/MediaFiles/BookImport/Identification/IdentificationService.cs index e9a55c6b0..f41b6f386 100644 --- a/src/NzbDrone.Core/MediaFiles/BookImport/Identification/IdentificationService.cs +++ b/src/NzbDrone.Core/MediaFiles/BookImport/Identification/IdentificationService.cs @@ -116,14 +116,22 @@ namespace NzbDrone.Core.MediaFiles.BookImport.Identification { var watch = System.Diagnostics.Stopwatch.StartNew(); - var candidateReleases = _candidateService.GetDbCandidatesFromTags(localBookRelease, idOverrides, config.IncludeExisting); + IEnumerable candidateReleases = _candidateService.GetDbCandidatesFromTags(localBookRelease, idOverrides, config.IncludeExisting); - if (candidateReleases.Count == 0 && config.AddNewAuthors) + // convert all the TrackFiles that represent extra files to List + // local candidates are actually a list so this is fine to enumerate + var allLocalTracks = ToLocalTrack(candidateReleases + .SelectMany(x => x.ExistingFiles) + .DistinctBy(x => x.Path), localBookRelease); + + _logger.Debug($"Retrieved {allLocalTracks.Count} possible tracks in {watch.ElapsedMilliseconds}ms"); + + if (!candidateReleases.Any() && config.AddNewAuthors) { candidateReleases = _candidateService.GetRemoteCandidates(localBookRelease); } - if (candidateReleases.Count == 0) + if (!candidateReleases.Any()) { // can't find any candidates even after fingerprinting // populate the overrides and return @@ -137,15 +145,6 @@ namespace NzbDrone.Core.MediaFiles.BookImport.Identification return; } - _logger.Debug($"Got {candidateReleases.Count} candidates for {localBookRelease.LocalBooks.Count} tracks in 
{watch.ElapsedMilliseconds}ms"); - - // convert all the TrackFiles that represent extra files to List - var allLocalTracks = ToLocalTrack(candidateReleases - .SelectMany(x => x.ExistingFiles) - .DistinctBy(x => x.Path), localBookRelease); - - _logger.Debug($"Retrieved {allLocalTracks.Count} possible tracks in {watch.ElapsedMilliseconds}ms"); - GetBestRelease(localBookRelease, candidateReleases, allLocalTracks); _logger.Debug($"Best release found in {watch.ElapsedMilliseconds}ms"); @@ -155,11 +154,11 @@ namespace NzbDrone.Core.MediaFiles.BookImport.Identification _logger.Debug($"IdentifyRelease done in {watch.ElapsedMilliseconds}ms"); } - private void GetBestRelease(LocalEdition localBookRelease, List candidateReleases, List extraTracksOnDisk) + private void GetBestRelease(LocalEdition localBookRelease, IEnumerable candidateReleases, List extraTracksOnDisk) { var watch = System.Diagnostics.Stopwatch.StartNew(); - _logger.Debug("Matching {0} track files against {1} candidates", localBookRelease.TrackCount, candidateReleases.Count); + _logger.Debug("Matching {0} track files against candidates", localBookRelease.TrackCount); _logger.Trace("Processing files:\n{0}", string.Join("\n", localBookRelease.LocalBooks.Select(x => x.Path))); double bestDistance = 1.0; diff --git a/src/NzbDrone.Core/MetadataSource/Goodreads/GoodreadsProxy.cs b/src/NzbDrone.Core/MetadataSource/Goodreads/GoodreadsProxy.cs index 5e1941abe..db3031d37 100644 --- a/src/NzbDrone.Core/MetadataSource/Goodreads/GoodreadsProxy.cs +++ b/src/NzbDrone.Core/MetadataSource/Goodreads/GoodreadsProxy.cs @@ -418,12 +418,30 @@ namespace NzbDrone.Core.MetadataSource.Goodreads public List SearchByIsbn(string isbn) { - return SearchByField("isbn", isbn); + var result = SearchByField("isbn", isbn); + + // we don't get isbn back in search result, but if only one result assume the query was correct + // and add in the searched isbn + if (result.Count == 1 && result[0].Editions.Value.Count == 1) + { + 
result[0].Editions.Value[0].Isbn13 = isbn; + } + + return result; } public List SearchByAsin(string asin) { - return SearchByField("isbn", asin); + var result = SearchByField("asin", asin); + + // we don't get asin back in search result, but if only one result assume the query was correct + // and add in the searched asin + if (result.Count == 1 && result[0].Editions.Value.Count == 1) + { + result[0].Editions.Value[0].Asin = asin; + } + + return result; } public List SearchByGoodreadsId(int id)