From eadd6996ef239b4b39923f5f3690ba7543fafb2d Mon Sep 17 00:00:00 2001 From: ta264 Date: Sat, 20 Oct 2018 23:09:12 +0100 Subject: [PATCH] Fuzzy matching (#508) * Fixed: correctly match albums whose title is all special characters * New: fuzzy matching on album and track names --- .../Extensions/StringExtensions.cs | 39 ++++++- .../AlbumRepositoryFixture.cs | 100 +++++++++++++++++- .../TitleMatchingFixture.cs | 84 ++++++++++----- src/NzbDrone.Core/Music/AlbumRepository.cs | 49 ++++++++- src/NzbDrone.Core/Music/AlbumService.cs | 6 +- src/NzbDrone.Core/Music/TrackService.cs | 39 +++++++ src/NzbDrone.Core/Parser/ParsingService.cs | 12 +++ 7 files changed, 293 insertions(+), 36 deletions(-) diff --git a/src/NzbDrone.Common/Extensions/StringExtensions.cs b/src/NzbDrone.Common/Extensions/StringExtensions.cs index 5eeed958d..40c7886f6 100644 --- a/src/NzbDrone.Common/Extensions/StringExtensions.cs +++ b/src/NzbDrone.Common/Extensions/StringExtensions.cs @@ -71,7 +71,7 @@ namespace NzbDrone.Common.Extensions return string.Join(separator, values); } - public static string CleanSpaces(this string text) + public static string CleanSpaces(this string text) { return CollapseSpace.Replace(text, " ").Trim(); } @@ -141,5 +141,42 @@ namespace NzbDrone.Common.Extensions return CamelCaseRegex.Replace(input, match => " " + match.Value); } + public static double FuzzyMatch(this string a, string b) + { + if (a.Contains(" ") && b.Contains(" ")) + { + var partsA = a.Split(' '); + var partsB = b.Split(' '); + var weightedHighCoefficients = new double[partsA.Length]; + var distanceRatios = new double[partsA.Length]; + for (int i = 0; i < partsA.Length; i++) + { + double high = 0.0; + int indexDistance = 0; + for (int x = 0; x < partsB.Length; x++) + { + var coef = LevenshteinCoefficient(partsA[i], partsB[x]); + if (coef > high) + { + high = coef; + indexDistance = Math.Abs(i - x); + } + } + double distanceWeight = 1.0 - (double)indexDistance / (double)partsA.Length; + weightedHighCoefficients[i] = high * distanceWeight; + } + return weightedHighCoefficients.Sum() / (double)partsA.Length; + } + else + { + return LevenshteinCoefficient(a, b); + } + } + + public static double LevenshteinCoefficient(this string a, string b) + { + return 1.0 - (double)a.LevenshteinDistance(b) / Math.Max(a.Length, b.Length); + } + } } diff --git a/src/NzbDrone.Core.Test/MusicTests/AlbumRepositoryTests/AlbumRepositoryFixture.cs b/src/NzbDrone.Core.Test/MusicTests/AlbumRepositoryTests/AlbumRepositoryFixture.cs index 66cd40904..d924bd3ab 100644 --- a/src/NzbDrone.Core.Test/MusicTests/AlbumRepositoryTests/AlbumRepositoryFixture.cs +++ b/src/NzbDrone.Core.Test/MusicTests/AlbumRepositoryTests/AlbumRepositoryFixture.cs @@ -8,6 +8,7 @@ using System.Collections.Generic; using System.Linq; using System.Text; using System.Threading.Tasks; +using NLog; namespace NzbDrone.Core.Test.MusicTests.AlbumRepositoryTests { @@ -16,6 +17,8 @@ namespace NzbDrone.Core.Test.MusicTests.AlbumRepositoryTests { private Artist _artist; private Album _album; + private Album _albumSpecial; + private Album _albumSimilar; private AlbumRepository _albumRepo; [SetUp] @@ -29,12 +32,15 @@ namespace NzbDrone.Core.Test.MusicTests.AlbumRepositoryTests Id = 1 }; + _albumRepo = Mocker.Resolve(); + _album = new Album { Title = "ANThology", ForeignAlbumId = "1", CleanTitle = "anthology", Artist = _artist, + ArtistId = _artist.Id, AlbumType = "", Releases = new List { @@ -46,9 +52,46 @@ namespace NzbDrone.Core.Test.MusicTests.AlbumRepositoryTests }; - _albumRepo = Mocker.Resolve(); - _albumRepo.Insert(_album); + + _albumSpecial = new Album + { + Title = "+", + ForeignAlbumId = "2", + CleanTitle = "", + Artist = _artist, + ArtistId = _artist.Id, + AlbumType = "", + Releases = new List + { + new AlbumRelease + { + Id = "fake id" + } + } + + }; + + _albumRepo.Insert(_albumSpecial); + + _albumSimilar = new Album + { + Title = "ANThology2", + ForeignAlbumId = "3", + CleanTitle = "anthology2", + Artist = _artist, + ArtistId = _artist.Id, + AlbumType = "", + Releases = new List + { + new AlbumRelease + { + Id = "fake id 2" + } + } + + }; + } @@ -59,9 +102,62 @@ namespace NzbDrone.Core.Test.MusicTests.AlbumRepositoryTests var album = _albumRepo.FindAlbumByRelease(id); + album.Should().NotBeNull(); + album.Title.Should().Be(_album.Title); + } + + [TestCase("ANThology")] + [TestCase("anthology")] + [TestCase("anthology!")] + public void should_find_album_in_db_by_title(string title) + { + var album = _albumRepo.FindByTitle(_artist.Id, title); + + album.Should().NotBeNull(); album.Title.Should().Be(_album.Title); } + [Test] + public void should_find_album_in_db_by_title_all_special_characters() + { + var album = _albumRepo.FindByTitle(_artist.Id, "+"); + + album.Should().NotBeNull(); + album.Title.Should().Be(_albumSpecial.Title); + } + + [TestCase("ANTholog")] + [TestCase("nthology")] + [TestCase("antholoyg")] + public void should_not_find_album_in_db_by_incorrect_title(string title) + { + var album = _albumRepo.FindByTitle(_artist.Id, title); + + album.Should().BeNull(); + } + + [TestCase("ANTholog")] + [TestCase("antholoyg")] + [TestCase("ANThology CD")] + public void should_find_album_in_db_by_inexact_title(string title) + { + var album = _albumRepo.FindByTitleInexact(_artist.Id, title); + + album.Should().NotBeNull(); + album.Title.Should().Be(_album.Title); + } + + [TestCase("ANTholog")] + [TestCase("antholoyg")] + [TestCase("ANThology CD")] + public void should_not_find_album_in_db_by_inexact_title_when_two_similar_matches(string title) + { + _albumRepo.Insert(_albumSimilar); + var album = _albumRepo.FindByTitleInexact(_artist.Id, title); + + album.Should().BeNull(); + } + [Test] public void should_not_find_album_in_db_by_partial_releaseid() { diff --git a/src/NzbDrone.Core.Test/MusicTests/TitleMatchingTests/TitleMatchingFixture.cs b/src/NzbDrone.Core.Test/MusicTests/TitleMatchingTests/TitleMatchingFixture.cs index af429d0db..f9d10b1da 100644 --- a/src/NzbDrone.Core.Test/MusicTests/TitleMatchingTests/TitleMatchingFixture.cs +++ b/src/NzbDrone.Core.Test/MusicTests/TitleMatchingTests/TitleMatchingFixture.cs @@ -5,6 +5,7 @@ using NUnit.Framework; using NzbDrone.Core.Configuration; using NzbDrone.Core.Music; using NzbDrone.Core.Test.Framework; +using System.Collections.Generic; namespace NzbDrone.Core.Test.MusicTests.TitleMatchingTests { @@ -21,49 +22,82 @@ namespace NzbDrone.Core.Test.MusicTests.TitleMatchingTests _trackService = new TrackService(_trackRepository, Mocker.Resolve(), Mocker.Resolve()); - _trackRepository.Insert(new Track - { - Title = "This is the short test title", - ForeignTrackId = "this is a fake id2", - AlbumId = 4321, - AbsoluteTrackNumber = 1, - MediumNumber = 1, - TrackFileId = 1 - }); - - _trackRepository.Insert(new Track - { - Title = "This is the long test title", - ForeignTrackId = "this is a fake id", - AlbumId = 4321, - AbsoluteTrackNumber = 2, - MediumNumber = 1, - TrackFileId = 2 - }); + var trackNames = new List { + "Courage", + "Movies", + "Flesh and Bone", + "Whisper", + "Summer", + "Sticks and Stones", + "Attitude", + "Stranded", + "Wish", + "Calico", + "(Happy) Death Day", + "Smooth Criminal", + "Universe / Orange Appeal" + }; + + for (int i = 0; i < trackNames.Count; i++) { + _trackRepository.Insert(new Track + { + Title = trackNames[i], + ForeignTrackId = (i+1).ToString(), + AlbumId = 4321, + AbsoluteTrackNumber = i+1, + MediumNumber = 1, + TrackFileId = i+1 + }); + } } [Test] - public void should_find_track_in_db_by_tracktitle_longer_then_relaeasetitle() + public void should_find_track_in_db_by_tracktitle_longer_then_releasetitle() { - var track = _trackService.FindTrackByTitle(1234, 4321, 1, 1, "This is the short test title with some bla"); + var track = _trackService.FindTrackByTitle(1234, 4321, 1, 1, "Courage with some bla"); + track.Should().NotBeNull(); track.Title.Should().Be(_trackRepository.GetTracksByFileId(1).First().Title); } [Test] - public void should_find_track_in_db_by_tracktitle_shorter_then_relaeasetitle() + public void should_find_track_in_db_by_tracktitle_shorter_then_releasetitle() { - var track = _trackService.FindTrackByTitle(1234, 4321, 1, 2, "test title"); + var track = _trackService.FindTrackByTitle(1234, 4321, 1, 3, "and Bone"); - track.Title.Should().Be(_trackRepository.GetTracksByFileId(2).First().Title); + track.Should().NotBeNull(); + track.Title.Should().Be(_trackRepository.GetTracksByFileId(3).First().Title); } [Test] public void should_not_find_track_in_db_by_wrong_title() { - var track = _trackService.FindTrackByTitle(1234, 4321, 1, 1, "the short title"); + var track = _trackService.FindTrackByTitle(1234, 4321, 1, 1, "Not a track"); + + track.Should().BeNull(); + } + + [TestCase("Fesh and Bone", 3)] + [TestCase("Atitude", 7)] + [TestCase("Smoth cRimnal", 12)] + [TestCase("Sticks and Stones (live)", 6)] + public void should_find_track_in_db_by_inexact_title(string title, int trackNumber) + { + var track = _trackService.FindTrackByTitleInexact(1234, 4321, 1, trackNumber, title); + + track.Should().NotBeNull(); + track.Title.Should().Be(_trackRepository.GetTracksByFileId(trackNumber).First().Title); + } + + [TestCase("A random title", 1)] + [TestCase("Stones and Sticks", 6)] + public void should_not_find_track_in_db_by_different_inexact_title(string title, int trackId) + { + var track = _trackService.FindTrackByTitleInexact(1234, 4321, 1, trackId, title); track.Should().BeNull(); } + + } } diff --git a/src/NzbDrone.Core/Music/AlbumRepository.cs b/src/NzbDrone.Core/Music/AlbumRepository.cs index 5d1f4ae6f..db2144da1 100644 --- a/src/NzbDrone.Core/Music/AlbumRepository.cs +++ b/src/NzbDrone.Core/Music/AlbumRepository.cs @@ -1,5 +1,6 @@ using System; using System.Linq; +using NLog; using Marr.Data.QGen; using NzbDrone.Core.Datastore; using NzbDrone.Core.Datastore.Extensions; @@ -16,6 +17,7 @@ namespace NzbDrone.Core.Music List GetAlbums(int artistId); Album FindByName(string cleanTitle); Album FindByTitle(int artistId, string title); + Album FindByTitleInexact(int artistId, string title); Album FindByArtistAndName(string artistName, string cleanTitle); Album FindById(string spotifyId); PagingSpec AlbumsWithoutFiles(PagingSpec pagingSpec); @@ -31,14 +33,15 @@ namespace NzbDrone.Core.Music public class AlbumRepository : BasicRepository, IAlbumRepository { private readonly IMainDatabase _database; + private readonly Logger _logger; - public AlbumRepository(IMainDatabase database, IEventAggregator eventAggregator) + public AlbumRepository(IMainDatabase database, IEventAggregator eventAggregator, Logger logger) : base(database, eventAggregator) { _database = database; + _logger = logger; } - public List GetAlbums(int artistId) { return Query.Where(s => s.ArtistId == artistId).ToList(); @@ -287,13 +290,49 @@ namespace NzbDrone.Core.Music public Album FindByTitle(int artistId, string title) { - title = Parser.Parser.CleanArtistName(title); - - return Query.Where(s => s.CleanTitle == title) + var cleanTitle = Parser.Parser.CleanArtistName(title); + + if (string.IsNullOrEmpty(cleanTitle)) + cleanTitle = title; + + return Query.Where(s => s.CleanTitle == cleanTitle || s.Title == title) .AndWhere(s => s.ArtistId == artistId) .FirstOrDefault(); } + public Album FindByTitleInexact(int artistId, string title) + { + double fuzzThreshold = 0.7; + double fuzzGap = 0.4; + var cleanTitle = Parser.Parser.CleanArtistName(title); + + if (string.IsNullOrEmpty(cleanTitle)) + cleanTitle = title; + + var sortedAlbums = Query.Where(s => s.ArtistId == artistId) + .Select(s => new + { + MatchProb = s.CleanTitle.FuzzyMatch(cleanTitle), + Album = s + }) + .ToList() + .OrderByDescending(s => s.MatchProb) + .ToList(); + + if (!sortedAlbums.Any()) + return null; + + _logger.Trace("\nFuzzy album match on '{0}':\n{1}", + cleanTitle, + string.Join("\n", sortedAlbums.Select(x => $"{x.Album.CleanTitle}: {x.MatchProb}"))); + + if (sortedAlbums[0].MatchProb > fuzzThreshold + && (sortedAlbums.Count == 1 || sortedAlbums[0].MatchProb - sortedAlbums[1].MatchProb > fuzzGap)) + return sortedAlbums[0].Album; + + return null; + } + public Album FindByArtistAndName(string artistName, string cleanTitle) { var cleanArtistName = Parser.Parser.CleanArtistName(artistName); diff --git a/src/NzbDrone.Core/Music/AlbumService.cs b/src/NzbDrone.Core/Music/AlbumService.cs index 8b4774338..e5285c1bc 100644 --- a/src/NzbDrone.Core/Music/AlbumService.cs +++ b/src/NzbDrone.Core/Music/AlbumService.cs @@ -17,7 +17,7 @@ namespace NzbDrone.Core.Music List AddAlbums(List newAlbums); Album FindById(string spotifyId); Album FindByTitle(int artistId, string title); - Album FindByTitleInexact(string title); + Album FindByTitleInexact(int artistId, string title); void DeleteAlbum(int albumId, bool deleteFiles); List GetAllAlbums(); Album UpdateAlbum(Album album); @@ -87,9 +87,9 @@ namespace NzbDrone.Core.Music return _albumRepository.FindByTitle(artistId, title); } - public Album FindByTitleInexact(string title) + public Album FindByTitleInexact(int artistId, string title) { - throw new NotImplementedException(); + return _albumRepository.FindByTitleInexact(artistId, title); } public List GetAllAlbums() diff --git a/src/NzbDrone.Core/Music/TrackService.cs b/src/NzbDrone.Core/Music/TrackService.cs index 445281040..df8f8c813 100644 --- a/src/NzbDrone.Core/Music/TrackService.cs +++ b/src/NzbDrone.Core/Music/TrackService.cs @@ -5,6 +5,7 @@ using NzbDrone.Core.MediaFiles; using NzbDrone.Core.MediaFiles.Events; using NzbDrone.Core.Messaging.Events; using NzbDrone.Core.Music.Events; +using NzbDrone.Common.Extensions; using System; using System.Collections.Generic; using System.Linq; @@ -18,6 +19,7 @@ namespace NzbDrone.Core.Music List GetTracks(IEnumerable ids); Track FindTrack(int artistId, int albumId, int mediumNumber, int trackNumber); Track FindTrackByTitle(int artistId, int albumId, int mediumNumber, int trackNumber, string releaseTitle); + Track FindTrackByTitleInexact(int artistId, int albumId, int mediumNumber, int trackNumber, string releaseTitle); List GetTracksByArtist(int artistId); List GetTracksByAlbum(int albumId); //List GetTracksByAlbumTitle(string artistId, string albumTitle); @@ -100,6 +102,43 @@ namespace NzbDrone.Core.Music return matches.OrderByDescending(e => e.NormalizedLength).FirstOrDefault()?.Track; } + public Track FindTrackByTitleInexact(int artistId, int albumId, int mediumNumber, int trackNumber, string releaseTitle) + { + double fuzzThreshold = 0.6; + double fuzzGap = 0.2; + + var normalizedReleaseTitle = Parser.Parser.NormalizeTrackTitle(releaseTitle).Replace(".", " "); + var tracks = _trackRepository.GetTracksByMedium(albumId, mediumNumber); + + var matches = from track in tracks + let normalizedTitle = Parser.Parser.NormalizeTrackTitle(track.Title).Replace(".", " ") + let matchProb = normalizedTitle.FuzzyMatch(normalizedReleaseTitle) + where track.Title.Length > 0 + orderby matchProb descending + select new + { + MatchProb = matchProb, + NormalizedTitle = normalizedTitle, + Track = track + }; + + var matchList = matches.ToList(); + + if (!matchList.Any()) + return null; + + _logger.Trace("\nFuzzy track match on '{0}':\n{1}", + normalizedReleaseTitle, + string.Join("\n", matchList.Select(x => $"{x.NormalizedTitle}: {x.MatchProb}"))); + + if (matchList[0].MatchProb > fuzzThreshold + && (matchList.Count == 1 || matchList[0].MatchProb - matchList[1].MatchProb > fuzzGap) + && (trackNumber == 0 || matchList[0].Track.AbsoluteTrackNumber == trackNumber)) + return matchList[0].Track; + + return null; + } + public List TracksWithFiles(int artistId) { return _trackRepository.TracksWithFiles(artistId); diff --git a/src/NzbDrone.Core/Parser/ParsingService.cs b/src/NzbDrone.Core/Parser/ParsingService.cs index 9e0660867..67fb7832d 100644 --- a/src/NzbDrone.Core/Parser/ParsingService.cs +++ b/src/NzbDrone.Core/Parser/ParsingService.cs @@ -290,6 +290,12 @@ namespace NzbDrone.Core.Parser album = _albumService.FindByTitle(artist.Id, cleanAlbumTitle); } + if (album == null) + { + _logger.Debug("Trying inexact album match for {0}", parsedTrackInfo); + album = _albumService.FindByTitleInexact(artist.Id, cleanAlbumTitle); + } + if (album == null) { _logger.Debug("Parsed album title not found in Db for {0}", parsedTrackInfo); @@ -318,6 +324,12 @@ namespace NzbDrone.Core.Parser trackInfo = _trackService.FindTrackByTitle(artist.Id, album.Id, parsedTrackInfo.DiscNumber, parsedTrackInfo.TrackNumbers.FirstOrDefault(), parsedTrackInfo.Title); } + if (trackInfo == null) + { + _logger.Debug("Trying inexact track match for {0}", parsedTrackInfo); + trackInfo = _trackService.FindTrackByTitleInexact(artist.Id, album.Id, parsedTrackInfo.DiscNumber, parsedTrackInfo.TrackNumbers.FirstOrDefault(), cleanTrackTitle); + } + if (trackInfo != null) { _logger.Debug("Track {0} selected for {1}", trackInfo, parsedTrackInfo);