Fuzzy matching (#508)

* Fixed: correctly match albums whose title is all special characters

* New: fuzzy matching on album and track names
pull/528/head
ta264 6 years ago committed by Qstick
parent 3ae079a541
commit eadd6996ef

@ -141,5 +141,42 @@ namespace NzbDrone.Common.Extensions
return CamelCaseRegex.Replace(input, match => " " + match.Value); return CamelCaseRegex.Replace(input, match => " " + match.Value);
} }
/// <summary>
/// Scores how closely <paramref name="a"/> resembles <paramref name="b"/>.
/// </summary>
/// <remarks>
/// When both strings contain a space they are split on single spaces and compared
/// token by token: every token of <paramref name="a"/> is paired with the token of
/// <paramref name="b"/> that gives the highest <see cref="LevenshteinCoefficient"/>,
/// and that coefficient is scaled down the further apart the two token positions are.
/// The result is the mean of those weighted coefficients over the tokens of
/// <paramref name="a"/>. Otherwise the two strings are compared as a whole.
/// </remarks>
/// <param name="a">String whose tokens are scored; must not be null.</param>
/// <param name="b">String to compare against; must not be null.</param>
/// <returns>
/// The similarity score; higher means more similar. The score is not symmetric in
/// <paramref name="a"/> and <paramref name="b"/> because only the tokens of
/// <paramref name="a"/> are averaged.
/// </returns>
public static double FuzzyMatch(this string a, string b)
{
    // With a single token on either side there is nothing to align, so compare whole strings.
    if (!a.Contains(" ") || !b.Contains(" "))
    {
        return LevenshteinCoefficient(a, b);
    }

    var partsA = a.Split(' ');
    var partsB = b.Split(' ');
    var weightedHighCoefficients = new double[partsA.Length];

    for (int i = 0; i < partsA.Length; i++)
    {
        double high = 0.0;
        int indexDistance = 0;

        // Find the best matching token of b; on ties the earliest one wins.
        for (int x = 0; x < partsB.Length; x++)
        {
            var coef = LevenshteinCoefficient(partsA[i], partsB[x]);
            if (coef > high)
            {
                high = coef;
                indexDistance = Math.Abs(i - x);
            }
        }

        // When b has more tokens than a, the index distance can exceed partsA.Length.
        // Clamp at zero so a distant match never counts as worse than no match at all
        // and the overall score cannot become negative.
        double distanceWeight = Math.Max(0.0, 1.0 - (double)indexDistance / partsA.Length);
        weightedHighCoefficients[i] = high * distanceWeight;
    }

    return weightedHighCoefficients.Sum() / partsA.Length;
}
/// <summary>
/// Converts the value returned by <c>LevenshteinDistance</c> into a similarity ratio:
/// one minus the distance divided by the length of the longer string.
/// </summary>
/// <param name="a">First string; must not be null.</param>
/// <param name="b">Second string; must not be null.</param>
/// <returns>
/// 1.0 when both strings are empty; otherwise <c>1 - distance / max(a.Length, b.Length)</c>.
/// </returns>
public static double LevenshteinCoefficient(this string a, string b)
{
    var longest = Math.Max(a.Length, b.Length);

    // Two empty strings are identical. Without this guard the division below is 0/0,
    // which yields NaN and makes every later comparison against the score false.
    if (longest == 0)
    {
        return 1.0;
    }

    return 1.0 - (double)a.LevenshteinDistance(b) / longest;
}
} }
} }

@ -8,6 +8,7 @@ using System.Collections.Generic;
using System.Linq; using System.Linq;
using System.Text; using System.Text;
using System.Threading.Tasks; using System.Threading.Tasks;
using NLog;
namespace NzbDrone.Core.Test.MusicTests.AlbumRepositoryTests namespace NzbDrone.Core.Test.MusicTests.AlbumRepositoryTests
{ {
@ -16,6 +17,8 @@ namespace NzbDrone.Core.Test.MusicTests.AlbumRepositoryTests
{ {
private Artist _artist; private Artist _artist;
private Album _album; private Album _album;
private Album _albumSpecial;
private Album _albumSimilar;
private AlbumRepository _albumRepo; private AlbumRepository _albumRepo;
[SetUp] [SetUp]
@ -29,12 +32,15 @@ namespace NzbDrone.Core.Test.MusicTests.AlbumRepositoryTests
Id = 1 Id = 1
}; };
_albumRepo = Mocker.Resolve<AlbumRepository>();
_album = new Album _album = new Album
{ {
Title = "ANThology", Title = "ANThology",
ForeignAlbumId = "1", ForeignAlbumId = "1",
CleanTitle = "anthology", CleanTitle = "anthology",
Artist = _artist, Artist = _artist,
ArtistId = _artist.Id,
AlbumType = "", AlbumType = "",
Releases = new List<AlbumRelease> Releases = new List<AlbumRelease>
{ {
@ -46,9 +52,46 @@ namespace NzbDrone.Core.Test.MusicTests.AlbumRepositoryTests
}; };
_albumRepo = Mocker.Resolve<AlbumRepository>();
_albumRepo.Insert(_album); _albumRepo.Insert(_album);
_albumSpecial = new Album
{
Title = "+",
ForeignAlbumId = "2",
CleanTitle = "",
Artist = _artist,
ArtistId = _artist.Id,
AlbumType = "",
Releases = new List<AlbumRelease>
{
new AlbumRelease
{
Id = "fake id"
}
}
};
_albumRepo.Insert(_albumSpecial);
_albumSimilar = new Album
{
Title = "ANThology2",
ForeignAlbumId = "3",
CleanTitle = "anthology2",
Artist = _artist,
ArtistId = _artist.Id,
AlbumType = "",
Releases = new List<AlbumRelease>
{
new AlbumRelease
{
Id = "fake id 2"
}
}
};
} }
@ -59,9 +102,62 @@ namespace NzbDrone.Core.Test.MusicTests.AlbumRepositoryTests
var album = _albumRepo.FindAlbumByRelease(id); var album = _albumRepo.FindAlbumByRelease(id);
album.Should().NotBeNull();
album.Title.Should().Be(_album.Title);
}
[TestCase("ANThology")]
[TestCase("anthology")]
[TestCase("anthology!")]
public void should_find_album_in_db_by_title(string title)
{
var album = _albumRepo.FindByTitle(_artist.Id, title);
album.Should().NotBeNull();
album.Title.Should().Be(_album.Title); album.Title.Should().Be(_album.Title);
} }
[Test]
public void should_find_album_in_db_by_title_all_special_characters()
{
    // The "+" fixture album is stored with an empty CleanTitle, so it can only be
    // found through its raw title.
    var found = _albumRepo.FindByTitle(_artist.Id, "+");

    found.Should().NotBeNull();
    found.Title.Should().Be(_albumSpecial.Title);
}
[TestCase("ANTholog")]
[TestCase("nthology")]
[TestCase("antholoyg")]
public void should_not_find_album_in_db_by_incorrect_title(string title)
{
    // Misspelled titles must not be returned by the exact-title lookup.
    _albumRepo.FindByTitle(_artist.Id, title).Should().BeNull();
}
[TestCase("ANTholog")]
[TestCase("antholoyg")]
[TestCase("ANThology CD")]
public void should_find_album_in_db_by_inexact_title(string title)
{
    // The similarly named fixture album is not inserted here, so the fuzzy lookup
    // is expected to resolve each near-miss title to the "ANThology" album.
    var fuzzyHit = _albumRepo.FindByTitleInexact(_artist.Id, title);

    fuzzyHit.Should().NotBeNull();
    fuzzyHit.Title.Should().Be(_album.Title);
}
[TestCase("ANTholog")]
[TestCase("antholoyg")]
[TestCase("ANThology CD")]
public void should_not_find_album_in_db_by_inexact_title_when_two_similar_matches(string title)
{
    // Add the second, similarly named album so the lookup has two close candidates.
    _albumRepo.Insert(_albumSimilar);

    // The lookup is expected to return nothing rather than pick between the two.
    _albumRepo.FindByTitleInexact(_artist.Id, title).Should().BeNull();
}
[Test] [Test]
public void should_not_find_album_in_db_by_partial_releaseid() public void should_not_find_album_in_db_by_partial_releaseid()
{ {

@ -5,6 +5,7 @@ using NUnit.Framework;
using NzbDrone.Core.Configuration; using NzbDrone.Core.Configuration;
using NzbDrone.Core.Music; using NzbDrone.Core.Music;
using NzbDrone.Core.Test.Framework; using NzbDrone.Core.Test.Framework;
using System.Collections.Generic;
namespace NzbDrone.Core.Test.MusicTests.TitleMatchingTests namespace NzbDrone.Core.Test.MusicTests.TitleMatchingTests
{ {
@ -21,49 +22,82 @@ namespace NzbDrone.Core.Test.MusicTests.TitleMatchingTests
_trackService = _trackService =
new TrackService(_trackRepository, Mocker.Resolve<ConfigService>(), Mocker.Resolve<Logger>()); new TrackService(_trackRepository, Mocker.Resolve<ConfigService>(), Mocker.Resolve<Logger>());
_trackRepository.Insert(new Track var trackNames = new List<string> {
{ "Courage",
Title = "This is the short test title", "Movies",
ForeignTrackId = "this is a fake id2", "Flesh and Bone",
AlbumId = 4321, "Whisper",
AbsoluteTrackNumber = 1, "Summer",
MediumNumber = 1, "Sticks and Stones",
TrackFileId = 1 "Attitude",
}); "Stranded",
"Wish",
"Calico",
"(Happy) Death Day",
"Smooth Criminal",
"Universe / Orange Appeal"
};
for (int i = 0; i < trackNames.Count; i++) {
_trackRepository.Insert(new Track _trackRepository.Insert(new Track
{ {
Title = "This is the long test title", Title = trackNames[i],
ForeignTrackId = "this is a fake id", ForeignTrackId = (i+1).ToString(),
AlbumId = 4321, AlbumId = 4321,
AbsoluteTrackNumber = 2, AbsoluteTrackNumber = i+1,
MediumNumber = 1, MediumNumber = 1,
TrackFileId = 2 TrackFileId = i+1
}); });
} }
}
[Test] [Test]
public void should_find_track_in_db_by_tracktitle_longer_then_relaeasetitle() public void should_find_track_in_db_by_tracktitle_longer_then_releasetitle()
{ {
var track = _trackService.FindTrackByTitle(1234, 4321, 1, 1, "This is the short test title with some bla"); var track = _trackService.FindTrackByTitle(1234, 4321, 1, 1, "Courage with some bla");
track.Should().NotBeNull();
track.Title.Should().Be(_trackRepository.GetTracksByFileId(1).First().Title); track.Title.Should().Be(_trackRepository.GetTracksByFileId(1).First().Title);
} }
[Test] [Test]
public void should_find_track_in_db_by_tracktitle_shorter_then_relaeasetitle() public void should_find_track_in_db_by_tracktitle_shorter_then_releasetitle()
{ {
var track = _trackService.FindTrackByTitle(1234, 4321, 1, 2, "test title"); var track = _trackService.FindTrackByTitle(1234, 4321, 1, 3, "and Bone");
track.Title.Should().Be(_trackRepository.GetTracksByFileId(2).First().Title); track.Should().NotBeNull();
track.Title.Should().Be(_trackRepository.GetTracksByFileId(3).First().Title);
} }
[Test] [Test]
public void should_not_find_track_in_db_by_wrong_title() public void should_not_find_track_in_db_by_wrong_title()
{ {
var track = _trackService.FindTrackByTitle(1234, 4321, 1, 1, "the short title"); var track = _trackService.FindTrackByTitle(1234, 4321, 1, 1, "Not a track");
track.Should().BeNull(); track.Should().BeNull();
} }
[TestCase("Fesh and Bone", 3)]
[TestCase("Atitude", 7)]
[TestCase("Smoth cRimnal", 12)]
[TestCase("Sticks and Stones (live)", 6)]
public void should_find_track_in_db_by_inexact_title(string title, int trackNumber)
{
    var matched = _trackService.FindTrackByTitleInexact(1234, 4321, 1, trackNumber, title);
    matched.Should().NotBeNull();

    // The fixture gives every track the same value for its track number and its
    // file id, so the expected track can be fetched by file id.
    var expectedTitle = _trackRepository.GetTracksByFileId(trackNumber).First().Title;
    matched.Title.Should().Be(expectedTitle);
}
[TestCase("A random title", 1)]
[TestCase("Stones and Sticks", 6)]
public void should_not_find_track_in_db_by_different_inexact_title(string title, int trackId)
{
    // trackId is passed in the track-number position of the lookup.
    _trackService.FindTrackByTitleInexact(1234, 4321, 1, trackId, title)
                 .Should()
                 .BeNull();
}
} }
} }

@ -1,5 +1,6 @@
using System; using System;
using System.Linq; using System.Linq;
using NLog;
using Marr.Data.QGen; using Marr.Data.QGen;
using NzbDrone.Core.Datastore; using NzbDrone.Core.Datastore;
using NzbDrone.Core.Datastore.Extensions; using NzbDrone.Core.Datastore.Extensions;
@ -16,6 +17,7 @@ namespace NzbDrone.Core.Music
List<Album> GetAlbums(int artistId); List<Album> GetAlbums(int artistId);
Album FindByName(string cleanTitle); Album FindByName(string cleanTitle);
Album FindByTitle(int artistId, string title); Album FindByTitle(int artistId, string title);
Album FindByTitleInexact(int artistId, string title);
Album FindByArtistAndName(string artistName, string cleanTitle); Album FindByArtistAndName(string artistName, string cleanTitle);
Album FindById(string spotifyId); Album FindById(string spotifyId);
PagingSpec<Album> AlbumsWithoutFiles(PagingSpec<Album> pagingSpec); PagingSpec<Album> AlbumsWithoutFiles(PagingSpec<Album> pagingSpec);
@ -31,14 +33,15 @@ namespace NzbDrone.Core.Music
public class AlbumRepository : BasicRepository<Album>, IAlbumRepository public class AlbumRepository : BasicRepository<Album>, IAlbumRepository
{ {
private readonly IMainDatabase _database; private readonly IMainDatabase _database;
private readonly Logger _logger;
public AlbumRepository(IMainDatabase database, IEventAggregator eventAggregator) public AlbumRepository(IMainDatabase database, IEventAggregator eventAggregator, Logger logger)
: base(database, eventAggregator) : base(database, eventAggregator)
{ {
_database = database; _database = database;
_logger = logger;
} }
public List<Album> GetAlbums(int artistId) public List<Album> GetAlbums(int artistId)
{ {
return Query.Where(s => s.ArtistId == artistId).ToList(); return Query.Where(s => s.ArtistId == artistId).ToList();
@ -287,13 +290,49 @@ namespace NzbDrone.Core.Music
public Album FindByTitle(int artistId, string title) public Album FindByTitle(int artistId, string title)
{ {
title = Parser.Parser.CleanArtistName(title); var cleanTitle = Parser.Parser.CleanArtistName(title);
if (string.IsNullOrEmpty(cleanTitle))
cleanTitle = title;
return Query.Where(s => s.CleanTitle == title) return Query.Where(s => s.CleanTitle == cleanTitle || s.Title == title)
.AndWhere(s => s.ArtistId == artistId) .AndWhere(s => s.ArtistId == artistId)
.FirstOrDefault(); .FirstOrDefault();
} }
/// <summary>
/// Looks for the album of one artist whose CleanTitle is the closest fuzzy match
/// to the supplied title.
/// </summary>
/// <param name="artistId">Only albums with this ArtistId are scored.</param>
/// <param name="title">
/// Title to look for. It is cleaned with <c>Parser.Parser.CleanArtistName</c>; when
/// cleaning leaves nothing, the raw title is used instead.
/// </param>
/// <returns>
/// The best-scoring album when its score is above 0.7 and it is either the only
/// candidate or leads the runner-up by more than 0.4; otherwise null.
/// </returns>
public Album FindByTitleInexact(int artistId, string title)
{
    const double fuzzThreshold = 0.7;
    const double fuzzGap = 0.4;

    var cleanTitle = Parser.Parser.CleanArtistName(title);
    if (string.IsNullOrEmpty(cleanTitle))
    {
        cleanTitle = title;
    }

    var sortedAlbums = Query.Where(s => s.ArtistId == artistId)
                            .Select(s => new
                            {
                                MatchProb = s.CleanTitle.FuzzyMatch(cleanTitle),
                                Album = s
                            })
                            .ToList()
                            .OrderByDescending(s => s.MatchProb)
                            .ToList();

    if (sortedAlbums.Count == 0)
    {
        return null;
    }

    // Building the per-album listing is wasted work unless trace output is enabled.
    if (_logger.IsTraceEnabled)
    {
        _logger.Trace("\nFuzzy album match on '{0}':\n{1}",
                      cleanTitle,
                      string.Join("\n", sortedAlbums.Select(x => $"{x.Album.CleanTitle}: {x.MatchProb}")));
    }

    var best = sortedAlbums[0];

    // Accept the top candidate only when it is unambiguous: either it is alone or it
    // leads the second-best candidate by more than the required gap.
    var isClearWinner = sortedAlbums.Count == 1
                        || best.MatchProb - sortedAlbums[1].MatchProb > fuzzGap;

    return best.MatchProb > fuzzThreshold && isClearWinner ? best.Album : null;
}
public Album FindByArtistAndName(string artistName, string cleanTitle) public Album FindByArtistAndName(string artistName, string cleanTitle)
{ {
var cleanArtistName = Parser.Parser.CleanArtistName(artistName); var cleanArtistName = Parser.Parser.CleanArtistName(artistName);

@ -17,7 +17,7 @@ namespace NzbDrone.Core.Music
List<Album> AddAlbums(List<Album> newAlbums); List<Album> AddAlbums(List<Album> newAlbums);
Album FindById(string spotifyId); Album FindById(string spotifyId);
Album FindByTitle(int artistId, string title); Album FindByTitle(int artistId, string title);
Album FindByTitleInexact(string title); Album FindByTitleInexact(int artistId, string title);
void DeleteAlbum(int albumId, bool deleteFiles); void DeleteAlbum(int albumId, bool deleteFiles);
List<Album> GetAllAlbums(); List<Album> GetAllAlbums();
Album UpdateAlbum(Album album); Album UpdateAlbum(Album album);
@ -87,9 +87,9 @@ namespace NzbDrone.Core.Music
return _albumRepository.FindByTitle(artistId, title); return _albumRepository.FindByTitle(artistId, title);
} }
public Album FindByTitleInexact(string title) public Album FindByTitleInexact(int artistId, string title)
{ {
throw new NotImplementedException(); return _albumRepository.FindByTitleInexact(artistId, title);
} }
public List<Album> GetAllAlbums() public List<Album> GetAllAlbums()

@ -5,6 +5,7 @@ using NzbDrone.Core.MediaFiles;
using NzbDrone.Core.MediaFiles.Events; using NzbDrone.Core.MediaFiles.Events;
using NzbDrone.Core.Messaging.Events; using NzbDrone.Core.Messaging.Events;
using NzbDrone.Core.Music.Events; using NzbDrone.Core.Music.Events;
using NzbDrone.Common.Extensions;
using System; using System;
using System.Collections.Generic; using System.Collections.Generic;
using System.Linq; using System.Linq;
@ -18,6 +19,7 @@ namespace NzbDrone.Core.Music
List<Track> GetTracks(IEnumerable<int> ids); List<Track> GetTracks(IEnumerable<int> ids);
Track FindTrack(int artistId, int albumId, int mediumNumber, int trackNumber); Track FindTrack(int artistId, int albumId, int mediumNumber, int trackNumber);
Track FindTrackByTitle(int artistId, int albumId, int mediumNumber, int trackNumber, string releaseTitle); Track FindTrackByTitle(int artistId, int albumId, int mediumNumber, int trackNumber, string releaseTitle);
Track FindTrackByTitleInexact(int artistId, int albumId, int mediumNumber, int trackNumber, string releaseTitle);
List<Track> GetTracksByArtist(int artistId); List<Track> GetTracksByArtist(int artistId);
List<Track> GetTracksByAlbum(int albumId); List<Track> GetTracksByAlbum(int albumId);
//List<Track> GetTracksByAlbumTitle(string artistId, string albumTitle); //List<Track> GetTracksByAlbumTitle(string artistId, string albumTitle);
@ -100,6 +102,43 @@ namespace NzbDrone.Core.Music
return matches.OrderByDescending(e => e.NormalizedLength).FirstOrDefault()?.Track; return matches.OrderByDescending(e => e.NormalizedLength).FirstOrDefault()?.Track;
} }
/// <summary>
/// Looks for the track on one medium of an album whose normalized title is the
/// closest fuzzy match to the supplied release title.
/// </summary>
/// <param name="artistId">Not used by this lookup.</param>
/// <param name="albumId">Album whose tracks are searched.</param>
/// <param name="mediumNumber">Medium of the album whose tracks are searched.</param>
/// <param name="trackNumber">
/// Expected track number; 0 disables the check. Any other value must equal the best
/// candidate's AbsoluteTrackNumber for it to be returned.
/// </param>
/// <param name="releaseTitle">Title to match against the track titles.</param>
/// <returns>
/// The best-scoring track when its score is above 0.6, it is either the only
/// candidate or leads the runner-up by more than 0.2, and the track number check
/// passes; otherwise null.
/// </returns>
public Track FindTrackByTitleInexact(int artistId, int albumId, int mediumNumber, int trackNumber, string releaseTitle)
{
    const double fuzzThreshold = 0.6;
    const double fuzzGap = 0.2;

    var normalizedReleaseTitle = Parser.Parser.NormalizeTrackTitle(releaseTitle).Replace(".", " ");
    var tracks = _trackRepository.GetTracksByMedium(albumId, mediumNumber);

    // Drop tracks with an empty title before normalising and scoring them, so no
    // work is spent on candidates that are discarded anyway.
    var matches = from track in tracks
                  where track.Title.Length > 0
                  let normalizedTitle = Parser.Parser.NormalizeTrackTitle(track.Title).Replace(".", " ")
                  let matchProb = normalizedTitle.FuzzyMatch(normalizedReleaseTitle)
                  orderby matchProb descending
                  select new
                  {
                      MatchProb = matchProb,
                      NormalizedTitle = normalizedTitle,
                      Track = track
                  };

    var matchList = matches.ToList();

    if (matchList.Count == 0)
    {
        return null;
    }

    // Building the per-track listing is wasted work unless trace output is enabled.
    if (_logger.IsTraceEnabled)
    {
        _logger.Trace("\nFuzzy track match on '{0}':\n{1}",
                      normalizedReleaseTitle,
                      string.Join("\n", matchList.Select(x => $"{x.NormalizedTitle}: {x.MatchProb}")));
    }

    var best = matchList[0];

    // The top candidate must be unambiguous and, when a track number was supplied,
    // must be the track at that position.
    var isClearWinner = matchList.Count == 1
                        || best.MatchProb - matchList[1].MatchProb > fuzzGap;
    var trackNumberAgrees = trackNumber == 0 || best.Track.AbsoluteTrackNumber == trackNumber;

    return best.MatchProb > fuzzThreshold && isClearWinner && trackNumberAgrees
        ? best.Track
        : null;
}
public List<Track> TracksWithFiles(int artistId) public List<Track> TracksWithFiles(int artistId)
{ {
return _trackRepository.TracksWithFiles(artistId); return _trackRepository.TracksWithFiles(artistId);

@ -290,6 +290,12 @@ namespace NzbDrone.Core.Parser
album = _albumService.FindByTitle(artist.Id, cleanAlbumTitle); album = _albumService.FindByTitle(artist.Id, cleanAlbumTitle);
} }
if (album == null)
{
_logger.Debug("Trying inexact album match for {0}", parsedTrackInfo);
album = _albumService.FindByTitleInexact(artist.Id, cleanAlbumTitle);
}
if (album == null) if (album == null)
{ {
_logger.Debug("Parsed album title not found in Db for {0}", parsedTrackInfo); _logger.Debug("Parsed album title not found in Db for {0}", parsedTrackInfo);
@ -318,6 +324,12 @@ namespace NzbDrone.Core.Parser
trackInfo = _trackService.FindTrackByTitle(artist.Id, album.Id, parsedTrackInfo.DiscNumber, parsedTrackInfo.TrackNumbers.FirstOrDefault(), parsedTrackInfo.Title); trackInfo = _trackService.FindTrackByTitle(artist.Id, album.Id, parsedTrackInfo.DiscNumber, parsedTrackInfo.TrackNumbers.FirstOrDefault(), parsedTrackInfo.Title);
} }
if (trackInfo == null)
{
_logger.Debug("Trying inexact track match for {0}", parsedTrackInfo);
trackInfo = _trackService.FindTrackByTitleInexact(artist.Id, album.Id, parsedTrackInfo.DiscNumber, parsedTrackInfo.TrackNumbers.FirstOrDefault(), cleanTrackTitle);
}
if (trackInfo != null) if (trackInfo != null)
{ {
_logger.Debug("Track {0} selected for {1}", trackInfo, parsedTrackInfo); _logger.Debug("Track {0} selected for {1}", trackInfo, parsedTrackInfo);

Loading…
Cancel
Save