Fuzzy matching (#508)

* Fixed: correctly match albums whose title is all special characters

* New: fuzzy matching on album and track names
pull/528/head
ta264 6 years ago committed by Qstick
parent 3ae079a541
commit eadd6996ef

@ -71,7 +71,7 @@ namespace NzbDrone.Common.Extensions
return string.Join(separator, values);
}
public static string CleanSpaces(this string text)
public static string CleanSpaces(this string text)
{
return CollapseSpace.Replace(text, " ").Trim();
}
@ -141,5 +141,42 @@ namespace NzbDrone.Common.Extensions
return CamelCaseRegex.Replace(input, match => " " + match.Value);
}
/// <summary>
/// Scores how closely <paramref name="a"/> resembles <paramref name="b"/>; higher means more similar.
/// </summary>
/// <remarks>
/// When both strings contain a space they are split on single spaces and compared word by word:
/// every word of <paramref name="a"/> is paired with the word of <paramref name="b"/> that yields
/// the highest <c>LevenshteinCoefficient</c>, that coefficient is scaled down according to how far
/// apart the two word positions are, and the scaled values are averaged over the number of words
/// in <paramref name="a"/>. If either string has no space, the whole strings are compared directly.
/// The word loop is driven by <paramref name="a"/> and normalised by its word count, so swapping
/// the arguments can produce a different score.
/// </remarks>
/// <param name="a">The string whose words drive the comparison.</param>
/// <param name="b">The string to compare against.</param>
/// <returns>The position-weighted average coefficient, or the plain coefficient of the two strings.</returns>
public static double FuzzyMatch(this string a, string b)
{
    if (!a.Contains(" ") || !b.Contains(" "))
    {
        return LevenshteinCoefficient(a, b);
    }

    var wordsA = a.Split(' ');
    var wordsB = b.Split(' ');
    double weightedTotal = 0.0;

    for (int i = 0; i < wordsA.Length; i++)
    {
        double best = 0.0;
        int offset = 0;

        for (int j = 0; j < wordsB.Length; j++)
        {
            var coef = LevenshteinCoefficient(wordsA[i], wordsB[j]);

            // Strict comparison: on a tie the earliest word of b keeps the match.
            if (coef > best)
            {
                best = coef;
                offset = Math.Abs(i - j);
            }
        }

        // A best match found at a different word position counts for less.
        double positionWeight = 1.0 - (double)offset / wordsA.Length;
        weightedTotal += best * positionWeight;
    }

    return weightedTotal / wordsA.Length;
}
/// <summary>
/// Converts the Levenshtein distance between <paramref name="a"/> and <paramref name="b"/> into a
/// similarity score by dividing it by the length of the longer string and subtracting from 1.
/// </summary>
/// <param name="a">First string.</param>
/// <param name="b">Second string.</param>
/// <returns>
/// <c>1.0 - distance / max(a.Length, b.Length)</c>, or <c>1.0</c> when both strings are empty.
/// </returns>
public static double LevenshteinCoefficient(this string a, string b)
{
    var longest = Math.Max(a.Length, b.Length);

    // Two empty strings are identical. Without this guard the division below is 0.0 / 0,
    // which evaluates to NaN and then poisons any comparison or ordering done on the score.
    if (longest == 0)
    {
        return 1.0;
    }

    return 1.0 - (double)a.LevenshteinDistance(b) / longest;
}
}
}

@ -8,6 +8,7 @@ using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using NLog;
namespace NzbDrone.Core.Test.MusicTests.AlbumRepositoryTests
{
@ -16,6 +17,8 @@ namespace NzbDrone.Core.Test.MusicTests.AlbumRepositoryTests
{
private Artist _artist;
private Album _album;
private Album _albumSpecial;
private Album _albumSimilar;
private AlbumRepository _albumRepo;
[SetUp]
@ -29,12 +32,15 @@ namespace NzbDrone.Core.Test.MusicTests.AlbumRepositoryTests
Id = 1
};
_albumRepo = Mocker.Resolve<AlbumRepository>();
_album = new Album
{
Title = "ANThology",
ForeignAlbumId = "1",
CleanTitle = "anthology",
Artist = _artist,
ArtistId = _artist.Id,
AlbumType = "",
Releases = new List<AlbumRelease>
{
@ -46,9 +52,46 @@ namespace NzbDrone.Core.Test.MusicTests.AlbumRepositoryTests
};
_albumRepo = Mocker.Resolve<AlbumRepository>();
_albumRepo.Insert(_album);
_albumSpecial = new Album
{
Title = "+",
ForeignAlbumId = "2",
CleanTitle = "",
Artist = _artist,
ArtistId = _artist.Id,
AlbumType = "",
Releases = new List<AlbumRelease>
{
new AlbumRelease
{
Id = "fake id"
}
}
};
_albumRepo.Insert(_albumSpecial);
_albumSimilar = new Album
{
Title = "ANThology2",
ForeignAlbumId = "3",
CleanTitle = "anthology2",
Artist = _artist,
ArtistId = _artist.Id,
AlbumType = "",
Releases = new List<AlbumRelease>
{
new AlbumRelease
{
Id = "fake id 2"
}
}
};
}
@ -59,9 +102,62 @@ namespace NzbDrone.Core.Test.MusicTests.AlbumRepositoryTests
var album = _albumRepo.FindAlbumByRelease(id);
album.Should().NotBeNull();
album.Title.Should().Be(_album.Title);
}
[TestCase("ANThology")]
[TestCase("anthology")]
[TestCase("anthology!")]
public void should_find_album_in_db_by_title(string title)
{
    // Every case is expected to resolve to the "ANThology" fixture album.
    var expectedTitle = _album.Title;

    var found = _albumRepo.FindByTitle(_artist.Id, title);

    found.Should().NotBeNull();
    found.Title.Should().Be(expectedTitle);
}
[Test]
public void should_find_album_in_db_by_title_all_special_characters()
{
    // The "+" fixture album is stored with an empty CleanTitle, so this lookup
    // must still be able to locate it from the raw title.
    const string specialTitle = "+";

    var found = _albumRepo.FindByTitle(_artist.Id, specialTitle);

    found.Should().NotBeNull();
    found.Title.Should().Be(_albumSpecial.Title);
}
[TestCase("ANTholog")]
[TestCase("nthology")]
[TestCase("antholoyg")]
public void should_not_find_album_in_db_by_incorrect_title(string title)
{
    // Near-miss spellings must not be returned by the exact title lookup.
    _albumRepo.FindByTitle(_artist.Id, title)
              .Should()
              .BeNull();
}
[TestCase("ANTholog")]
[TestCase("antholoyg")]
[TestCase("ANThology CD")]
public void should_find_album_in_db_by_inexact_title(string title)
{
    // Slightly wrong titles should resolve to the "ANThology" fixture album
    // through the fuzzy lookup.
    var expectedTitle = _album.Title;

    var found = _albumRepo.FindByTitleInexact(_artist.Id, title);

    found.Should().NotBeNull();
    found.Title.Should().Be(expectedTitle);
}
[TestCase("ANThology CD")]
[TestCase("antholoyg")]
[TestCase("ANTholog")]
public void should_not_find_album_in_db_by_inexact_title_when_two_similar_matches(string title)
{
    // With "ANThology2" also stored, the fuzzy lookup is expected to return
    // nothing rather than pick one of the two similar albums.
    _albumRepo.Insert(_albumSimilar);

    _albumRepo.FindByTitleInexact(_artist.Id, title)
              .Should()
              .BeNull();
}
[Test]
public void should_not_find_album_in_db_by_partial_releaseid()
{

@ -5,6 +5,7 @@ using NUnit.Framework;
using NzbDrone.Core.Configuration;
using NzbDrone.Core.Music;
using NzbDrone.Core.Test.Framework;
using System.Collections.Generic;
namespace NzbDrone.Core.Test.MusicTests.TitleMatchingTests
{
@ -21,49 +22,82 @@ namespace NzbDrone.Core.Test.MusicTests.TitleMatchingTests
_trackService =
new TrackService(_trackRepository, Mocker.Resolve<ConfigService>(), Mocker.Resolve<Logger>());
_trackRepository.Insert(new Track
{
Title = "This is the short test title",
ForeignTrackId = "this is a fake id2",
AlbumId = 4321,
AbsoluteTrackNumber = 1,
MediumNumber = 1,
TrackFileId = 1
});
_trackRepository.Insert(new Track
{
Title = "This is the long test title",
ForeignTrackId = "this is a fake id",
AlbumId = 4321,
AbsoluteTrackNumber = 2,
MediumNumber = 1,
TrackFileId = 2
});
var trackNames = new List<string> {
"Courage",
"Movies",
"Flesh and Bone",
"Whisper",
"Summer",
"Sticks and Stones",
"Attitude",
"Stranded",
"Wish",
"Calico",
"(Happy) Death Day",
"Smooth Criminal",
"Universe / Orange Appeal"
};
for (int i = 0; i < trackNames.Count; i++) {
_trackRepository.Insert(new Track
{
Title = trackNames[i],
ForeignTrackId = (i+1).ToString(),
AlbumId = 4321,
AbsoluteTrackNumber = i+1,
MediumNumber = 1,
TrackFileId = i+1
});
}
}
[Test]
public void should_find_track_in_db_by_tracktitle_longer_then_relaeasetitle()
public void should_find_track_in_db_by_tracktitle_longer_then_releasetitle()
{
var track = _trackService.FindTrackByTitle(1234, 4321, 1, 1, "This is the short test title with some bla");
var track = _trackService.FindTrackByTitle(1234, 4321, 1, 1, "Courage with some bla");
track.Should().NotBeNull();
track.Title.Should().Be(_trackRepository.GetTracksByFileId(1).First().Title);
}
[Test]
public void should_find_track_in_db_by_tracktitle_shorter_then_relaeasetitle()
public void should_find_track_in_db_by_tracktitle_shorter_then_releasetitle()
{
var track = _trackService.FindTrackByTitle(1234, 4321, 1, 2, "test title");
var track = _trackService.FindTrackByTitle(1234, 4321, 1, 3, "and Bone");
track.Title.Should().Be(_trackRepository.GetTracksByFileId(2).First().Title);
track.Should().NotBeNull();
track.Title.Should().Be(_trackRepository.GetTracksByFileId(3).First().Title);
}
[Test]
public void should_not_find_track_in_db_by_wrong_title()
{
var track = _trackService.FindTrackByTitle(1234, 4321, 1, 1, "the short title");
var track = _trackService.FindTrackByTitle(1234, 4321, 1, 1, "Not a track");
track.Should().BeNull();
}
[TestCase("Fesh and Bone", 3)]
[TestCase("Atitude", 7)]
[TestCase("Smoth cRimnal", 12)]
[TestCase("Sticks and Stones (live)", 6)]
public void should_find_track_in_db_by_inexact_title(string title, int trackNumber)
{
    // The expected track is looked up by file id using the same number as the track number.
    var expectedTitle = _trackRepository.GetTracksByFileId(trackNumber).First().Title;

    var found = _trackService.FindTrackByTitleInexact(1234, 4321, 1, trackNumber, title);

    found.Should().NotBeNull();
    found.Title.Should().Be(expectedTitle);
}
[TestCase("A random title", 1)]
[TestCase("Stones and Sticks", 6)]
public void should_not_find_track_in_db_by_different_inexact_title(string title, int trackId)
{
    // trackId is passed in the track-number position of the lookup.
    _trackService.FindTrackByTitleInexact(1234, 4321, 1, trackId, title)
                 .Should()
                 .BeNull();
}
}
}

@ -1,5 +1,6 @@
using System;
using System.Linq;
using NLog;
using Marr.Data.QGen;
using NzbDrone.Core.Datastore;
using NzbDrone.Core.Datastore.Extensions;
@ -16,6 +17,7 @@ namespace NzbDrone.Core.Music
List<Album> GetAlbums(int artistId);
Album FindByName(string cleanTitle);
Album FindByTitle(int artistId, string title);
Album FindByTitleInexact(int artistId, string title);
Album FindByArtistAndName(string artistName, string cleanTitle);
Album FindById(string spotifyId);
PagingSpec<Album> AlbumsWithoutFiles(PagingSpec<Album> pagingSpec);
@ -31,14 +33,15 @@ namespace NzbDrone.Core.Music
public class AlbumRepository : BasicRepository<Album>, IAlbumRepository
{
private readonly IMainDatabase _database;
private readonly Logger _logger;
public AlbumRepository(IMainDatabase database, IEventAggregator eventAggregator)
public AlbumRepository(IMainDatabase database, IEventAggregator eventAggregator, Logger logger)
: base(database, eventAggregator)
{
_database = database;
_logger = logger;
}
public List<Album> GetAlbums(int artistId)
{
return Query.Where(s => s.ArtistId == artistId).ToList();
@ -287,13 +290,49 @@ namespace NzbDrone.Core.Music
public Album FindByTitle(int artistId, string title)
{
title = Parser.Parser.CleanArtistName(title);
return Query.Where(s => s.CleanTitle == title)
var cleanTitle = Parser.Parser.CleanArtistName(title);
if (string.IsNullOrEmpty(cleanTitle))
cleanTitle = title;
return Query.Where(s => s.CleanTitle == cleanTitle || s.Title == title)
.AndWhere(s => s.ArtistId == artistId)
.FirstOrDefault();
}
/// <summary>
/// Looks for the album of the given artist whose clean title best fuzzy-matches
/// <paramref name="title"/>.
/// </summary>
/// <param name="artistId">Id of the artist whose albums are considered.</param>
/// <param name="title">Title to match; it is cleaned first, falling back to the raw title when cleaning leaves nothing.</param>
/// <returns>
/// The best-scoring album when its score exceeds the threshold and, if there is more than one
/// candidate, beats the runner-up by more than the required gap; otherwise <c>null</c>.
/// </returns>
public Album FindByTitleInexact(int artistId, string title)
{
    const double fuzzThreshold = 0.7;
    const double fuzzGap = 0.4;

    var cleanTitle = Parser.Parser.CleanArtistName(title);
    if (string.IsNullOrEmpty(cleanTitle))
    {
        cleanTitle = title;
    }

    // Score every album of the artist, then rank best-first.
    var scored = Query.Where(s => s.ArtistId == artistId)
                      .Select(s => new
                      {
                          MatchProb = s.CleanTitle.FuzzyMatch(cleanTitle),
                          Album = s
                      })
                      .ToList();
    var ranked = scored.OrderByDescending(c => c.MatchProb).ToList();

    if (ranked.Count == 0)
    {
        return null;
    }

    _logger.Trace("\nFuzzy album match on '{0}':\n{1}",
                  cleanTitle,
                  string.Join("\n", ranked.Select(x => $"{x.Album.CleanTitle}: {x.MatchProb}")));

    var top = ranked[0];

    // A lone candidate has nothing to be confused with; otherwise it must clearly lead the runner-up.
    var isUnambiguous = ranked.Count == 1 || top.MatchProb - ranked[1].MatchProb > fuzzGap;

    return top.MatchProb > fuzzThreshold && isUnambiguous ? top.Album : null;
}
public Album FindByArtistAndName(string artistName, string cleanTitle)
{
var cleanArtistName = Parser.Parser.CleanArtistName(artistName);

@ -17,7 +17,7 @@ namespace NzbDrone.Core.Music
List<Album> AddAlbums(List<Album> newAlbums);
Album FindById(string spotifyId);
Album FindByTitle(int artistId, string title);
Album FindByTitleInexact(string title);
Album FindByTitleInexact(int artistId, string title);
void DeleteAlbum(int albumId, bool deleteFiles);
List<Album> GetAllAlbums();
Album UpdateAlbum(Album album);
@ -87,9 +87,9 @@ namespace NzbDrone.Core.Music
return _albumRepository.FindByTitle(artistId, title);
}
public Album FindByTitleInexact(string title)
public Album FindByTitleInexact(int artistId, string title)
{
throw new NotImplementedException();
return _albumRepository.FindByTitleInexact(artistId, title);
}
public List<Album> GetAllAlbums()

@ -5,6 +5,7 @@ using NzbDrone.Core.MediaFiles;
using NzbDrone.Core.MediaFiles.Events;
using NzbDrone.Core.Messaging.Events;
using NzbDrone.Core.Music.Events;
using NzbDrone.Common.Extensions;
using System;
using System.Collections.Generic;
using System.Linq;
@ -18,6 +19,7 @@ namespace NzbDrone.Core.Music
List<Track> GetTracks(IEnumerable<int> ids);
Track FindTrack(int artistId, int albumId, int mediumNumber, int trackNumber);
Track FindTrackByTitle(int artistId, int albumId, int mediumNumber, int trackNumber, string releaseTitle);
Track FindTrackByTitleInexact(int artistId, int albumId, int mediumNumber, int trackNumber, string releaseTitle);
List<Track> GetTracksByArtist(int artistId);
List<Track> GetTracksByAlbum(int albumId);
//List<Track> GetTracksByAlbumTitle(string artistId, string albumTitle);
@ -100,6 +102,43 @@ namespace NzbDrone.Core.Music
return matches.OrderByDescending(e => e.NormalizedLength).FirstOrDefault()?.Track;
}
/// <summary>
/// Looks for the track on the given album medium whose normalized title best fuzzy-matches
/// <paramref name="releaseTitle"/>.
/// </summary>
/// <param name="artistId">Not used by this method.</param>
/// <param name="albumId">Album id passed to the repository when fetching candidate tracks.</param>
/// <param name="mediumNumber">Medium number passed to the repository when fetching candidate tracks.</param>
/// <param name="trackNumber">
/// When non-zero, the best match is only accepted if its <c>AbsoluteTrackNumber</c> equals this value;
/// zero skips that check.
/// </param>
/// <param name="releaseTitle">Title to match; it is normalized and has '.' replaced by spaces before scoring.</param>
/// <returns>
/// The best-scoring track when its score exceeds the threshold, it beats the runner-up by more than
/// the required gap (if there is a runner-up) and it passes the track-number check; otherwise <c>null</c>.
/// </returns>
public Track FindTrackByTitleInexact(int artistId, int albumId, int mediumNumber, int trackNumber, string releaseTitle)
{
    double fuzzThreshold = 0.6;
    double fuzzGap = 0.2;

    var normalizedReleaseTitle = Parser.Parser.NormalizeTrackTitle(releaseTitle).Replace(".", " ");
    var tracks = _trackRepository.GetTracksByMedium(albumId, mediumNumber);

    // Discard empty-titled tracks first so they are never normalized or scored.
    var matches = from track in tracks
                  where track.Title.Length > 0
                  let normalizedTitle = Parser.Parser.NormalizeTrackTitle(track.Title).Replace(".", " ")
                  let matchProb = normalizedTitle.FuzzyMatch(normalizedReleaseTitle)
                  orderby matchProb descending
                  select new
                  {
                      MatchProb = matchProb,
                      NormalizedTitle = normalizedTitle,
                      Track = track
                  };

    var matchList = matches.ToList();

    if (!matchList.Any())
    {
        return null;
    }

    _logger.Trace("\nFuzzy track match on '{0}':\n{1}",
                  normalizedReleaseTitle,
                  string.Join("\n", matchList.Select(x => $"{x.NormalizedTitle}: {x.MatchProb}")));

    // Accept the top candidate only if it scores well, clearly leads the runner-up,
    // and agrees with the requested track number when one was supplied.
    if (matchList[0].MatchProb > fuzzThreshold
        && (matchList.Count == 1 || matchList[0].MatchProb - matchList[1].MatchProb > fuzzGap)
        && (trackNumber == 0 || matchList[0].Track.AbsoluteTrackNumber == trackNumber))
    {
        return matchList[0].Track;
    }

    return null;
}
public List<Track> TracksWithFiles(int artistId)
{
return _trackRepository.TracksWithFiles(artistId);

@ -290,6 +290,12 @@ namespace NzbDrone.Core.Parser
album = _albumService.FindByTitle(artist.Id, cleanAlbumTitle);
}
if (album == null)
{
_logger.Debug("Trying inexact album match for {0}", parsedTrackInfo);
album = _albumService.FindByTitleInexact(artist.Id, cleanAlbumTitle);
}
if (album == null)
{
_logger.Debug("Parsed album title not found in Db for {0}", parsedTrackInfo);
@ -318,6 +324,12 @@ namespace NzbDrone.Core.Parser
trackInfo = _trackService.FindTrackByTitle(artist.Id, album.Id, parsedTrackInfo.DiscNumber, parsedTrackInfo.TrackNumbers.FirstOrDefault(), parsedTrackInfo.Title);
}
if (trackInfo == null)
{
_logger.Debug("Trying inexact track match for {0}", parsedTrackInfo);
trackInfo = _trackService.FindTrackByTitleInexact(artist.Id, album.Id, parsedTrackInfo.DiscNumber, parsedTrackInfo.TrackNumbers.FirstOrDefault(), cleanTrackTitle);
}
if (trackInfo != null)
{
_logger.Debug("Track {0} selected for {1}", trackInfo, parsedTrackInfo);

Loading…
Cancel
Save