Fixed: Improve fuzzy matching algorithm to match around word boundaries

Fixes READARR-C1
pull/1504/head
ta264 2 years ago
parent ecf1e1a130
commit c0e193dd1f

@ -1,4 +1,5 @@
using FluentAssertions;
using System.Collections.Generic;
using FluentAssertions;
using NUnit.Framework;
using NzbDrone.Common.Extensions;
using NzbDrone.Test.Common;
@ -57,5 +58,23 @@ namespace NzbDrone.Common.Test
{
text.FuzzyContains(pattern).Should().BeApproximately(expectedScore, 1e-9);
}
// Runs FuzzyMatch with the given delimiter characters and checks that the
// reported (start, length) span of the text equals the expected substring.
// An empty string stands in for "no match" (start index of -1).
[TestCase("The quick brown fox jumps over the lazy dog", "ovr", " ", "over")]
[TestCase("The quick brown fox jumps over the lazy dog", "eover", " ", "over")]
[TestCase("The quick brown fox jumps over the lazy dog", "jmps over", " ", "jumps over")]
[TestCase("The quick brown fox jumps over the lazy dog", "jmps ovr", " ", "jumps over")]
[TestCase("The quick brown fox jumps over the lazy dog", "jumpss oveor", " ", "jumps over")]
[TestCase("The quick brown fox jumps over the lazy dog", "jummps ovver", " ", "jumps over")]
[TestCase("The quick brown fox jumps over the lazy dog", "hhumps over", " ", "jumps over")]
[TestCase("The quick brown fox jumps over the lazy dog", "hhumps ov", " ", "jumps over")]
[TestCase("The quick brown fox jumps over the lazy dog", "jumps ovea", " ", "jumps over")]
public void should_match_on_word_boundaries(string text, string pattern, string delimiters, string expected)
{
    var boundaryChars = new HashSet<char>(delimiters);
    var found = text.FuzzyMatch(pattern, wordDelimiters: boundaryChars);
    var startIndex = found.Item1;
    var spanLength = found.Item2;

    var matched = "";
    if (startIndex != -1)
    {
        matched = text.Substring(startIndex, spanLength);
    }

    matched.Should().Be(expected);
}
}
}

@ -1,4 +1,4 @@
/*
/*
* This file incorporates work covered by the following copyright and
* permission notice:
*
@ -21,6 +21,7 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Numerics;
namespace NzbDrone.Common.Extensions
@ -35,7 +36,7 @@ namespace NzbDrone.Common.Extensions
// return the accuracy of the best match of pattern within text
// Delegates to FuzzyMatch with a match threshold of 0.25 and returns only the
// score element of the resulting tuple. This diff moves the score from Item2
// to Item3 because the tuple gains a match-length element in second position.
public static double FuzzyContains(this string text, string pattern)
{
return FuzzyMatch(text, pattern, 0.25).Item2;
return FuzzyMatch(text, pattern, 0.25).Item3;
}
/**
@ -45,37 +46,37 @@ namespace NzbDrone.Common.Extensions
* @param pattern The pattern to search for.
* @return Best match index or -1.
*/
// The updated overload returns (start index, matched length, score):
// (-1, 0, 0) when either input is empty, (index, pattern.Length, 1) for an
// exact ordinal hit, otherwise whatever MatchBitap produces.
public static Tuple<int, double> FuzzyMatch(this string text, string pattern, double matchThreshold = 0.5)
public static Tuple<int, int, double> FuzzyMatch(this string text, string pattern, double matchThreshold = 0.5, HashSet<char> wordDelimiters = null)
{
// NOTE(review): strings can be null in C#; there is no null guard here, so a
// null text or pattern fails on the .Length reads below.
if (text.Length == 0 || pattern.Length == 0)
{
// Nothing to match.
return new Tuple<int, double>(-1, 0);
return new Tuple<int, int, double>(-1, 0, 0);
}
// The IndexOf shortcut only runs when wordDelimiters is null; with delimiters
// supplied the search always goes through MatchBitap.
if (pattern.Length <= text.Length)
if (pattern.Length <= text.Length && wordDelimiters == null)
{
var loc = text.IndexOf(pattern, StringComparison.Ordinal);
if (loc != -1)
{
// Perfect match!
return new Tuple<int, double>(loc, 1);
return new Tuple<int, int, double>(loc, pattern.Length, 1);
}
}
// Do a fuzzy compare.
// The calculator is picked by pattern length: IntCalculator below 32
// characters, LongCalculator below 64, BigIntCalculator otherwise.
if (pattern.Length < 32)
{
return MatchBitap(text, pattern, matchThreshold, new IntCalculator());
return MatchBitap(text, pattern, matchThreshold, new IntCalculator(), wordDelimiters);
}
if (pattern.Length < 64)
{
return MatchBitap(text, pattern, matchThreshold, new LongCalculator());
return MatchBitap(text, pattern, matchThreshold, new LongCalculator(), wordDelimiters);
}
return MatchBitap(text, pattern, matchThreshold, new BigIntCalculator());
return MatchBitap(text, pattern, matchThreshold, new BigIntCalculator(), wordDelimiters);
}
/**
@ -85,7 +86,7 @@ namespace NzbDrone.Common.Extensions
* @param pattern The pattern to search for.
* @return Best match index or -1.
*/
private static Tuple<int, double> MatchBitap<T>(string text, string pattern, double matchThreshold, Calculator<T> calculator)
private static Tuple<int, int, double> MatchBitap<T>(string text, string pattern, double matchThreshold, Calculator<T> calculator, HashSet<char> wordDelimiters = null)
{
// The updated version returns (bestLoc, bestLength, scoreThreshold). When
// wordDelimiters is non-null, a hit is only recorded if it both starts and
// ends on a word boundary; when it is null every hit counts and bestLength
// stays 0.
// Initialise the alphabet.
var s = Alphabet(pattern, calculator);
@ -96,8 +97,11 @@ namespace NzbDrone.Common.Extensions
// Initialise the bit arrays.
var matchmask = calculator.LeftShift(calculator.One, pattern.Length - 1);
var bestLoc = -1;
var bestLength = 0;
var lastRd = Array.Empty<T>();
// lastMd keeps the previous pass's md array, mirroring lastRd for rd.
var lastMd = Array.Empty<List<int>>();
for (var d = 0; d < pattern.Length; d++)
{
// Scan for the best match; each iteration allows for one more error.
@ -106,42 +110,117 @@ namespace NzbDrone.Common.Extensions
var rd = new T[finish + 2];
rd[finish + 1] = calculator.Subtract(calculator.LeftShift(calculator.One, d), calculator.One);
// md[j] holds candidate match lengths for a match starting at text index
// j - 1; entries below finish + 1 are only filled when wordDelimiters is set.
var md = new List<int>[finish + 2];
md[finish + 1] = new List<int>();
for (var j = finish; j >= start; j--)
{
T charMatch;
if (text.Length <= j - 1 || !s.ContainsKey(text[j - 1]))
T rd_exact, rd_last, rd_curr, rd_a, rd_b;
List<int> md_exact, md_last, md_curr, md_a, md_b;
if (text.Length <= j - 1 || !s.TryGetValue(text[j - 1], out charMatch))
{
// Out of range.
charMatch = calculator.Zero;
}
else
{
charMatch = s[text[j - 1]];
}
if (d == 0)
{
// First pass: exact match.
rd[j] = calculator.BitwiseAnd(calculator.BitwiseOr(calculator.LeftShift(rd[j + 1], 1), calculator.One), charMatch);
if (wordDelimiters != null)
{
// An exact hit extends every length recorded at j + 1 by one, or
// starts a new candidate of length 1 when none is recorded there.
if (calculator.NotEqual(rd[j], calculator.Zero))
{
md[j] = md[j + 1].Any() ? md[j + 1].SelectList(x => x + 1) : new List<int> { 1 };
}
else
{
md[j] = new List<int>();
}
}
}
else
{
// Subsequent passes: fuzzy match.
rd[j] = calculator.BitwiseOr(calculator.BitwiseAnd(calculator.BitwiseOr(calculator.LeftShift(rd[j + 1], 1), calculator.One), charMatch),
calculator.BitwiseOr(calculator.BitwiseOr(calculator.LeftShift(calculator.BitwiseOr(lastRd[j + 1], lastRd[j]), 1), calculator.One), lastRd[j + 1]));
// state if we assume exact match on char j
rd_exact = calculator.BitwiseAnd(calculator.BitwiseOr(calculator.LeftShift(rd[j + 1], 1), calculator.One), charMatch);
// state if we assume substitution on char j
rd_a = calculator.LeftShift(lastRd[j + 1], 1);
// state if we assume deletion on char j
rd_b = calculator.LeftShift(lastRd[j], 1);
// state if we assume insertion at char j
rd_last = lastRd[j + 1];
// the final state for this pass
rd_curr = calculator.BitwiseOr(rd_exact,
calculator.BitwiseOr(rd_a,
calculator.BitwiseOr(rd_b,
calculator.BitwiseOr(calculator.One,
rd_last))));
rd[j] = rd_curr;
if (wordDelimiters != null)
{
// exact match
if (calculator.NotEqual(rd_exact, calculator.Zero))
{
md_exact = md[j + 1].Any() ? md[j + 1].SelectList(x => x + 1) : new List<int> { 1 };
}
else
{
md_exact = new List<int>();
}
// substitution
md_a = lastMd[j + 1].Any() ? lastMd[j + 1].SelectList(x => x + 1) : new List<int> { 1 };
// deletion
md_b = lastMd[j].Any() ? lastMd[j] : new List<int> { 1 };
// insertion
// NOTE(review): the guard tests lastMd[j] but the projection reads
// lastMd[j + 1]; the substitution line above guards on lastMd[j + 1].
// Confirm which index the guard is meant to use.
md_last = lastMd[j].Any() ? lastMd[j + 1].SelectList(x => x + 1) : new List<int> { 1 };
// combined
md_curr = md_exact.Concat(md_a).Concat(md_b).Concat(md_last).Distinct().ToList();
md[j] = md_curr;
}
}
if (calculator.NotEqual(calculator.BitwiseAnd(rd[j], matchmask), calculator.Zero))
{
var score = BitapScore(d, pattern);
// This match will almost certainly be better than any existing
// match. But check anyway.
if (score >= scoreThreshold)
var score = BitapScore(d, pattern);
bool isOnWordBoundary;
var endsOnWordBoundaryLength = 0;
if (wordDelimiters != null)
{
// NOTE(review): text[j - 1] and text[j - 2] are indexed here without the
// `text.Length <= j - 1` guard used for charMatch above. Confirm j cannot
// exceed text.Length when a match bit is set.
var startsOnWordBoundary = (j - 1 == 0 || wordDelimiters.Contains(text[j - 2])) && !wordDelimiters.Contains(text[j - 1]);
// NOTE(review): a candidate of length x starting at index j - 1 ends
// (exclusive) at index j - 1 + x. The guard `j + x >= text.Length` also
// accepts a candidate that stops one character short of the end of text
// without inspecting that character. Confirm whether
// `j - 1 + x >= text.Length` was intended.
endsOnWordBoundaryLength = md[j].FirstOrDefault(x => (j + x >= text.Length || wordDelimiters.Contains(text[j - 1 + x])) && !wordDelimiters.Contains(text[j - 1]));
isOnWordBoundary = startsOnWordBoundary && endsOnWordBoundaryLength > 0;
}
else
{
isOnWordBoundary = true;
}
if (score >= scoreThreshold && isOnWordBoundary)
{
// Told you so.
scoreThreshold = score;
bestLoc = j - 1;
bestLength = endsOnWordBoundaryLength;
}
}
}
@ -153,9 +232,10 @@ namespace NzbDrone.Common.Extensions
}
lastRd = rd;
lastMd = md;
}
return new Tuple<int, double>(bestLoc, scoreThreshold);
return new Tuple<int, int, double>(bestLoc, bestLength, scoreThreshold);
}
/**

@ -220,14 +220,30 @@ namespace NzbDrone.Core.Test.ParserTests
parseResult.DiscographyEnd.Should().Be(endyear);
}
// Each case supplies search criteria (author, book) and a report name; the
// parse with those criteria is expected to return null. This diff replaces the
// single hard-coded Abba / Black Sabbath case with a parameterised set.
[Test]
public void should_not_parse_author_name_and_book_title_by_incorrect_search_criteria()
[TestCase("Abba", "Abba", "Black Sabbath Black Sabbath FLAC")]
[TestCase("Anthony Horowitz", "Oblivion", "The Elder Scrolls IV Oblivion+Expansions")]
[TestCase("Danielle Steel", "Zoya", "DanielleSteelZoya.zip")]
[TestCase("Stephen King", "It", "Stephen Kingston - Spirit Doll (retail) (azw3)")]
[TestCase("Stephen King", "It", "Stephen_Cleobury-The_Music_of_Kings_Choral_Favourites_from_Cambridge-WEB-2019-ENRiCH")]
[TestCase("Stephen King", "Guns", "Stephen King - The Gunslinger: Dark Tower 1 MP3")]
[TestCase("Rick Riordan", "An Interview with Rick Riordan", "AnInterviewwithRickRiordan_ep6")]
public void should_not_parse_author_name_and_book_title_by_incorrect_search_criteria(string searchAuthor, string searchBook, string report)
{
GivenSearchCriteria("Abba", "Abba");
var parseResult = Parser.Parser.ParseBookTitleWithSearchCriteria("Black Sabbath Black Sabbath FLAC", _author, _books);
GivenSearchCriteria(searchAuthor, searchBook);
var parseResult = Parser.Parser.ParseBookTitleWithSearchCriteria(report, _author, _books);
parseResult.Should().BeNull();
}
// Positive counterpart to the rejection cases: with the given search criteria
// the report must parse, and the parsed author name and book title must equal
// the expected values.
[TestCase("James Herbert", "48", "James Hertbert Collection/'48 - James Herbert (epub)", "James Herbert", "48")]
public void should_parse_with_search_criteria(string searchAuthor, string searchBook, string report, string expectedAuthor, string expectedBook)
{
    // Arrange
    GivenSearchCriteria(searchAuthor, searchBook);

    // Act
    var parsed = Parser.Parser.ParseBookTitleWithSearchCriteria(report, _author, _books);

    // Assert
    var actualAuthor = parsed.AuthorName;
    actualAuthor.Should().Be(expectedAuthor);

    var actualBook = parsed.BookTitle;
    actualBook.Should().Be(expectedBook);
}
[TestCase("Ed Sheeran", "I See Fire", "Ed Sheeran I See Fire[Mimp3.eu].mp3 FLAC")]
[TestCase("Ed Sheeran", "Divide", "Ed Sheeran ? Divide FLAC")]
[TestCase("Ed Sheeran", "+", "Ed Sheeran + FLAC")]

@ -203,6 +203,7 @@ namespace NzbDrone.Core.Parser
private static readonly Regex YearInTitleRegex = new Regex(@"^(?<title>.+?)(?:\W|_)?(?<year>\d{4})",
RegexOptions.IgnoreCase | RegexOptions.Compiled);
private static readonly HashSet<char> WordDelimiters = new HashSet<char>(" .,_-=()[]|\"`'");
private static readonly Regex WordDelimiterRegex = new Regex(@"(\s|\.|,|_|-|=|\(|\)|\[|\]|\|)+", RegexOptions.Compiled);
private static readonly Regex PunctuationRegex = new Regex(@"[^\w\s]", RegexOptions.Compiled);
private static readonly Regex CommonWordRegex = new Regex(@"\b(a|an|the|and|or|of)\b\s?", RegexOptions.IgnoreCase | RegexOptions.Compiled);
@ -352,7 +353,7 @@ namespace NzbDrone.Core.Parser
simpleTitle = CleanTorrentSuffixRegex.Replace(simpleTitle);
// Picks the book whose single monitored edition title best matches simpleTitle,
// then takes that monitored edition.
// NOTE(review): the FuzzyMatch sort key is the whole Tuple<int, int, double>
// (start index, length, score). System.Tuple compares element by element from
// Item1, so this orders by match position before score, whereas the
// FuzzyContains key was the score alone. Confirm whether `.Item3` was intended.
var bestBook = books
.OrderByDescending(x => simpleTitle.FuzzyContains(x.Editions.Value.Single(x => x.Monitored).Title))
.OrderByDescending(x => simpleTitle.FuzzyMatch(x.Editions.Value.Single(x => x.Monitored).Title, wordDelimiters: WordDelimiters))
.First()
.Editions.Value
.Single(x => x.Monitored);
@ -419,69 +420,18 @@ namespace NzbDrone.Core.Parser
Logger.Trace($"Finding '{name}' in '{report}'");
var (locStart, score) = report.ToLowerInvariant().FuzzyMatch(name.ToLowerInvariant(), 0.6);
var (locStart, matchLength, score) = report.ToLowerInvariant().FuzzyMatch(name.ToLowerInvariant(), 0.6, WordDelimiters);
if (locStart == -1)
{
return null;
}
var diff = (int)Math.Round((1.0 - score) * name.Length, 0);
var length = Math.Min(name.Length + diff, report.Length - locStart);
var found = report.Substring(locStart, matchLength);
var reportReversed = new string(report.Substring(locStart, length).ToLowerInvariant().Reverse().ToArray());
var nameReversed = new string(name.ToLowerInvariant().Reverse().ToArray());
var locEnd = locStart + reportReversed.Length - reportReversed.FuzzyFind(nameReversed, 0.6);
var boundaries = WordDelimiterRegex.Matches(report);
if (boundaries.Count == 0)
{
return null;
}
var starts = new List<int>();
var finishes = new List<int>();
if (boundaries[0].Index == 0)
{
starts.Add(boundaries[0].Length);
}
else
{
starts.Add(0);
}
foreach (Match match in boundaries)
{
var start = match.Index + match.Length;
if (start < report.Length)
{
starts.Add(start);
}
var finish = match.Index - 1;
if (finish >= 0)
{
finishes.Add(finish);
}
}
var lastMatch = boundaries[boundaries.Count - 1];
if (lastMatch.Index + lastMatch.Length < report.Length)
{
finishes.Add(report.Length - 1);
}
var wordStart = starts.OrderBy(x => Math.Abs(x - locStart)).First();
var wordEnd = finishes.OrderBy(x => Math.Abs(x - locEnd)).First();
var found = report.Substring(wordStart, wordEnd - wordStart + 1);
if (found.ToLowerInvariant().FuzzyMatch(name.ToLowerInvariant()) >= 0.8)
if (score >= 0.8)
{
remainder = report.Remove(wordStart, wordEnd - wordStart + 1);
remainder = report.Remove(locStart, matchLength);
return found.Replace('.', ' ').Replace('_', ' ');
}

@ -219,7 +219,7 @@ namespace NzbDrone.Core.Parser
foreach (var book in possibleBooks)
{
var bookMatch = title.FuzzyMatch(book.Title, 0.5);
var score = (authorMatch.Item2 + bookMatch.Item2) / 2;
var score = (authorMatch.Item3 + bookMatch.Item3) / 2;
_logger.Trace($"Book {book} has score {score}");
@ -234,7 +234,7 @@ namespace NzbDrone.Core.Parser
foreach (var edition in possibleEditions)
{
var editionMatch = title.FuzzyMatch(edition.Title, 0.5);
var score = (authorMatch.Item2 + editionMatch.Item2) / 2;
var score = (authorMatch.Item3 + editionMatch.Item3) / 2;
_logger.Trace($"Edition {edition} has score {score}");

Loading…
Cancel
Save