From 2a4f681b1791883ec064fb5463d1d5ebd352df50 Mon Sep 17 00:00:00 2001 From: ta264 Date: Wed, 19 Jan 2022 21:38:37 +0000 Subject: [PATCH] Fixed: Speed up RSS sync --- .../LevenshteinDistanceFixture.cs | 41 +- .../Extensions/BerghelRoach.cs | 478 ++++++++++++++++++ .../Extensions/FuzzyContains.cs | 132 +++-- .../Extensions/LevenstheinExtensions.cs | 61 --- .../SearchArtistComparerFixture.cs | 71 --- .../FindByNameInexactFixture.cs | 18 +- .../Books/Services/AuthorService.cs | 18 +- .../MetadataSource/SearchAuthorComparer.cs | 95 ---- 8 files changed, 595 insertions(+), 319 deletions(-) create mode 100644 src/NzbDrone.Common/Extensions/BerghelRoach.cs delete mode 100644 src/NzbDrone.Common/Extensions/LevenstheinExtensions.cs delete mode 100644 src/NzbDrone.Core.Test/MetadataSource/SearchArtistComparerFixture.cs delete mode 100644 src/NzbDrone.Core/MetadataSource/SearchAuthorComparer.cs diff --git a/src/NzbDrone.Common.Test/LevenshteinDistanceFixture.cs b/src/NzbDrone.Common.Test/LevenshteinDistanceFixture.cs index aff7e9738..5916b6c50 100644 --- a/src/NzbDrone.Common.Test/LevenshteinDistanceFixture.cs +++ b/src/NzbDrone.Common.Test/LevenshteinDistanceFixture.cs @@ -1,4 +1,4 @@ -using FluentAssertions; +using FluentAssertions; using NUnit.Framework; using NzbDrone.Common.Extensions; using NzbDrone.Test.Common; @@ -25,24 +25,6 @@ namespace NzbDrone.Common.Test text.LevenshteinDistance(other).Should().Be(expected); } - [TestCase("", "", 0)] - [TestCase("abc", "abc", 0)] - [TestCase("abc", "abcd", 1)] - [TestCase("abcd", "abc", 3)] - [TestCase("abc", "abd", 3)] - [TestCase("abc", "adc", 3)] - [TestCase("abcdefgh", "abcghdef", 8)] - [TestCase("a.b.c.", "abc", 0)] - [TestCase("Agents of shield", "Marvel's Agents Of S.H.I.E.L.D.", 9)] - [TestCase("Agents of shield", "Agents of cracked", 14)] - [TestCase("Agents of shield", "the shield", 24)] - [TestCase("ABCxxx", "ABC1xx", 3)] - [TestCase("ABC1xx", "ABCxxx", 3)] - public void LevenshteinDistanceClean(string text, string 
other, int expected) - { - text.ToLower().LevenshteinDistanceClean(other.ToLower()).Should().Be(expected); - } - [TestCase("hello", "hello")] [TestCase("hello", "bye")] [TestCase("a longer string", "a different long string")] @@ -58,5 +40,26 @@ namespace NzbDrone.Common.Test { a.FuzzyMatch(b).Should().Be(expected); } + + [TestCase("AVERY", "GARVEY", 3)] + [TestCase("ADCROFT", "ADDESSI", 5)] + [TestCase("BAIRD", "BAISDEN", 3)] + [TestCase("BOGGAN", "BOGGS", 2)] + [TestCase("CLAYTON", "CLEARY", 5)] + [TestCase("DYBAS", "DYCKMAN", 4)] + [TestCase("EMINETH", "EMMERT", 4)] + [TestCase("GALANTE", "GALICKI", 4)] + [TestCase("HARDIN", "HARDING", 1)] + [TestCase("KEHOE", "KEHR", 2)] + [TestCase("LOWRY", "LUBARSKY", 5)] + [TestCase("MAGALLAN", "MAGANA", 3)] + [TestCase("MAYO", "MAYS", 1)] + [TestCase("MOENY", "MOFFETT", 4)] + [TestCase("PARE", "PARENT", 2)] + [TestCase("RAMEY", "RAMFREY", 2)] + public void BMtest(string a, string b, int expected) + { + ModifiedBerghelRoachEditDistance.GetDistance(a, b, 10).Should().Be(expected); + } } } diff --git a/src/NzbDrone.Common/Extensions/BerghelRoach.cs b/src/NzbDrone.Common/Extensions/BerghelRoach.cs new file mode 100644 index 000000000..1e2e4a657 --- /dev/null +++ b/src/NzbDrone.Common/Extensions/BerghelRoach.cs @@ -0,0 +1,478 @@ +/* + * Copyright 2010 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ */ + +using System; +using System.Diagnostics; + +namespace NzbDrone.Common.Extensions +{ + /** + * A modified version of a string edit distance described by Berghel and + * Roach that uses only O(d) space and O(n*d) worst-case time, where n is + * the pattern string length and d is the edit distance computed. + * We achieve the space reduction by keeping only those sub-computations + * required to compute edit distance, giving up the ability to + * reconstruct the edit path. + */ + public static class ModifiedBerghelRoachEditDistance + { + /* + * This is a modification of the original Berghel-Roach edit + * distance (based on prior work by Ukkonen) described in + * ACM Transactions on Information Systems, Vol. 14, No. 1, + * January 1996, pages 94-106. + * + * I observed that only O(d) prior computations are required + * to compute edit distance. Rather than keeping all prior + * f(k,p) results in a matrix, we keep only the two "outer edges" + * in the triangular computation pattern that will be used in + * subsequent rounds. We cannot reconstruct the edit path, + * but many applications do not require that; for them, this + * modification uses less space (and empirically, slightly + * less time). + * + * First, some history behind the algorithm necessary to understand + * Berghel-Roach and our modification... + * + * The traditional algorithm for edit distance uses dynamic programming, + * building a matrix of distances for substrings: + * D[i,j] holds the distance for string1[0..i]=>string2[0..j]. + * The matrix is initially populated with the trivial values + * D[0,j]=j and D[i,0]=i; and then expanded with the rule: + *
+         *    D[i,j] = min( D[i-1,j]+1,       // insertion
+         *                  D[i,j-1]+1,       // deletion
+         *                  (D[i-1,j-1]
+         *                   + (string1[i]==string2[j])
+         *                      ? 0           // match
+         *                      : 1           // substitution ) )
+         * 
+ * + * Ukkonen observed that each diagonal of the matrix must increase + * by either 0 or 1 from row to row. If D[i,j] = p, then the + * matching rule requires that D[i+x,j+x] = p for all x + * where string1[i..i+x) matches string2[j..j+x). Ukkonen + * defined a function f(k,p) as the highest row number in which p + * appears on the k-th diagonal (those D[i,j] where k=(i-j), noting + * that k may be negative). The final result of the edit + * distance is the D[n,m] cell, on the (n-m) diagonal; it is + * the value of p for which f(n-m, p) = m. The function f can + * also be computed dynamically, according to a simple recursion: + *
+         *    f(k,p) {
+         *      contains_p = max(f(k-1,p-1), f(k,p-1)+1, f(k+1,p-1)+1)
+         *      while (string1[contains_p] == string2[contains_p + k])
+         *        contains_p++;
+         *      return contains_p;
+         *    }
+         * 
+ * The max() expression finds a row where the k-th diagonal must + * contain p by virtue of an edit from the prior, same, or following + * diagonal (corresponding to an insert, substitute, or delete); + * we need not consider more distant diagonals because row-to-row + * and column-to-column changes are at most +/- 1. + * + * The original Ukkonen algorithm computed f(k,p) roughly as + * follows: + *
+         *    for (p = 0; ; p++) {
+         *      compute f(k,p) for all valid k
+         *      if (f(n-m, p) == m) return p;
+         *    }
+         * 
+ * + * Berghel and Roach observed that many values of f(k,p) are + * computed unnecessarily, and reorganized the computation into + * a just-in-time sequence. In each iteration, we are primarily + * interested in the terminating value f(main,p), where main=(n-m) + * is the main diagonal. To compute that we need f(x,p-1) for + * three values of x: main-1, main, and main+1. Those depend on + * values for p-2, and so forth. We will already have computed + * f(main,p-1) in the prior round, and thus f(main-1,p-2) and + * f(main+1,p-2), and so forth. The only new values we need to compute + * are on the edges: f(main-i,p-i) and f(main+i,p-i). Noting that + * f(k,p) is only meaningful when abs(k) is no greater than p, + * one of the Berghel-Roach reviewers noted that we can compute + * the bounds for i: + *
+         *    (main+i ≤ p-i) implies (i ≤ (p-main)/2)
+         * 
+ * (where main+i is limited on the positive side) and similarly + *
+         *    (-(main-i) ≤ p-i) implies (i ≤ (p+main)/2).
+         * 
+ * (where main-i is limited on the negative side). + * + * This reduces the computation sequence to + *
+         *   for (i = (p-main)/2; i > 0; i--) compute f(main+i,p-i);
+         *   for (i = (p+main)/2; i > 0; i--) compute f(main-i,p-i);
+         *   if (f(main, p) == m) return p;
+         * 
+ * + * The original Berghel-Roach algorithm recorded prior values + * of f(k,p) in a matrix, using O(distance^2) space, enabling + * reconstruction of the edit path, but if all we want is the + * edit *distance*, we only need to keep O(distance) prior computations. + * + * The requisite prior k-1, k, and k+1 values are conveniently + * computed in the current round and the two preceding it. + * For example, on the higher-diagonal side, we compute: + *
+         *    current[i] = f(main+i, p-i)
+         * 
+ * We keep the two prior rounds of results, where p was one and two + * smaller. So, from the preceding round + *
+         *    last[i] = f(main+i, (p-1)-i)
+         * 
+ * and from the prior round, but one position back: + *
+         *    prior[i-1] = f(main+(i-1), (p-2)-(i-1))
+         * 
+ * In the current round, one iteration earlier: + *
+         *    current[i+1] = f(main+(i+1), p-(i+1))
+         * 
+ * Note that the distance in all of these evaluates to p-i-1, + * and the diagonals are (main+i) and its neighbors... just + * what we need. The lower-diagonal side behaves similarly. + * + * We need to materialize values that are not computed in prior + * rounds, for either of two reasons: + * In all of these cases, the missing f(k,p) values are for abs(k) > p, + * where a real value of f(k,p) is undefined. [The original Berghel-Roach + * algorithm prefills its F matrix with these values, but we fill + * them as we go, as needed.] We define + *
+         *    f(-p-1,p) = p, so that we start diagonal -p with row p,
+         *    f(p+1,p) = -1, so that we start diagonal p with row 0.
+         * 
+ * (We also allow f(p+2,p)=f(-p-2,p)=-1, causing those values to + * have no effect in the starting row computation.] + * + * We only expand the set of diagonals visited every other round, + * when (p-main) or (p+main) is even. We keep track of even/oddness + * to save some arithmetic. The first round is always even, as p=abs(main). + * Note that we rename the "f" function to "computeRow" to be Googley. + */ + + public static int LevenshteinDistance(this string text, string other) + { + return ModifiedBerghelRoachEditDistance.GetDistance(text, other); + } + + public static int GetDistance(string target, string pattern, int limit = 20) + { + return GetDistance(target.ToCharArray(), pattern.ToCharArray(), limit); + } + + public static int GetDistance(char[] target, char[] pattern, int limit = 20) + { + var currentLeft = new int[limit]; + + var currentRight = new int[limit]; + + var lastLeft = new int[limit]; + + var lastRight = new int[limit]; + + var priorLeft = new int[limit]; + + var priorRight = new int[limit]; + + var targetLength = target.Length; + + /* + * Compute the main diagonal number. + * The final result lies on this diagonal. + */ + var main = pattern.Length - targetLength; + + /* + * Compute our initial distance candidate. + * The result cannot be less than the difference in + * string lengths, so we start there. + */ + var distance = Math.Abs(main); + if (distance > limit) + { + /* More than we wanted. Give up right away */ + return int.MaxValue; + } + + /* + * In the main loop below, the current{Right,Left} arrays record results + * from the current outer loop pass. The last{Right,Left} and + * prior{Right,Left} arrays hold the results from the preceding two passes. + * At the end of the outer loop, we shift them around (reusing the prior + * array as the current for the next round, to avoid reallocating). + * The Right reflects higher-numbered diagonals, Left lower-numbered. 
+ */ + + /* + * Fill in "prior" values for the first two passes through + * the distance loop. Note that we will execute only one side of + * the main diagonal in these passes, so we only need + * initialize one side of prior values. + */ + + if (main <= 0) + { + EnsureCapacityRight(ref currentRight, ref lastRight, ref priorRight, distance, false); + for (var j = 0; j <= distance; j++) + { + lastRight[j] = distance - j - 1; /* Make diagonal -k start in row k */ + priorRight[j] = -1; + } + } + else + { + EnsureCapacityLeft(ref currentLeft, ref lastLeft, ref priorLeft, distance, false); + for (var j = 0; j <= distance; j++) + { + lastLeft[j] = -1; /* Make diagonal +k start in row 0 */ + priorLeft[j] = -1; + } + } + + /* + * Keep track of even rounds. Only those rounds consider new diagonals, + * and thus only they require artificial "last" values below. + */ + var even = true; + + /* + * MAIN LOOP: try each successive possible distance until one succeeds. + */ + while (true) + { + /* + * Before calling computeRow(main, distance), we need to fill in + * missing cache elements. See the high-level description above. 
+ */ + + /* + * Higher-numbered diagonals + */ + + var offDiagonal = (distance - main) / 2; + EnsureCapacityRight(ref currentRight, ref lastRight, ref priorRight, offDiagonal, true); + + if (even) + { + /* Higher diagonals start at row 0 */ + lastRight[offDiagonal] = -1; + } + + var immediateRight = -1; + for (; offDiagonal > 0; offDiagonal--) + { + currentRight[offDiagonal] = immediateRight = ComputeRow( + main + offDiagonal, + distance - offDiagonal, + pattern, + target, + priorRight[offDiagonal - 1], + lastRight[offDiagonal], + immediateRight); + } + + /* + * Lower-numbered diagonals + */ + + offDiagonal = (distance + main) / 2; + EnsureCapacityLeft(ref currentLeft, ref lastLeft, ref priorLeft, offDiagonal, true); + + if (even) + { + /* Lower diagonals, fictitious values for f(-x-1,x) = x */ + lastLeft[offDiagonal] = ((distance - main) / 2) - 1; + } + + var immediateLeft = even ? -1 : (distance - main) / 2; + + for (; offDiagonal > 0; offDiagonal--) + { + currentLeft[offDiagonal] = immediateLeft = ComputeRow( + main - offDiagonal, + distance - offDiagonal, + pattern, + target, + immediateLeft, + lastLeft[offDiagonal], + priorLeft[offDiagonal - 1]); + } + + /* + * We are done if the main diagonal has distance in the last row. + */ + var mainRow = ComputeRow(main, distance, pattern, target, immediateLeft, lastLeft[0], immediateRight); + + if ((mainRow == targetLength) || (++distance > limit) || (distance < 0)) + { + break; + } + + /* The [0] element goes to both sides. 
*/ + currentLeft[0] = currentRight[0] = mainRow; + + /* Rotate rows around for next round: current=>last=>prior (=>current) */ + var tmp = priorLeft; + priorLeft = lastLeft; + lastLeft = currentLeft; + currentLeft = priorLeft; + + tmp = priorRight; + priorRight = lastRight; + lastRight = currentRight; + currentRight = tmp; + + /* Update evenness, too */ + even = !even; + } + + return distance; + } + + /** + * Computes the highest row in which the distance {@code p} appears + * in diagonal {@code k} of the edit distance computation for + * strings {@code a} and {@code b}. The diagonal number is + * represented by the difference in the indices for the two strings; + * it can range from {@code -b.length()} through {@code a.length()}. + * + * More precisely, this computes the highest value x such that + *
+ *     p = edit-distance(a[0:(x+k)), b[0:x)).
+ * 
+ * + * This is the "f" function described by Ukkonen. + * + * The caller must assure that abs(k) ≤ p, the only values for + * which this is well-defined. + * + * The implementation depends on the cached results of prior + * computeRow calls for diagonals k-1, k, and k+1 for distance p-1. + * These must be supplied in {@code knownLeft}, {@code knownAbove}, + * and {@code knownRight}, respectively. + * @param k diagonal number + * @param p edit distance + * @param a one string to be compared + * @param b other string to be compared + * @param knownLeft value of {@code computeRow(k-1, p-1, ...)} + * @param knownAbove value of {@code computeRow(k, p-1, ...)} + * @param knownRight value of {@code computeRow(k+1, p-1, ...)} + */ + private static int ComputeRow(int k, + int p, + char[] a, + char[] b, + int knownLeft, + int knownAbove, + int knownRight) + { + Debug.Assert(Math.Abs(k) <= p); + Debug.Assert(p >= 0); + + /* + * Compute our starting point using the recurrance. + * That is, find the first row where the desired edit distance + * appears in our diagonal. This is at least one past + * the highest row for + */ + int t; + if (p == 0) + { + t = 0; + } + else + { + /* + * We look at the adjacent diagonals for the next lower edit distance. + * We can start in the next row after the prior result from + * our own diagonal (the "substitute" case), or the next diagonal + * ("delete"), but only the same row as the prior result from + * the prior diagonal ("insert"). + */ + t = Math.Max(Math.Max(knownAbove, knownRight) + 1, knownLeft); + } + + /* + * Look down our diagonal for matches to find the maximum + * row with edit-distance p. + */ + var tmax = Math.Min(b.Length, a.Length - k); + + while ((t < tmax) && b[t] == a[t + k]) + { + t++; + } + + return t; + } + +/* + * Ensures that the Left arrays can be indexed through {@code index}, + * inclusively, resizing (and copying) as necessary. 
+ */ + private static void EnsureCapacityLeft(ref int[] currentLeft, ref int[] lastLeft, ref int[] priorLeft, int index, bool copy) + { + if (currentLeft.Length <= index) + { + index++; + Resize(ref priorLeft, index, copy); + Resize(ref lastLeft, index, copy); + Resize(ref currentLeft, index, false); + } + } + +/* + * Ensures that the Right arrays can be indexed through {@code index}, + * inclusively, resizing (and copying) as necessary. + */ + private static void EnsureCapacityRight(ref int[] currentRight, ref int[] lastRight, ref int[] priorRight, int index, bool copy) + { + if (currentRight.Length <= index) + { + index++; + Resize(ref priorRight, index, copy); + Resize(ref lastRight, index, copy); + Resize(ref currentRight, index, false); + } + } + +/* Resize an array, copying old contents if requested */ + private static void Resize(ref int[] array, int size, bool copy) + { + if (copy) + { + Array.Resize(ref array, size); + } + else + { + array = new int[size]; + } + } + } +} diff --git a/src/NzbDrone.Common/Extensions/FuzzyContains.cs b/src/NzbDrone.Common/Extensions/FuzzyContains.cs index 7bbe75b20..6daf438a5 100644 --- a/src/NzbDrone.Common/Extensions/FuzzyContains.cs +++ b/src/NzbDrone.Common/Extensions/FuzzyContains.cs @@ -1,4 +1,4 @@ -/* +/* * This file incorporates work covered by the following copyright and * permission notice: * @@ -65,7 +65,17 @@ namespace NzbDrone.Common.Extensions } // Do a fuzzy compare. - return MatchBitap(text, pattern, matchThreshold); + if (pattern.Length < 32) + { + return MatchBitap(text, pattern, matchThreshold, new IntCalculator()); + } + + if (pattern.Length < 64) + { + return MatchBitap(text, pattern, matchThreshold, new LongCalculator()); + } + + return MatchBitap(text, pattern, matchThreshold, new BigIntCalculator()); } /** @@ -75,38 +85,34 @@ namespace NzbDrone.Common.Extensions * @param pattern The pattern to search for. * @return Best match index or -1. 
*/ - private static Tuple MatchBitap(string text, string pattern, double matchThreshold) + private static Tuple MatchBitap(string text, string pattern, double matchThreshold, Calculator calculator) { // Initialise the alphabet. - Dictionary s = alphabet(pattern); - - // don't keep creating new BigInteger(1) - var big1 = new BigInteger(1); + var s = Alphabet(pattern, calculator); - // Lowest score belowe which we give up. - var score_threshold = matchThreshold; + // Lowest score below which we give up. + var scoreThreshold = matchThreshold; // Initialise the bit arrays. - var matchmask = big1 << (pattern.Length - 1); - int best_loc = -1; + var matchmask = calculator.LeftShift(calculator.One, pattern.Length - 1); + var bestLoc = -1; - // Empty initialization added to appease C# compiler. - var last_rd = new BigInteger[0]; - for (int d = 0; d < pattern.Length; d++) + var lastRd = Array.Empty(); + for (var d = 0; d < pattern.Length; d++) { // Scan for the best match; each iteration allows for one more error. - int start = 1; - int finish = text.Length + pattern.Length; + var start = 1; + var finish = text.Length + pattern.Length; - var rd = new BigInteger[finish + 2]; - rd[finish + 1] = (big1 << d) - big1; - for (int j = finish; j >= start; j--) + var rd = new T[finish + 2]; + rd[finish + 1] = calculator.Subtract(calculator.LeftShift(calculator.One, d), calculator.One); + for (var j = finish; j >= start; j--) { - BigInteger charMatch; + T charMatch; if (text.Length <= j - 1 || !s.ContainsKey(text[j - 1])) { // Out of range. - charMatch = 0; + charMatch = calculator.Zero; } else { @@ -116,40 +122,40 @@ namespace NzbDrone.Common.Extensions if (d == 0) { // First pass: exact match. - rd[j] = ((rd[j + 1] << 1) | big1) & charMatch; + rd[j] = calculator.BitwiseAnd(calculator.BitwiseOr(calculator.LeftShift(rd[j + 1], 1), calculator.One), charMatch); } else { // Subsequent passes: fuzzy match. 
- rd[j] = ((rd[j + 1] << 1) | big1) & charMatch - | (((last_rd[j + 1] | last_rd[j]) << 1) | big1) | last_rd[j + 1]; + rd[j] = calculator.BitwiseOr(calculator.BitwiseAnd(calculator.BitwiseOr(calculator.LeftShift(rd[j + 1], 1), calculator.One), charMatch), + calculator.BitwiseOr(calculator.BitwiseOr(calculator.LeftShift(calculator.BitwiseOr(lastRd[j + 1], lastRd[j]), 1), calculator.One), lastRd[j + 1])); } - if ((rd[j] & matchmask) != 0) + if (calculator.NotEqual(calculator.BitwiseAnd(rd[j], matchmask), calculator.Zero)) { - var score = bitapScore(d, pattern); + var score = BitapScore(d, pattern); // This match will almost certainly be better than any existing // match. But check anyway. - if (score >= score_threshold) + if (score >= scoreThreshold) { // Told you so. - score_threshold = score; - best_loc = j - 1; + scoreThreshold = score; + bestLoc = j - 1; } } } - if (bitapScore(d + 1, pattern) < score_threshold) + if (BitapScore(d + 1, pattern) < scoreThreshold) { // No hope for a (better) match at greater error levels. break; } - last_rd = rd; + lastRd = rd; } - return new Tuple(best_loc, score_threshold); + return new Tuple(bestLoc, scoreThreshold); } /** @@ -158,7 +164,7 @@ namespace NzbDrone.Common.Extensions * @param pattern Pattern being sought. * @return Overall score for match (1.0 = good, 0.0 = bad). */ - private static double bitapScore(int e, string pattern) + private static double BitapScore(int e, string pattern) { return 1.0 - ((double)e / pattern.Length); } @@ -168,26 +174,70 @@ namespace NzbDrone.Common.Extensions * @param pattern The text to encode. * @return Hash of character locations. 
*/ - private static Dictionary alphabet(string pattern) + private static Dictionary Alphabet(string pattern, Calculator calculator) { - var s = new Dictionary(); - char[] char_pattern = pattern.ToCharArray(); - foreach (char c in char_pattern) + var s = new Dictionary(); + var charPattern = pattern.ToCharArray(); + foreach (var c in charPattern) { if (!s.ContainsKey(c)) { - s.Add(c, 0); + s.Add(c, calculator.Zero); } } - int i = 0; - foreach (char c in char_pattern) + var i = 0; + foreach (var c in charPattern) { - s[c] = s[c] | (new BigInteger(1) << (pattern.Length - i - 1)); + s[c] = calculator.BitwiseOr(s[c], calculator.LeftShift(calculator.One, pattern.Length - i - 1)); i++; } return s; } + + private abstract class Calculator + { + public abstract T Zero { get; } + public abstract T One { get; } + public abstract T Subtract(T a, T b); + public abstract T LeftShift(T a, int shift); + public abstract T BitwiseOr(T a, T b); + public abstract T BitwiseAnd(T a, T b); + public abstract bool NotEqual(T a, T b); + } + + private sealed class BigIntCalculator : Calculator + { + public override BigInteger Zero => new BigInteger(0); + public override BigInteger One => new BigInteger(1); + public override BigInteger Subtract(BigInteger a, BigInteger b) => a - b; + public override BigInteger LeftShift(BigInteger a, int shift) => a << shift; + public override BigInteger BitwiseOr(BigInteger a, BigInteger b) => a | b; + public override BigInteger BitwiseAnd(BigInteger a, BigInteger b) => a & b; + public override bool NotEqual(BigInteger a, BigInteger b) => a != b; + } + + private sealed class IntCalculator : Calculator + { + public override int Zero => 0; + public override int One => 1; + public override int Subtract(int a, int b) => a - b; + public override int LeftShift(int a, int shift) => a << shift; + public override int BitwiseOr(int a, int b) => a | b; + public override int BitwiseAnd(int a, int b) => a & b; + public override bool NotEqual(int a, int b) => a != b; + } + 
+ private sealed class LongCalculator : Calculator + { + public override long Zero => 0; + public override long One => 1; + public override long Subtract(long a, long b) => a - b; + public override long LeftShift(long a, int shift) => a << shift; + public override long BitwiseOr(long a, long b) => a | b; + public override long BitwiseAnd(long a, long b) => a & b; + public override bool NotEqual(long a, long b) => a != b; + } } } diff --git a/src/NzbDrone.Common/Extensions/LevenstheinExtensions.cs b/src/NzbDrone.Common/Extensions/LevenstheinExtensions.cs deleted file mode 100644 index 825525457..000000000 --- a/src/NzbDrone.Common/Extensions/LevenstheinExtensions.cs +++ /dev/null @@ -1,61 +0,0 @@ -using System; - -namespace NzbDrone.Common.Extensions -{ - public static class LevenstheinExtensions - { - public static int LevenshteinDistance(this string text, string other, int costInsert = 1, int costDelete = 1, int costSubstitute = 1) - { - if (text == other) - { - return 0; - } - - if (text.Length == 0) - { - return other.Length * costInsert; - } - - if (other.Length == 0) - { - return text.Length * costDelete; - } - - int[] matrix = new int[other.Length + 1]; - - for (var i = 1; i < matrix.Length; i++) - { - matrix[i] = i * costInsert; - } - - for (var i = 0; i < text.Length; i++) - { - int topLeft = matrix[0]; - matrix[0] = matrix[0] + costDelete; - - for (var j = 0; j < other.Length; j++) - { - int top = matrix[j]; - int left = matrix[j + 1]; - - var sumIns = top + costInsert; - var sumDel = left + costDelete; - var sumSub = topLeft + (text[i] == other[j] ? 
0 : costSubstitute); - - topLeft = matrix[j + 1]; - matrix[j + 1] = Math.Min(Math.Min(sumIns, sumDel), sumSub); - } - } - - return matrix[other.Length]; - } - - public static int LevenshteinDistanceClean(this string expected, string other) - { - expected = expected.ToLower().Replace(".", ""); - other = other.ToLower().Replace(".", ""); - - return expected.LevenshteinDistance(other, 1, 3, 3); - } - } -} diff --git a/src/NzbDrone.Core.Test/MetadataSource/SearchArtistComparerFixture.cs b/src/NzbDrone.Core.Test/MetadataSource/SearchArtistComparerFixture.cs deleted file mode 100644 index 9c662a8c2..000000000 --- a/src/NzbDrone.Core.Test/MetadataSource/SearchArtistComparerFixture.cs +++ /dev/null @@ -1,71 +0,0 @@ -using System.Collections.Generic; -using System.Linq; -using FluentAssertions; -using NUnit.Framework; -using NzbDrone.Core.Books; -using NzbDrone.Core.MetadataSource; -using NzbDrone.Core.Test.Framework; - -namespace NzbDrone.Core.Test.MetadataSource -{ - [TestFixture] - public class SearchAuthorComparerFixture : CoreTest - { - private List _author; - - [SetUp] - public void Setup() - { - _author = new List(); - } - - private void WithSeries(string name) - { - _author.Add(new Author { Name = name }); - } - - [Test] - public void should_prefer_the_walking_dead_over_talking_dead_when_searching_for_the_walking_dead() - { - WithSeries("Talking Dead"); - WithSeries("The Walking Dead"); - - _author.Sort(new SearchAuthorComparer("the walking dead")); - - _author.First().Name.Should().Be("The Walking Dead"); - } - - [Test] - public void should_prefer_the_walking_dead_over_talking_dead_when_searching_for_walking_dead() - { - WithSeries("Talking Dead"); - WithSeries("The Walking Dead"); - - _author.Sort(new SearchAuthorComparer("walking dead")); - - _author.First().Name.Should().Be("The Walking Dead"); - } - - [Test] - public void should_prefer_blacklist_over_the_blacklist_when_searching_for_blacklist() - { - WithSeries("The Blacklist"); - WithSeries("Blacklist"); - - 
_author.Sort(new SearchAuthorComparer("blacklist")); - - _author.First().Name.Should().Be("Blacklist"); - } - - [Test] - public void should_prefer_the_blacklist_over_blacklist_when_searching_for_the_blacklist() - { - WithSeries("Blacklist"); - WithSeries("The Blacklist"); - - _author.Sort(new SearchAuthorComparer("the blacklist")); - - _author.First().Name.Should().Be("The Blacklist"); - } - } -} diff --git a/src/NzbDrone.Core.Test/MusicTests/ArtistServiceTests/FindByNameInexactFixture.cs b/src/NzbDrone.Core.Test/MusicTests/ArtistServiceTests/FindByNameInexactFixture.cs index 4efe12068..660c1de69 100644 --- a/src/NzbDrone.Core.Test/MusicTests/ArtistServiceTests/FindByNameInexactFixture.cs +++ b/src/NzbDrone.Core.Test/MusicTests/ArtistServiceTests/FindByNameInexactFixture.cs @@ -34,10 +34,8 @@ namespace NzbDrone.Core.Test.MusicTests.AuthorServiceTests .Returns(_authors); } - [TestCase("The Black Eyde Peas", "The Black Eyed Peas")] - [TestCase("Black Eyed Peas", "The Black Eyed Peas")] + [TestCase("The Black Eyd Peas", "The Black Eyed Peas")] [TestCase("The Black eys", "The Black Keys")] - [TestCase("Black Keys", "The Black Keys")] public void should_find_author_in_db_by_name_inexact(string name, string expected) { var author = Subject.FindByNameInexact(name); @@ -46,20 +44,6 @@ namespace NzbDrone.Core.Test.MusicTests.AuthorServiceTests author.Name.Should().Be(expected); } - [Test] - public void should_find_author_when_the_is_omitted_from_start() - { - _authors = new List(); - _authors.Add(CreateAuthor("Black Keys")); - _authors.Add(CreateAuthor("The Black Eyed Peas")); - - Mocker.GetMock() - .Setup(s => s.All()) - .Returns(_authors); - - Subject.FindByNameInexact("The Black Keys").Should().NotBeNull(); - } - [TestCase("The Black Peas")] public void should_not_find_author_in_db_by_ambiguous_name(string name) { diff --git a/src/NzbDrone.Core/Books/Services/AuthorService.cs b/src/NzbDrone.Core/Books/Services/AuthorService.cs index bea58bdca..daba55576 100644 --- 
a/src/NzbDrone.Core/Books/Services/AuthorService.cs +++ b/src/NzbDrone.Core/Books/Services/AuthorService.cs @@ -99,21 +99,10 @@ namespace NzbDrone.Core.Books Func, string, Tuple, string>> tc = Tuple.Create; var scoringFunctions = new List, string>> { - tc((a, t) => a.CleanName.FuzzyMatch(t), cleanTitle), - tc((a, t) => a.Name.FuzzyMatch(t), title), - tc((a, t) => a.Name.ToLastFirst().FuzzyMatch(t), title), - tc((a, t) => a.Metadata.Value.Aliases.Concat(new List { a.Name }).Max(x => x.CleanAuthorName().FuzzyMatch(t)), cleanTitle), + tc((a, t) => a.Metadata.Value.Name.FuzzyMatch(t), title), + tc((a, t) => a.Metadata.Value.NameLastFirst.FuzzyMatch(t), title) }; - if (title.StartsWith("The ", StringComparison.CurrentCultureIgnoreCase)) - { - scoringFunctions.Add(tc((a, t) => a.CleanName.FuzzyMatch(t), title.Substring(4).CleanAuthorName())); - } - else - { - scoringFunctions.Add(tc((a, t) => a.CleanName.FuzzyMatch(t), "the" + cleanTitle)); - } - return scoringFunctions; } @@ -151,9 +140,8 @@ namespace NzbDrone.Core.Books Func, string, Tuple, string>> tc = Tuple.Create; var scoringFunctions = new List, string>> { - tc((a, t) => t.FuzzyContains(a.CleanName), cleanReportTitle), tc((a, t) => t.FuzzyContains(a.Metadata.Value.Name), reportTitle), - tc((a, t) => t.FuzzyContains(a.Metadata.Value.Name.ToLastFirst()), reportTitle) + tc((a, t) => t.FuzzyContains(a.Metadata.Value.NameLastFirst), reportTitle) }; return scoringFunctions; diff --git a/src/NzbDrone.Core/MetadataSource/SearchAuthorComparer.cs b/src/NzbDrone.Core/MetadataSource/SearchAuthorComparer.cs deleted file mode 100644 index 8f19602db..000000000 --- a/src/NzbDrone.Core/MetadataSource/SearchAuthorComparer.cs +++ /dev/null @@ -1,95 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Text.RegularExpressions; -using NzbDrone.Common.Extensions; -using NzbDrone.Core.Books; - -namespace NzbDrone.Core.MetadataSource -{ - public class SearchAuthorComparer : IComparer - { - private static readonly Regex 
RegexCleanPunctuation = new Regex("[-._:]", RegexOptions.Compiled); - private static readonly Regex RegexCleanCountryYearPostfix = new Regex(@"(?<=.+)( \([A-Z]{2}\)| \(\d{4}\)| \([A-Z]{2}\) \(\d{4}\))$", RegexOptions.Compiled); - private static readonly Regex ArticleRegex = new Regex(@"^(a|an|the)\s", RegexOptions.IgnoreCase | RegexOptions.Compiled); - - public string SearchQuery { get; private set; } - - private readonly string _searchQueryWithoutYear; - private int? _year; - - public SearchAuthorComparer(string searchQuery) - { - SearchQuery = searchQuery; - - var match = Regex.Match(SearchQuery, @"^(?.+)\s+(?:\((?\d{4})\)|(?\d{4}))$"); - if (match.Success) - { - _searchQueryWithoutYear = match.Groups["query"].Value.ToLowerInvariant(); - _year = int.Parse(match.Groups["year"].Value); - } - else - { - _searchQueryWithoutYear = searchQuery.ToLowerInvariant(); - } - } - - public int Compare(Author x, Author y) - { - int result = 0; - - // Prefer exact matches - result = Compare(x, y, s => CleanPunctuation(s.Name).Equals(CleanPunctuation(SearchQuery))); - if (result != 0) - { - return -result; - } - - // Remove Articles (a/an/the) - result = Compare(x, y, s => CleanArticles(s.Name).Equals(CleanArticles(SearchQuery))); - if (result != 0) - { - return -result; - } - - // Prefer close matches - result = Compare(x, y, s => CleanPunctuation(s.Name).LevenshteinDistance(CleanPunctuation(SearchQuery)) <= 1); - if (result != 0) - { - return -result; - } - - return Compare(x, y, s => SearchQuery.LevenshteinDistanceClean(s.Name)); - } - - public int Compare(Author x, Author y, Func keySelector) - where T : IComparable - { - var keyX = keySelector(x); - var keyY = keySelector(y); - - return keyX.CompareTo(keyY); - } - - private string CleanPunctuation(string title) - { - title = RegexCleanPunctuation.Replace(title, ""); - - return title.ToLowerInvariant(); - } - - private string CleanTitle(string title) - { - title = RegexCleanPunctuation.Replace(title, ""); - title = 
RegexCleanCountryYearPostfix.Replace(title, ""); - - return title.ToLowerInvariant(); - } - - private string CleanArticles(string title) - { - title = ArticleRegex.Replace(title, ""); - - return title.Trim().ToLowerInvariant(); - } - } -}