bazarr/libs/textdistance/algorithms/sequence_based.py

from __future__ import annotations

# built-in
from difflib import SequenceMatcher as _SequenceMatcher
from typing import Any

# app
from ..utils import find_ngrams
from .base import BaseSimilarity as _BaseSimilarity
from .types import TestFunc


try:
    # external
    import numpy
except ImportError:
    # built-in
    from array import array
    numpy = None  # type: ignore[assignment]


__all__ = [
    'lcsseq', 'lcsstr', 'ratcliff_obershelp',
    'LCSSeq', 'LCSStr', 'RatcliffObershelp',
]


class LCSSeq(_BaseSimilarity):
    """longest common subsequence similarity

    https://en.wikipedia.org/wiki/Longest_common_subsequence_problem
    """

    def __init__(
        self,
        qval: int = 1,
        test_func: TestFunc = None,
        external: bool = True,
    ) -> None:
        self.qval = qval
        self.test_func = test_func or self._ident
        self.external = external

    def _dynamic(self, seq1: str, seq2: str) -> str:
        """
        https://github.com/chrislit/abydos/blob/master/abydos/distance/_lcsseq.py
        http://www.dis.uniroma1.it/~bonifaci/algo/LCSSEQ.py
        http://rosettacode.org/wiki/Longest_common_subsequence#Dynamic_Programming_8
        """
        lengths: Any
        if numpy:
            lengths = numpy.zeros((len(seq1) + 1, len(seq2) + 1), dtype=int)
        else:
            lengths = [array('L', [0] * (len(seq2) + 1)) for _ in range(len(seq1) + 1)]

        # row 0 and column 0 are initialized to 0 already
        for i, char1 in enumerate(seq1):
            for j, char2 in enumerate(seq2):
                if char1 == char2:
                    lengths[i + 1][j + 1] = lengths[i][j] + 1
                else:
                    lengths[i + 1][j + 1] = max(lengths[i + 1][j], lengths[i][j + 1])

        # read the substring out from the matrix
        result = ''
        i, j = len(seq1), len(seq2)
        while i != 0 and j != 0:
            if lengths[i][j] == lengths[i - 1][j]:
                i -= 1
            elif lengths[i][j] == lengths[i][j - 1]:
                j -= 1
            else:
                assert seq1[i - 1] == seq2[j - 1]
                result = seq1[i - 1] + result
                i -= 1
                j -= 1
        return result

    def _recursive(self, *sequences: str) -> str:
        if not all(sequences):
            return type(sequences[0])()  # empty sequence
        if self.test_func(*[s[-1] for s in sequences]):
            c = sequences[0][-1]
            sequences = tuple(s[:-1] for s in sequences)
            return self(*sequences) + c
        m = type(sequences[0])()  # empty sequence
        for i, s in enumerate(sequences):
            ss = sequences[:i] + (s[:-1], ) + sequences[i + 1:]
            m = max([self(*ss), m], key=len)
        return m

    def __call__(self, *sequences: str) -> str:
        if not sequences:
            return ''
        sequences = self._get_sequences(*sequences)
        if len(sequences) == 2:
            return self._dynamic(*sequences)
        else:
            return self._recursive(*sequences)

    def similarity(self, *sequences) -> int:
        return len(self(*sequences))


class LCSStr(_BaseSimilarity):
    """longest common substring similarity
    """

    def _standart(self, s1: str, s2: str) -> str:
        matcher = _SequenceMatcher(a=s1, b=s2)
        match = matcher.find_longest_match(0, len(s1), 0, len(s2))
        return s1[match.a: match.a + match.size]

    def _custom(self, *sequences: str) -> str:
        short = min(sequences, key=len)
        length = len(short)
        for n in range(length, 0, -1):
            for subseq in find_ngrams(short, n):
                joined = ''.join(subseq)
                for seq in sequences:
                    if joined not in seq:
                        break
                else:
                    return joined
        return type(short)()  # empty sequence

    def __call__(self, *sequences: str) -> str:
        if not all(sequences):
            return ''
        length = len(sequences)
        if length == 0:
            return ''
        if length == 1:
            return sequences[0]

        sequences = self._get_sequences(*sequences)
        if length == 2 and max(map(len, sequences)) < 200:
            return self._standart(*sequences)
        return self._custom(*sequences)

    def similarity(self, *sequences: str) -> int:
        return len(self(*sequences))


class RatcliffObershelp(_BaseSimilarity):
    """Ratcliff-Obershelp similarity
    This follows the Ratcliff-Obershelp algorithm to derive a similarity
    measure:
        1. Find the length of the longest common substring in sequences.
        2. Recurse on the strings to the left & right of each this substring
           in sequences. The base case is a 0 length common substring, in which
           case, return 0. Otherwise, return the sum of the current longest
           common substring and the left & right recursed sums.
        3. Multiply this length by 2 and divide by the sum of the lengths of
           sequences.

    https://en.wikipedia.org/wiki/Gestalt_Pattern_Matching
    https://github.com/Yomguithereal/talisman/blob/master/src/metrics/ratcliff-obershelp.js
    https://xlinux.nist.gov/dads/HTML/ratcliffObershelp.html
    """

    def maximum(self, *sequences: str) -> int:
        return 1

    def _find(self, *sequences: str) -> int:
        subseq = LCSStr()(*sequences)
        length = len(subseq)
        if length == 0:
            return 0
        before = [s[:s.find(subseq)] for s in sequences]
        after = [s[s.find(subseq) + length:] for s in sequences]
        return self._find(*before) + length + self._find(*after)

    def __call__(self, *sequences: str) -> float:
        result = self.quick_answer(*sequences)
        if result is not None:
            return result
        scount = len(sequences)  # sequences count
        ecount = sum(map(len, sequences))  # elements count
        sequences = self._get_sequences(*sequences)
        return scount * self._find(*sequences) / ecount


lcsseq = LCSSeq()
lcsstr = LCSStr()
ratcliff_obershelp = RatcliffObershelp()
Improved global search function * Use Hamming textdistance library Used Hamming textdistance to sort by closest match. * Global search UI improvements Increased dropdown height to show more results initially (and which can also be scrolled into view). Scrollbars will appear automatically as needed. Remove dropdown when Search box is cleared. * Added textdistance 4.6.2 library 6 months ago			`from __future__ import annotations`

			`# built-in`
			`from difflib import SequenceMatcher as _SequenceMatcher`
			`from typing import Any`

			`# app`
			`from ..utils import find_ngrams`
			`from .base import BaseSimilarity as _BaseSimilarity`
			`from .types import TestFunc`


			`try:`
			`# external`
			`import numpy`
			`except ImportError:`
			`# built-in`
			`from array import array`
			`numpy = None # type: ignore[assignment]`


			`__all__ = [`
			`'lcsseq', 'lcsstr', 'ratcliff_obershelp',`
			`'LCSSeq', 'LCSStr', 'RatcliffObershelp',`
			`]`


			`class LCSSeq(_BaseSimilarity):`
			`"""longest common subsequence similarity`

			`https://en.wikipedia.org/wiki/Longest_common_subsequence_problem`
			`"""`

			`def __init__(`
			`self,`
			`qval: int = 1,`
			`test_func: TestFunc = None,`
			`external: bool = True,`
			`) -> None:`
			`self.qval = qval`
			`self.test_func = test_func or self._ident`
			`self.external = external`

			`def _dynamic(self, seq1: str, seq2: str) -> str:`
			`"""`
			`https://github.com/chrislit/abydos/blob/master/abydos/distance/_lcsseq.py`
			`http://www.dis.uniroma1.it/~bonifaci/algo/LCSSEQ.py`
			`http://rosettacode.org/wiki/Longest_common_subsequence#Dynamic_Programming_8`
			`"""`
			`lengths: Any`
			`if numpy:`
			`lengths = numpy.zeros((len(seq1) + 1, len(seq2) + 1), dtype=int)`
			`else:`
			`lengths = [array('L', [0] * (len(seq2) + 1)) for _ in range(len(seq1) + 1)]`

			`# row 0 and column 0 are initialized to 0 already`
			`for i, char1 in enumerate(seq1):`
			`for j, char2 in enumerate(seq2):`
			`if char1 == char2:`
			`lengths[i + 1][j + 1] = lengths[i][j] + 1`
			`else:`
			`lengths[i + 1][j + 1] = max(lengths[i + 1][j], lengths[i][j + 1])`

			`# read the substring out from the matrix`
			`result = ''`
			`i, j = len(seq1), len(seq2)`
			`while i != 0 and j != 0:`
			`if lengths[i][j] == lengths[i - 1][j]:`
			`i -= 1`
			`elif lengths[i][j] == lengths[i][j - 1]:`
			`j -= 1`
			`else:`
			`assert seq1[i - 1] == seq2[j - 1]`
			`result = seq1[i - 1] + result`
			`i -= 1`
			`j -= 1`
			`return result`

			`def _recursive(self, *sequences: str) -> str:`
			`if not all(sequences):`
			`return type(sequences[0])() # empty sequence`
			`if self.test_func(*[s[-1] for s in sequences]):`
			`c = sequences[0][-1]`
			`sequences = tuple(s[:-1] for s in sequences)`
			`return self(*sequences) + c`
			`m = type(sequences[0])() # empty sequence`
			`for i, s in enumerate(sequences):`
			`ss = sequences[:i] + (s[:-1], ) + sequences[i + 1:]`
			`m = max([self(*ss), m], key=len)`
			`return m`

			`def __call__(self, *sequences: str) -> str:`
			`if not sequences:`
			`return ''`
			`sequences = self._get_sequences(*sequences)`
			`if len(sequences) == 2:`
			`return self._dynamic(*sequences)`
			`else:`
			`return self._recursive(*sequences)`

			`def similarity(self, *sequences) -> int:`
			`return len(self(*sequences))`


			`class LCSStr(_BaseSimilarity):`
			`"""longest common substring similarity`
			`"""`

			`def _standart(self, s1: str, s2: str) -> str:`
			`matcher = _SequenceMatcher(a=s1, b=s2)`
			`match = matcher.find_longest_match(0, len(s1), 0, len(s2))`
			`return s1[match.a: match.a + match.size]`

			`def _custom(self, *sequences: str) -> str:`
			`short = min(sequences, key=len)`
			`length = len(short)`
			`for n in range(length, 0, -1):`
			`for subseq in find_ngrams(short, n):`
			`joined = ''.join(subseq)`
			`for seq in sequences:`
			`if joined not in seq:`
			`break`
			`else:`
			`return joined`
			`return type(short)() # empty sequence`

			`def __call__(self, *sequences: str) -> str:`
			`if not all(sequences):`
			`return ''`
			`length = len(sequences)`
			`if length == 0:`
			`return ''`
			`if length == 1:`
			`return sequences[0]`

			`sequences = self._get_sequences(*sequences)`
			`if length == 2 and max(map(len, sequences)) < 200:`
			`return self._standart(*sequences)`
			`return self._custom(*sequences)`

			`def similarity(self, *sequences: str) -> int:`
			`return len(self(*sequences))`


			`class RatcliffObershelp(_BaseSimilarity):`
			`"""Ratcliff-Obershelp similarity`
			`This follows the Ratcliff-Obershelp algorithm to derive a similarity`
			`measure:`
			`1. Find the length of the longest common substring in sequences.`
			`2. Recurse on the strings to the left & right of each this substring`
			`in sequences. The base case is a 0 length common substring, in which`
			`case, return 0. Otherwise, return the sum of the current longest`
			`common substring and the left & right recursed sums.`
			`3. Multiply this length by 2 and divide by the sum of the lengths of`
			`sequences.`

			`https://en.wikipedia.org/wiki/Gestalt_Pattern_Matching`
			`https://github.com/Yomguithereal/talisman/blob/master/src/metrics/ratcliff-obershelp.js`
			`https://xlinux.nist.gov/dads/HTML/ratcliffObershelp.html`
			`"""`

			`def maximum(self, *sequences: str) -> int:`
			`return 1`

			`def _find(self, *sequences: str) -> int:`
			`subseq = LCSStr()(*sequences)`
			`length = len(subseq)`
			`if length == 0:`
			`return 0`
			`before = [s[:s.find(subseq)] for s in sequences]`
			`after = [s[s.find(subseq) + length:] for s in sequences]`
			`return self._find(before) + length + self._find(after)`

			`def __call__(self, *sequences: str) -> float:`
			`result = self.quick_answer(*sequences)`
			`if result is not None:`
			`return result`
			`scount = len(sequences) # sequences count`
			`ecount = sum(map(len, sequences)) # elements count`
			`sequences = self._get_sequences(*sequences)`
			`return scount * self._find(*sequences) / ecount`


			`lcsseq = LCSSeq()`
			`lcsstr = LCSStr()`
			`ratcliff_obershelp = RatcliffObershelp()`