You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
298 lines
9.2 KiB
298 lines
9.2 KiB
6 months ago
|
from __future__ import annotations
|
||
|
|
||
|
# built-in
|
||
|
from functools import reduce
|
||
|
from itertools import islice, permutations, repeat
|
||
|
from math import log
|
||
|
from typing import Sequence
|
||
|
|
||
|
# app
|
||
|
from .base import Base as _Base, BaseSimilarity as _BaseSimilarity
|
||
|
from .edit_based import DamerauLevenshtein
|
||
|
|
||
|
|
||
|
# Public API of this module: the algorithm classes first, then the
# ready-made default instances defined at the bottom of the file.
__all__ = [
    'Jaccard', 'Sorensen', 'Tversky',
    'Overlap', 'Cosine', 'Tanimoto', 'MongeElkan', 'Bag',

    # `dice` is defined alongside `sorensen` and `sorensen_dice`, so it is
    # exported together with them.
    'jaccard', 'sorensen', 'tversky', 'sorensen_dice', 'dice',
    'overlap', 'cosine', 'tanimoto', 'monge_elkan', 'bag',
]
|
||
|
|
||
|
|
||
|
class Jaccard(_BaseSimilarity):
    """
    Jaccard similarity between sequences of hashable items.

    The score is the size of the intersection of the sequences' counters
    divided by the size of their union. It is a float between 0 and 1,
    where 1 means equal and 0 means totally different.

    https://en.wikipedia.org/wiki/Jaccard_index
    https://github.com/Yomguithereal/talisman/blob/master/src/metrics/jaccard.js
    """

    def __init__(
        self,
        qval: int = 1,
        as_set: bool = False,
        external: bool = True,
    ) -> None:
        # These options are only stored here; they are not read anywhere in
        # this class (presumably the base-class helpers consume them --
        # confirm against the base module).
        self.qval = qval
        self.as_set = as_set
        self.external = external

    def maximum(self, *sequences: Sequence) -> int:
        """Return the largest possible similarity, which is always 1."""
        return 1

    def __call__(self, *sequences: Sequence) -> float:
        """Return ``|intersection| / |union|`` for the given sequences."""
        # Let the base class short-circuit when it already knows the answer.
        shortcut = self.quick_answer(*sequences)
        if shortcut is not None:
            return shortcut

        counters = self._get_counters(*sequences)
        shared = self._count_counters(self._intersect_counters(*counters))
        combined = self._count_counters(self._union_counters(*counters))
        return shared / combined
|
||
|
|
||
|
|
||
|
class Sorensen(_BaseSimilarity):
    """
    Compute the Sorensen-Dice similarity between the sequences.
    They should contain hashable items.
    The return value is a float between 0 and 1, where 1 means equal,
    and 0 totally different.

    The score is twice the size of the counters' intersection divided by
    the summed sizes of all counters.

    https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient
    https://github.com/Yomguithereal/talisman/blob/master/src/metrics/dice.js
    """

    def __init__(self, qval: int = 1, as_set: bool = False, external: bool = True) -> None:
        # These options are only stored here; they are not read anywhere in
        # this class (presumably the base-class helpers consume them --
        # confirm against the base module).
        self.qval = qval
        self.as_set = as_set
        self.external = external

    def maximum(self, *sequences: Sequence) -> int:
        """Return the largest possible similarity, which is always 1."""
        return 1

    def __call__(self, *sequences: Sequence) -> float:
        """Return ``2 * |intersection| / sum(|s| for s in sequences)``."""
        # Let the base class short-circuit when it already knows the answer.
        shortcut = self.quick_answer(*sequences)
        if shortcut is not None:
            return shortcut

        counters = self._get_counters(*sequences)
        total = sum(self._count_counters(counter) for counter in counters)
        shared = self._count_counters(self._intersect_counters(*counters))
        return 2.0 * shared / total
|
||
|
|
||
|
|
||
|
class Tversky(_BaseSimilarity):
    """Tversky index

    Without a bias (or for any number of sequences other than two) the
    score is ``i / (i + sum(k * (size - i)))``, where ``i`` is the size of
    the counters' intersection, ``size`` is the size of each counter and
    ``k`` is the weight paired with it from ``ks``.

    With exactly two sequences and a bias the score is
    ``c / (beta * (alpha * a + (1 - alpha) * b) + c)`` where ``a`` and ``b``
    are the smaller and larger counter sizes, ``c = i + bias`` and
    ``alpha, beta = ks``.

    https://en.wikipedia.org/wiki/Tversky_index
    https://github.com/Yomguithereal/talisman/blob/master/src/metrics/tversky.js
    """

    def __init__(
        self,
        qval: int = 1,
        ks: Sequence[float] | None = None,
        bias: float | None = None,
        as_set: bool = False,
        external: bool = True,
    ) -> None:
        self.qval = qval
        # A missing (or empty) ``ks`` means "weight every sequence by 1";
        # the endless iterator is cut to the right length on each call.
        self.ks = ks or repeat(1)
        self.bias = bias
        self.as_set = as_set
        self.external = external

    def maximum(self, *sequences: Sequence) -> int:
        """Return the largest possible similarity, which is always 1."""
        return 1

    def __call__(self, *sequences: Sequence) -> float:
        """Return the Tversky index of the given sequences.

        Returns 0.0 instead of raising ``ZeroDivisionError`` when the
        denominator is zero (e.g. all weights are 0 and nothing is shared).
        """
        # Let the base class short-circuit when it already knows the answer.
        quick_result = self.quick_answer(*sequences)
        if quick_result is not None:
            return quick_result

        counters = self._get_counters(*sequences)
        shared = self._count_counters(self._intersect_counters(*counters))
        sizes = [self._count_counters(counter) for counter in counters]
        # One weight per sequence; extra weights are ignored.
        ks = list(islice(self.ks, len(sizes)))

        if len(sizes) != 2 or self.bias is None:
            # Plain form: the intersection plus the weighted "leftover"
            # (non-shared) part of every sequence.
            numerator = shared
            denominator = shared
            for k, size in zip(ks, sizes):
                denominator += k * (size - shared)
        else:
            # Biased two-sequence form.
            first, second = sizes
            alpha, beta = ks
            a_val = min([first, second])
            b_val = max([first, second])
            numerator = shared + self.bias
            denominator = alpha * beta * (a_val - b_val) + b_val * beta + numerator

        if denominator == 0:
            # Avoid a division by zero; nothing measurable is shared.
            return 0.0
        return numerator / denominator
|
||
|
|
||
|
|
||
|
class Overlap(_BaseSimilarity):
    """overlap coefficient

    The size of the counters' intersection divided by the size of the
    smallest counter.

    https://en.wikipedia.org/wiki/Overlap_coefficient
    https://github.com/Yomguithereal/talisman/blob/master/src/metrics/overlap.js
    """

    def __init__(
        self,
        qval: int = 1,
        as_set: bool = False,
        external: bool = True,
    ) -> None:
        # These options are only stored here; they are not read anywhere in
        # this class (presumably the base-class helpers consume them --
        # confirm against the base module).
        self.qval = qval
        self.as_set = as_set
        self.external = external

    def maximum(self, *sequences: Sequence) -> int:
        """Return the largest possible similarity, which is always 1."""
        return 1

    def __call__(self, *sequences: Sequence) -> float:
        """Return ``|intersection| / min(|s| for s in sequences)``."""
        # Let the base class short-circuit when it already knows the answer.
        shortcut = self.quick_answer(*sequences)
        if shortcut is not None:
            return shortcut

        counters = self._get_counters(*sequences)
        shared = self._count_counters(self._intersect_counters(*counters))
        sizes = [self._count_counters(counter) for counter in counters]
        smallest = min(sizes)
        return shared / smallest
|
||
|
|
||
|
|
||
|
class Cosine(_BaseSimilarity):
    """cosine similarity (Ochiai coefficient)

    The size of the counters' intersection divided by the geometric mean
    of the counter sizes.

    https://en.wikipedia.org/wiki/Cosine_similarity
    https://github.com/Yomguithereal/talisman/blob/master/src/metrics/cosine.js
    """

    def __init__(
        self,
        qval: int = 1,
        as_set: bool = False,
        external: bool = True,
    ) -> None:
        # These options are only stored here; they are not read anywhere in
        # this class (presumably the base-class helpers consume them --
        # confirm against the base module).
        self.qval = qval
        self.as_set = as_set
        self.external = external

    def maximum(self, *sequences: Sequence) -> int:
        """Return the largest possible similarity, which is always 1."""
        return 1

    def __call__(self, *sequences: Sequence) -> float:
        """Return ``|intersection| / (product of sizes) ** (1 / n)``."""
        # Let the base class short-circuit when it already knows the answer.
        shortcut = self.quick_answer(*sequences)
        if shortcut is not None:
            return shortcut

        counters = self._get_counters(*sequences)
        shared = self._count_counters(self._intersect_counters(*counters))
        sizes = [self._count_counters(counter) for counter in counters]

        # Geometric mean of the sizes: n-th root of their product.
        product = reduce(lambda acc, size: acc * size, sizes)
        root = 1.0 / len(sizes)
        return shared / pow(product, root)
|
||
|
|
||
|
|
||
|
class Tanimoto(Jaccard):
    """Tanimoto distance

    The underlying ratio is the Jaccard similarity coefficient computed by
    the parent class (the Tversky index for alpha=1 and beta=1). This class
    does not return that ratio directly: it returns its base-2 logarithm,
    or negative infinity when the ratio is 0.
    """

    def __call__(self, *sequences: Sequence) -> float:
        """Return ``log2`` of the Jaccard score, or ``-inf`` if it is 0."""
        ratio = super().__call__(*sequences)
        if ratio == 0:
            # log(0) is undefined; use the limit value instead of raising.
            return float('-inf')
        return log(ratio, 2)
|
||
|
|
||
|
|
||
|
class MongeElkan(_BaseSimilarity):
    """
    Monge-Elkan similarity.

    Every element of the first sequence is matched with its most similar
    element in each of the other sequences, using an inner algorithm to
    score element pairs; the best scores are then averaged.

    https://www.academia.edu/200314/Generalized_Monge-Elkan_Method_for_Approximate_Text_String_Comparison
    http://www.cs.cmu.edu/~wcohen/postscript/kdd-2003-match-ws.pdf
    https://github.com/Yomguithereal/talisman/blob/master/src/metrics/monge-elkan.js
    """
    # Shared default inner algorithm, created once at class-definition time.
    _damerau_levenshtein = DamerauLevenshtein()

    def __init__(
        self,
        algorithm=_damerau_levenshtein,
        symmetric: bool = False,
        qval: int = 1,
        external: bool = True,
    ) -> None:
        # `algorithm` must provide `maximum(...)` and `similarity(a, b)`,
        # the two methods this class calls on it.
        self.algorithm = algorithm
        self.symmetric = symmetric
        self.qval = qval
        self.external = external

    def maximum(self, *sequences: Sequence) -> float:
        """Return the largest `algorithm.maximum` value found for the input."""
        # NOTE(review): the tuple of sequences is passed as ONE argument
        # here, while the loop below unpacks each sequence -- confirm this
        # asymmetry is intended.
        best = self.algorithm.maximum(sequences)
        for seq in sequences:
            if not seq:
                continue
            best = max(best, self.algorithm.maximum(*seq))
        return best

    def _calc(self, seq, *sequences: Sequence) -> float:
        """Score `seq` against every sequence in `sequences`."""
        if not seq:
            return 0

        floor = float('-inf')
        # One entry per (element of seq, other sequence) pair: the best
        # similarity of that element to any element of the other sequence,
        # or -inf when the other sequence is empty.
        maxes = [
            max([floor, *(self.algorithm.similarity(c1, c2) for c2 in other)])
            for c1 in seq
            for other in sequences
        ]
        return sum(maxes) / len(seq) / len(maxes)

    def __call__(self, *sequences: Sequence) -> float:
        """Return the Monge-Elkan similarity of the given sequences.

        In symmetric mode the score is averaged over every ordering of the
        sequences; otherwise the first sequence is scored against the rest.
        """
        # Let the base class short-circuit when it already knows the answer.
        quick_result = self.quick_answer(*sequences)
        if quick_result is not None:
            return quick_result
        sequences = self._get_sequences(*sequences)

        if not self.symmetric:
            return self._calc(*sequences)

        scores = [self._calc(*ordering) for ordering in permutations(sequences)]
        return sum(scores) / len(scores)
|
||
|
|
||
|
|
||
|
class Bag(_Base):
    """Bag distance

    For each sequence, count the items left after removing what all
    sequences share; the distance is the largest such count.

    https://github.com/Yomguithereal/talisman/blob/master/src/metrics/bag.js
    """

    def __call__(self, *sequences: Sequence) -> float:
        """Return the largest number of non-shared items in any sequence."""
        counters = self._get_counters(*sequences)
        common = self._intersect_counters(*counters)
        leftovers = [self._count_counters(counter - common) for counter in counters]
        return max(leftovers)
|
||
|
|
||
|
|
||
|
# Ready-to-use instances created with each algorithm's default settings.
bag = Bag()
cosine = Cosine()
jaccard = Jaccard()
monge_elkan = MongeElkan()
overlap = Overlap()
tanimoto = Tanimoto()
tversky = Tversky()

# The Sorensen-Dice coefficient is exposed under three names; each name is
# bound to its own Sorensen instance.
# NOTE: `sorensen_dice = Tversky(ks=[.5, .5])` was sketched here as an
# alternative definition but is not used.
dice = Sorensen()
sorensen = Sorensen()
sorensen_dice = Sorensen()
|