You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
138 lines
3.4 KiB
138 lines
3.4 KiB
import sys
|
|
from difflib import SequenceMatcher
|
|
from random import randint
|
|
|
|
|
|
# True when the running interpreter's major version is exactly 3. Used below
# to decide whether range() results must be wrapped in list() before being
# stored in a variable typed as `list`.
IS_PY3K = sys.version_info[0] == 3
|
|
|
|
# String UDF.
|
|
def damerau_levenshtein_dist(s1, s2):
    """Return the edit distance between s1 and s2, allowing transpositions.

    Counts single-item insertions, deletions and substitutions, plus swaps
    of two adjacent items, needed to turn s1 into s2.

    Only three rows of the cost table are kept alive at a time. Each row has
    len(s2) + 1 slots; the *last* slot stores the cost for the empty prefix
    of s2, so that index ``col - 1`` with ``col == 0`` (i.e. ``-1``) wraps
    around to it.
    """
    cdef:
        int row, col, removal, insertion, replacement
        int len1 = len(s1), len2 = len(s2)
        list prev_row, prev_prev_row, this_row
        list blank_row = [0] * (len2 + 1)
        bint differs

    # Row for the empty prefix of s1: costs 1..len2, with 0 in the
    # wrap-around slot at the end.
    this_row = list(range(1, len2 + 2))
    this_row[-1] = 0
    prev_row = None

    for row in range(len1):
        prev_prev_row, prev_row = prev_row, this_row
        this_row = blank_row[:]
        this_row[-1] = row + 1
        item1 = s1[row]

        for col in range(len2):
            item2 = s2[col]
            differs = item1 != item2

            removal = prev_row[col] + 1
            insertion = this_row[col - 1] + 1
            replacement = prev_row[col - 1] + differs
            this_row[col] = min(removal, insertion, replacement)

            # Handle transpositions: the current pair is the previous pair
            # swapped, so one swap on top of the cost two rows/columns back.
            if (differs and row > 0 and col > 0
                    and item1 == s2[col - 1] and s1[row - 1] == item2):
                this_row[col] = min(this_row[col], prev_prev_row[col - 2] + 1)

    return this_row[len2 - 1]
|
|
|
|
# String UDF.
|
|
def levenshtein_dist(a, b):
    """Return the Levenshtein edit distance between a and b.

    Counts the single-item insertions, deletions and substitutions needed to
    turn one sequence into the other. The inputs are swapped if necessary so
    that the inner loop always walks the shorter one, and only two rows of
    the cost table are kept at a time.
    """
    cdef:
        int from_above, from_left, from_diagonal
        int outer, inner
        int short_len = len(a), long_len = len(b)
        list row_now, row_before
        list blank_row

    # Make `a` the shorter sequence; the distance is symmetric.
    if short_len > long_len:
        a, b = b, a
        short_len, long_len = long_len, short_len

    blank_row = [0] * (long_len + 1)

    # Cost of building each prefix of `a` from an empty prefix of `b`.
    row_now = list(range(short_len + 1))

    for outer in range(1, long_len + 1):
        row_before = row_now
        row_now = blank_row[:]
        row_now[0] = outer

        for inner in range(1, short_len + 1):
            from_above = row_before[inner] + 1
            from_left = row_now[inner - 1] + 1
            if a[inner - 1] != b[outer - 1]:
                from_diagonal = row_before[inner - 1] + 1
            else:
                from_diagonal = row_before[inner - 1]
            row_now[inner] = min(from_above, from_left, from_diagonal)

    return row_now[short_len]
|
|
|
|
# String UDF.
|
|
def str_dist(a, b):
    """Return a difflib-based distance between a and b.

    Runs SequenceMatcher over the two inputs and, for every opcode that is
    not 'equal', adds the length of the longer of the two affected spans.
    Identical inputs therefore give 0.
    """
    cdef:
        int total = 0

    opcodes = SequenceMatcher(None, a, b).get_opcodes()
    for tag, a_start, a_end, b_start, b_end in opcodes:
        if tag != 'equal':
            total += max(b_end - b_start, a_end - a_start)
    return total
|
|
|
|
# Math Aggregate.
|
|
cdef class median(object):
    """Aggregate that collects values via step() and picks a median.

    Values are buffered in a list; finalize() uses randomized quickselect to
    find the item that would sit at index ``count // 2`` if the values were
    sorted (the upper median for an even count). The buffer is reordered in
    place while selecting.

    NOTE(review): the step()/finalize() shape looks like the protocol for a
    SQLite user-defined aggregate -- confirm against the registration code.
    """
    cdef:
        int ct       # Number of values received so far.
        list items   # The values themselves, in arrival order until finalize.

    def __init__(self):
        self.ct = 0
        self.items = []

    cdef selectKth(self, int k, int s=0, int e=-1):
        """Return the item that belongs at sorted position k.

        Searches the half-open range [s, e) of ``self.items``; a negative
        ``e`` means "up to the end of the list". Implemented as a loop
        rather than recursion: partition_k uses a strict ``<`` comparison,
        so with many equal values the range may shrink by only one item per
        step, and a recursive version would nest one call per step.
        """
        cdef:
            int idx
        if e < 0:
            e = len(self.items)
        while True:
            idx = randint(s, e - 1)
            idx = self.partition_k(idx, s, e)
            if idx > k:
                # Target lies left of the pivot.
                e = idx
            elif idx < k:
                # Target lies right of the pivot.
                s = idx + 1
            else:
                return self.items[idx]

    cdef int partition_k(self, int pi, int s, int e):
        """Partition items[s:e] around items[pi]; return the pivot's new index.

        After the call every item in [s, result) compares less than the
        pivot value and the pivot itself sits at ``result``.
        """
        cdef:
            int i, x
            list items = self.items

        val = items[pi]
        # Swap pivot w/last item so it stays out of the way while scanning.
        items[e - 1], items[pi] = items[pi], items[e - 1]
        x = s
        for i in range(s, e):
            if items[i] < val:
                items[i], items[x] = items[x], items[i]
                x += 1
        # Move the pivot into its final slot.
        items[x], items[e - 1] = items[e - 1], items[x]
        return x

    def step(self, item):
        """Record one more value."""
        self.items.append(item)
        self.ct += 1

    def finalize(self):
        """Return the selected median, or None if no values were recorded.

        With one or two values the first recorded value is returned as-is.
        """
        if self.ct == 0:
            return None
        elif self.ct < 3:
            return self.items[0]
        else:
            # Floor division: selectKth takes a C int index, and `/` is true
            # division under Python 3 semantics.
            return self.selectKth(self.ct // 2)
|