|
|
|
#!/usr/bin/env python
|
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
# pylint: disable=pointless-statement, missing-docstring, no-member, len-as-condition
|
|
|
|
|
|
|
|
from ..rebulk import Rebulk
|
|
|
|
from ..rules import Rule
|
|
|
|
from . import rebulk_rules_module as rm
|
|
|
|
|
|
|
|
|
|
|
|
def test_rebulk_simple():
    """Check that a string, a regex and a functional pattern each yield one match."""

    def func(input_string):
        # Span of the first "over" occurrence; nothing when it is absent.
        start = input_string.find("over")
        if start == -1:
            return None
        return start, start + len("over")

    # Patterns are registered in the same order the assertions expect below.
    rebulk = Rebulk().string("quick").regex("f.x").functional(func)

    found = rebulk.matches("The quick brown fox jumps over the lazy dog")

    assert len(found) == 3
    assert [match.value for match in found] == ["quick", "fox", "over"]
|
|
|
|
|
|
|
|
|
|
|
|
def test_rebulk_composition():
    """Check matches of a Rebulk composed of a string pattern and two child Rebulk objects."""
    regex_child = Rebulk().regex("f.x")
    # This child is built with an always-true ``disabled`` callable and a
    # functional that returns None; the test expects nothing from it.
    disabled_child = Rebulk(disabled=lambda context: True).functional(lambda string: None)

    rebulk = Rebulk()
    rebulk.string("quick")
    rebulk.rebulk(regex_child)
    rebulk.rebulk(disabled_child)

    found = rebulk.matches("The quick brown fox jumps over the lazy dog")

    assert len(found) == 2
    first, second = found[0], found[1]
    assert first.value == "quick"
    assert second.value == "fox"
|
|
|
|
|
|
|
|
|
|
|
|
def test_rebulk_context():
    """Check that the context given to ``matches`` drives ``disabled`` callables and functionals."""

    def func(input_string, context):
        # The searched word is read from the context, "over" being the fallback.
        word = context.get('word', 'over')
        start = input_string.find(word)
        if start == -1:
            return None
        return start, start + len(word)

    rebulk = Rebulk()
    # 'nostring' is set in the context used below while 'noregex' is not.
    rebulk.string("quick", disabled=lambda context: context.get('nostring', False))
    rebulk.regex("f.x", disabled=lambda context: context.get('noregex', False))
    rebulk.functional(func)

    options = {'nostring': True, 'word': 'lazy'}
    found = rebulk.matches("The quick brown fox jumps over the lazy dog", options)

    assert len(found) == 2
    assert [match.value for match in found] == ["fox", "lazy"]
|
|
|
|
|
|
|
|
|
|
|
|
def test_rebulk_prefer_longer():
    """Expect "own" to be absent from the result while the longer "brown" is kept."""
    rebulk = Rebulk().string("quick").string("own").regex("br.{2}n")

    found = rebulk.matches("The quick brown fox jumps over the lazy dog")

    assert len(found) == 2
    first, second = found[0], found[1]
    assert first.value == "quick"
    assert second.value == "brown"
|
|
|
|
|
|
|
|
|
|
|
|
def test_rebulk_defaults():
    """Check names and tags produced by per-pattern-type defaults and by global defaults."""
    sentence = "The quick brown fox jumps over the lazy dog"

    def func(input_string):
        # Span of the first "fox" occurrence; nothing when it is absent.
        start = input_string.find("fox")
        if start == -1:
            return None
        return start, start + len("fox")

    # First scenario: defaults are declared per pattern type only.
    typed_only = (
        Rebulk()
        .string_defaults(name="string", tags=["a", "b"])
        .regex_defaults(name="regex")
        .functional_defaults(name="functional")
        .string("quick", tags=["c"])
        .functional(func)
        .regex("br.{2}n")
    )
    matches = typed_only.matches(sentence)

    string_match, functional_match, regex_match = matches[0], matches[1], matches[2]
    assert string_match.name == "string"
    assert string_match.tags == ["a", "b", "c"]
    assert functional_match.name == "functional"
    assert regex_match.name == "regex"

    # Second scenario: global defaults are declared too, and the regex
    # pattern has no defaults of its own.
    with_global = (
        Rebulk()
        .defaults(name="default", tags=["0"])
        .string_defaults(name="string", tags=["a", "b"])
        .functional_defaults(name="functional", tags=["1"])
        .string("quick", tags=["c"])
        .functional(func)
        .regex("br.{2}n")
    )
    matches = with_global.matches(sentence)

    string_match, functional_match, regex_match = matches[0], matches[1], matches[2]
    assert string_match.name == "string"
    assert string_match.tags == ["0", "a", "b", "c"]
    assert functional_match.name == "functional"
    assert functional_match.tags == ["0", "1"]
    assert regex_match.name == "default"
    assert regex_match.tags == ["0"]
|
|
|
|
|
|
|
|
|
|
|
|
def test_rebulk_defaults_overrides():
    """Check that ``overrides=["tags"]`` leaves only the pattern's own tags on its match."""
    sentence = "The quick brown fox jumps over the lazy dog"

    def func(input_string):
        # Span of the first "fox" occurrence; nothing when it is absent.
        start = input_string.find("fox")
        if start == -1:
            return None
        return start, start + len("fox")

    # First scenario: the string pattern asks to override the default tags.
    overriding = (
        Rebulk()
        .string_defaults(name="string", tags=["a", "b"])
        .regex_defaults(name="regex", tags=["d"])
        .functional_defaults(name="functional")
        .string("quick", tags=["c"], overrides=["tags"])
        .functional(func)
        .regex("br.{2}n")
    )
    matches = overriding.matches(sentence)

    string_match, functional_match, regex_match = matches[0], matches[1], matches[2]
    assert string_match.name == "string"
    assert string_match.tags == ["c"]
    assert functional_match.name == "functional"
    assert regex_match.name == "regex"
    assert regex_match.tags == ["d"]

    # Second scenario: no override is requested, so the expected tags are
    # the combined lists asserted below.
    combining = (
        Rebulk()
        .defaults(name="default", tags=["0"])
        .string_defaults(name="string", tags=["a", "b"])
        .functional_defaults(name="functional", tags=["1"])
        .string("quick", tags=["c"])
        .functional(func)
        .regex("br.{2}n")
    )
    matches = combining.matches(sentence)

    string_match, functional_match, regex_match = matches[0], matches[1], matches[2]
    assert string_match.name == "string"
    assert string_match.tags == ["0", "a", "b", "c"]
    assert functional_match.name == "functional"
    assert functional_match.tags == ["0", "1"]
    assert regex_match.name == "default"
    assert regex_match.tags == ["0"]
|
|
|
|
|
|
|
|
|
|
|
|
def test_rebulk_rebulk():
    """Expect a parent Rebulk holding a nested child Rebulk to yield "quick" and "brown"."""
    parent = Rebulk().string("quick")
    nested = Rebulk().string("own").regex("br.{2}n")

    found = parent.rebulk(nested).matches("The quick brown fox jumps over the lazy dog")

    assert len(found) == 2
    assert [match.value for match in found] == ["quick", "brown"]
|
|
|
|
|
|
|
|
|
|
|
|
def test_rebulk_no_default():
    """Expect "own" and "brown" to both be returned when ``default_rules=False``."""
    rebulk = Rebulk(default_rules=False)
    rebulk.string("quick").string("own").regex("br.{2}n")

    found = rebulk.matches("The quick brown fox jumps over the lazy dog")

    assert len(found) == 3
    assert [match.value for match in found] == ["quick", "own", "brown"]
|
|
|
|
|
|
|
|
|
|
|
|
def test_rebulk_empty_match():
    """Expect no match from a ``children=True`` regex whose only group captures nothing."""
    rebulk = Rebulk(default_rules=False).string("quick").string("own")
    # In "brown" the lazy group between "br" and "own" captures an empty string.
    rebulk = rebulk.regex("br(.*?)own", children=True)

    found = rebulk.matches("The quick brown fox jumps over the lazy dog")

    assert len(found) == 2
    first, second = found[0], found[1]
    assert first.value == "quick"
    assert second.value == "own"
|
|
|
|
|
|
|
|
|
|
|
|
def test_rebulk_tags_names():
    """Check ``named`` and ``tagged`` lookups, including tags returned by functionals."""

    def func(input_string):
        # Tuple result: start, end, then a dict carrying the 'custom' tag.
        start = input_string.find("over")
        if start == -1:
            return None
        return start, start + len("over"), {'tags': ['custom']}

    def func2(input_string):
        # Dict result carrying the span together with the 'custom' tag.
        start = input_string.find("lazy")
        if start == -1:
            return None
        return {'start': start, 'end': start + len("lazy"), 'tags': ['custom']}

    rebulk = Rebulk()
    rebulk.string("quick", name="str", tags=["first", "other"])
    rebulk.regex("f.x", tags="other")
    rebulk.functional(func, name="fn")
    rebulk.functional(func2, name="fn")

    found = rebulk.matches("The quick brown fox jumps over the lazy dog")
    assert len(found) == 4

    # Expected number of matches for each name, then for each tag.
    for name, expected in (("str", 1), ("fn", 2), ("false", 0)):
        assert len(found.named(name)) == expected
    for tag, expected in (("false", 0), ("first", 1), ("other", 2), ("custom", 2)):
        assert len(found.tagged(tag)) == expected
|
|
|
|
|
|
|
|
|
|
|
|
def test_rebulk_rules_1():
    """Expect only the last year match to remain with ``rm.RemoveAllButLastYear`` registered."""
    rebulk = Rebulk().regex(r'\d{4}', name="year").rules(rm.RemoveAllButLastYear)

    found = rebulk.matches("1984 keep only last 1968 entry 1982 case")

    assert len(found) == 1
    assert [match.value for match in found] == ["1982"]
|
|
|
|
|
|
|
|
|
|
|
|
def test_rebulk_rules_2():
    """Expect "1984" and "1968" to remain with ``rm.PrefixedSuffixedYear`` registered."""
    rebulk = (
        Rebulk()
        .regex(r'\d{4}', name="year")
        .string(r'year', name="yearPrefix", private=True)
        .string(r'keep', name="yearSuffix", private=True)
        .rules(rm.PrefixedSuffixedYear)
    )

    found = rebulk.matches("Keep suffix 1984 keep prefixed year 1968 and remove the rest 1982")

    assert len(found) == 2
    assert [match.value for match in found] == ["1984", "1968"]
|
|
|
|
|
|
|
|
|
|
|
|
def test_rebulk_rules_3():
    """Expect "1984" and "1968" to remain with ``rm.PrefixedSuffixedYearNoLambda`` registered."""
    rebulk = (
        Rebulk()
        .regex(r'\d{4}', name="year")
        .string(r'year', name="yearPrefix", private=True)
        .string(r'keep', name="yearSuffix", private=True)
        .rules(rm.PrefixedSuffixedYearNoLambda)
    )

    found = rebulk.matches("Keep suffix 1984 keep prefixed year 1968 and remove the rest 1982")

    assert len(found) == 2
    first, second = found[0], found[1]
    assert first.value == "1984"
    assert second.value == "1968"
|
|
|
|
|
|
|
|
|
|
|
|
def test_rebulk_rules_4():
    """Check a locally defined rule that removes the "grabbed" match when it has a previous match."""

    class FirstOnlyRule(Rule):
        """Select the first "grabbed" match for removal when ``matches.previous`` finds something."""

        def when(self, matches, context):
            candidate = matches.named("grabbed", 0)
            if not candidate:
                return None
            return candidate if matches.previous(candidate) else None

        def then(self, matches, when_response, context):
            matches.remove(when_response)

    rebulk = Rebulk()
    rebulk.regex("This match (.*?)grabbed", name="grabbed")
    rebulk.regex("if it's (.*?)first match", private=True)
    rebulk.rules(FirstOnlyRule)

    # The "grabbed" text comes first in this input: it is expected to be kept.
    found = rebulk.matches("This match is grabbed only if it's the first match")
    assert len(found) == 1
    assert found[0].value == "This match is grabbed"

    # Here the private pattern's text comes first: nothing is expected back.
    found = rebulk.matches("if it's NOT the first match, This match is NOT grabbed")
    assert len(found) == 0
|
|
|
|
|
|
|
|
|
|
|
|
class TestMarkers:
    """Tests of rules that inspect or remove marker matches (patterns declared with ``marker=True``)."""

    def test_one_marker(self):
        """Expect "word" to be kept only when a "mark1" marker is found at its match."""

        class MarkerRule(Rule):
            """Select the "word" match for removal when no "mark1" marker is found at it."""

            def when(self, matches, context):
                word = matches.named("word", 0)
                enclosing = matches.markers.at_match(word, lambda marker: marker.name == "mark1", 0)
                if enclosing:
                    return None
                return word

            def then(self, matches, when_response, context):
                matches.remove(when_response)

        rebulk = (
            Rebulk()
            .regex(r'\(.*?\)', marker=True, name="mark1")
            .regex(r'\[.*?\]', marker=True, name="mark2")
            .string("word", name="word")
            .rules(MarkerRule)
        )

        found = rebulk.matches("grab (word) only if it's in parenthesis")
        assert len(found) == 1
        assert found[0].value == "word"

        found = rebulk.matches("don't grab [word] if it's in braket")
        assert len(found) == 0

        found = rebulk.matches("don't grab word at all")
        assert len(found) == 0

    def test_multiple_marker(self):
        """Expect the word to be kept only when both "mark1" and "mark2" markers are found at it."""

        class MarkerRule(Rule):
            """Select the "word" match for removal when fewer than two markers are found at it."""

            def when(self, matches, context):
                word = matches.named("word", 0)
                enclosing = matches.markers.at_match(
                    word, lambda marker: marker.name in ["mark1", "mark2"])
                if len(enclosing) < 2:
                    return word
                return None

            def then(self, matches, when_response, context):
                matches.remove(when_response)

        rebulk = (
            Rebulk()
            .regex(r'\(.*?\)', marker=True, name="mark1")
            .regex(r'\[.*?\]', marker=True, name="mark2")
            .regex("w.*?d", name="word")
            .rules(MarkerRule)
        )

        found = rebulk.matches("[grab (word) only] if it's in parenthesis and brakets")
        assert len(found) == 1
        assert found[0].value == "word"

        found = rebulk.matches("[don't grab](word)[if brakets are outside]")
        assert len(found) == 0

        found = rebulk.matches("(grab w[or)d even] if it's partially in parenthesis and brakets")
        assert len(found) == 1
        assert found[0].value == "w[or)d"

    def test_at_index_marker(self):
        """Expect the word to be kept only when a "mark1" marker is found at its start index."""

        class MarkerRule(Rule):
            """Select the "word" match for removal when no "mark1" marker is found at its start."""

            def when(self, matches, context):
                word = matches.named("word", 0)
                enclosing = matches.markers.at_index(
                    word.start, lambda marker: marker.name == "mark1", 0)
                if enclosing:
                    return None
                return word

            def then(self, matches, when_response, context):
                matches.remove(when_response)

        rebulk = (
            Rebulk()
            .regex(r'\(.*?\)', marker=True, name="mark1")
            .regex("w.*?d", name="word")
            .rules(MarkerRule)
        )

        found = rebulk.matches("gr(ab wo)rd only if starting of match is inside parenthesis")
        assert len(found) == 1
        assert found[0].value == "wo)rd"

        found = rebulk.matches("don't grab wo(rd if starting of match is not inside parenthesis")
        assert len(found) == 0

    def test_remove_marker(self):
        """Expect a rule to be able to remove a marker while the word match is still returned."""

        class MarkerRule(Rule):
            """Remove the first "mark1" marker when there is one."""

            def when(self, matches, context):
                mark = matches.markers.named("mark1", 0)
                if not mark:
                    return None
                return mark

            def then(self, matches, when_response, context):
                matches.markers.remove(when_response)

        rebulk = (
            Rebulk()
            .regex(r'\(.*?\)', marker=True, name="mark1")
            .regex("w.*?d", name="word")
            .rules(MarkerRule)
        )

        found = rebulk.matches("grab word event (if it's not) inside parenthesis")
        assert len(found) == 1
        assert found[0].value == "word"

        # The only marker is expected to have been removed by the rule.
        assert not found.markers
|
|
|
|
|
|
|
|
|
|
|
|
class TestUnicode:
    """Pattern matching against a non-ASCII input string."""

    def test_rebulk_simple(self):
        """Check that string, regex and functional patterns each match one CJK character."""

        def func(input_string):
            # Span of the first "的" occurrence; nothing when it is absent.
            start = input_string.find("的")
            if start == -1:
                return None
            return start, start + len("的")

        # Patterns are registered in the same order the assertions expect below.
        rebulk = Rebulk().string("敏").regex("捷").functional(func)

        found = rebulk.matches("敏捷的棕色狐狸跳過懶狗")

        assert len(found) == 3
        assert [match.value for match in found] == ["敏", "捷", "的"]
|
|
|
|
|
|
|
|
|
|
|
|
class TestImmutable:
    """Emptying the result of a lookup must leave the original matches untouched."""

    def test_starting(self):
        """Empty every ``starting(index)`` result and expect the three matches to remain."""
        sentence = "The quick brown fox jumps over the lazy dog"
        rebulk = Rebulk().string("quick").string("over").string("fox")
        matches = rebulk.matches(sentence)

        for position in range(len(sentence)):
            starting = matches.starting(position)
            # Iterate over a copy because the result is emptied while looping.
            for match in list(starting):
                starting.remove(match)

        assert len(matches) == 3

    def test_ending(self):
        """Empty every ``ending(index)`` result and expect the three matches to remain."""
        sentence = "The quick brown fox jumps over the lazy dog"
        rebulk = Rebulk().string("quick").string("over").string("fox")
        matches = rebulk.matches(sentence)

        for position in range(len(sentence)):
            ending = matches.ending(position)
            # Iterate over a copy because the result is emptied while looping.
            for match in list(ending):
                ending.remove(match)

        assert len(matches) == 3

    def test_named(self):
        """Empty the ``named('test')`` result and expect the three matches to remain."""
        sentence = "The quick brown fox jumps over the lazy dog"
        rebulk = Rebulk().defaults(name='test').string("quick").string("over").string("fox")
        matches = rebulk.matches(sentence)

        named = matches.named('test')
        # Iterate over a copy because the result is emptied while looping.
        for match in list(named):
            named.remove(match)

        assert len(named) == 0
        assert len(matches) == 3
|