You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
239 lines
7.6 KiB
239 lines
7.6 KiB
#!/usr/bin/env python
|
|
# -*- coding: utf-8 -*-
|
|
"""
|
|
Processors
|
|
"""
|
|
from collections import defaultdict
|
|
import copy
|
|
|
|
import six
|
|
|
|
from rebulk import Rebulk, Rule, CustomRule, POST_PROCESS, PRE_PROCESS, AppendMatch, RemoveMatch
|
|
|
|
from .common import seps_no_groups
|
|
from .common.formatters import cleanup
|
|
from .common.comparators import marker_sorted
|
|
from .common.date import valid_year
|
|
from .common.words import iter_words
|
|
|
|
|
|
class EnlargeGroupMatches(CustomRule):
    """
    Widen matches sitting right inside a group marker so their span also covers the group delimiter.

    A match starting at ``group.start + 1`` gets its start moved back by one,
    and a match ending at ``group.end - 1`` gets its end moved forward by one.
    """
    priority = PRE_PROCESS

    def when(self, matches, context):
        """
        Collect the matches touching the inner edges of any 'group' marker.

        :param matches: matches container being processed
        :param context: processing context (unused here)
        :return: ``(starting, ending)`` lists, or None when both are empty
        """
        at_group_start = []
        at_group_end = []

        for group in matches.markers.named('group'):
            at_group_start.extend(matches.starting(group.start + 1))
            at_group_end.extend(matches.ending(group.end - 1))

        if not (at_group_start or at_group_end):
            return None
        return at_group_start, at_group_end

    def then(self, matches, when_response, context):
        """
        Enlarge the collected matches by one position on the relevant side.

        Each match is removed from the container before being mutated and
        appended again afterwards, so the container sees the updated bounds.

        :param matches: matches container being processed
        :param when_response: ``(starting, ending)`` tuple produced by ``when``
        :param context: processing context (unused here)
        """
        at_group_start, at_group_end = when_response

        for enlarged in at_group_start:
            matches.remove(enlarged)
            # Order matters: the span start is moved first, then the raw start is shifted.
            enlarged.start -= 1
            enlarged.raw_start += 1
            matches.append(enlarged)

        for enlarged in at_group_end:
            matches.remove(enlarged)
            # Order matters: the span end is moved first, then the raw end is shifted.
            enlarged.end += 1
            enlarged.raw_end -= 1
            matches.append(enlarged)
|
|
|
|
|
|
class EquivalentHoles(Rule):
    """
    Turn holes whose text equals an existing string match (ignoring case) into matches of the same name.
    """
    priority = POST_PROCESS
    consequence = AppendMatch

    def when(self, matches, context):
        """
        Build the list of holes to append as 'equivalent' matches.

        For every 'path' marker, each hole inside it is compared with every
        existing named match. When the lowercased values are equal, both the
        hole and the existing match take the value chosen by
        ``_preferred_string``, and the hole is renamed and tagged 'equivalent'.
        Existing matches tagged 'equivalent-ignore' are skipped.

        :param matches: matches container being processed
        :param context: processing context (unused here)
        :return: holes converted to named matches
        :rtype: list
        """
        equivalents = []

        for filepath in marker_sorted(matches.markers.named('path'), matches):
            holes = matches.holes(start=filepath.start, end=filepath.end, formatter=cleanup)
            for name in matches.names:
                # Iterate over a snapshot because consumed holes are removed from ``holes`` below.
                for hole in list(holes):
                    for existing in matches.named(name):
                        if not isinstance(existing.value, six.string_types):
                            continue
                        if hole.value.lower() != existing.value.lower():
                            continue
                        if 'equivalent-ignore' in existing.tags:
                            continue

                        preferred = _preferred_string(hole.value, existing.value)
                        if hole.value != preferred:
                            hole.value = preferred
                        if existing.value != preferred:
                            existing.value = preferred

                        hole.name = name
                        hole.tags = ['equivalent']
                        equivalents.append(hole)
                        # The hole may already have been consumed by an earlier existing match.
                        if hole in holes:
                            holes.remove(hole)

        return equivalents
|
|
|
|
|
|
class RemoveAmbiguous(Rule):
    """
    Resolve matches sharing a name but carrying different values across fileparts.

    Fileparts are visited in the order given by ``sort_function``. The values found
    in the first filepart containing a name are kept; in later fileparts, matches of
    that name are removed unless their value is one of the kept values.
    """

    priority = POST_PROCESS
    consequence = RemoveMatch

    def __init__(self, sort_function=marker_sorted, predicate=None):
        """
        :param sort_function: callable ``(markers, matches)`` returning fileparts in visiting order
        :param predicate: optional filter passed to ``matches.range`` to select candidate matches
        """
        super(RemoveAmbiguous, self).__init__()
        self.sort_function = sort_function
        self.predicate = predicate

    def when(self, matches, context):
        """
        Compute the matches that must be removed.

        :param matches: matches container being processed
        :param context: processing context (unused here)
        :return: matches to remove
        :rtype: list
        """
        seen_names = set()
        kept_values = defaultdict(list)
        discarded = []

        for filepart in self.sort_function(matches.markers.named('path'), matches):
            names_here = set()
            for match in matches.range(filepart.start, filepart.end, predicate=self.predicate):
                names_here.add(match.name)
                known = kept_values[match.name]
                if match.value in known:
                    # Same value as a kept match: always retained.
                    continue
                if match.name in seen_names:
                    # Name already settled by an earlier filepart with other values.
                    discarded.append(match)
                else:
                    known.append(match.value)

            # Names only count as "previous" once the whole filepart has been scanned,
            # so several values inside the same filepart are all kept.
            seen_names.update(names_here)

        return discarded
|
|
|
|
|
|
class RemoveLessSpecificSeasonEpisode(RemoveAmbiguous):
    """
    Variant of RemoveAmbiguous restricted to a single match name (season or episode).

    When several matches of that name have different values, keep the one
    tagged 'SxxExx' or the one in the rightmost filepart.
    """
    def __init__(self, name):
        """
        :param name: name of the matches to disambiguate
        :type name: str
        """
        def has_name(match):
            return match.name == name

        def is_sxxexx(match):
            return match.name == name and 'SxxExx' in match.tags

        def reversed_marker_sort(markers, matches):
            # Markers are reversed before sorting, with the SxxExx predicate handed to marker_sorted.
            return marker_sorted(list(reversed(markers)), matches, is_sxxexx)

        super(RemoveLessSpecificSeasonEpisode, self).__init__(
            sort_function=reversed_marker_sort,
            predicate=has_name)
|
|
|
|
|
|
def _preferred_string(value1, value2): # pylint:disable=too-many-return-statements
|
|
"""
|
|
Retrieves preferred title from both values.
|
|
:param value1:
|
|
:type value1: str
|
|
:param value2:
|
|
:type value2: str
|
|
:return: The preferred title
|
|
:rtype: str
|
|
"""
|
|
if value1 == value2:
|
|
return value1
|
|
if value1.istitle() and not value2.istitle():
|
|
return value1
|
|
if not value1.isupper() and value2.isupper():
|
|
return value1
|
|
if not value1.isupper() and value1[0].isupper() and not value2[0].isupper():
|
|
return value1
|
|
if _count_title_words(value1) > _count_title_words(value2):
|
|
return value1
|
|
return value2
|
|
|
|
|
|
def _count_title_words(value):
    """
    Count how many words in value are title-cased.

    :param value: text split into words by ``iter_words``
    :type value: str
    :return: number of words for which ``str.istitle()`` is true
    :rtype: int
    """
    return sum(1 for word in iter_words(value) if word.value.istitle())
|
|
|
|
|
|
class SeasonYear(Rule):
    """
    If a season is a valid year and no year was found, create a match with year.
    """
    priority = POST_PROCESS
    consequence = AppendMatch

    def when(self, matches, context):
        """
        Build 'year' matches from seasons whose value is a valid year.

        :param matches: matches container being processed
        :param context: processing context (unused here)
        :return: shallow copies of the qualifying season matches, renamed to 'year';
                 empty when a 'year' match already exists
        :rtype: list
        """
        years = []
        if matches.named('year'):
            return years

        for season in matches.named('season'):
            if not valid_year(season.value):
                continue
            # Shallow copy so the original season match keeps its own name.
            year_match = copy.copy(season)
            year_match.name = 'year'
            years.append(year_match)
        return years
|
|
|
|
|
|
class Processors(CustomRule):
    """
    Placeholder rule that does nothing; it exists so post-processing rules can be ordered properly.
    """
    priority = POST_PROCESS

    def when(self, matches, context):
        """Never triggers: always returns None."""
        return None

    def then(self, matches, when_response, context):  # pragma: no cover
        """No-op consequence."""
        return None
|
|
|
|
|
|
class StripSeparators(CustomRule):
    """
    Trim separators from the raw edges of matches.

    Separators are kept when they look like part of an acronym, as in ".S.H.I.E.L.D.":
    an edge separator is only dropped when the character two positions further in
    is not itself a separator (or when the raw text is shorter than three characters).
    """
    priority = POST_PROCESS

    def when(self, matches, context):
        """Always triggers, handing the matches container itself to ``then``."""
        return matches

    def then(self, matches, when_response, context):  # pragma: no cover
        """
        Shift ``raw_start`` / ``raw_end`` of every match past leading / trailing separators.

        :param matches: matches container being processed
        :param when_response: value returned by ``when`` (unused here)
        :param context: processing context (unused here)
        """
        for match in matches:
            # NOTE(review): the number of passes is len(match.span); if span is a
            # (start, end) pair this is always 2 rather than the match length - confirm intent.
            for _ in range(len(match.span)):
                raw = match.raw
                if raw[0] in seps_no_groups and (len(raw) < 3 or raw[2] not in seps_no_groups):
                    match.raw_start += 1

            for _ in range(len(match.span)):
                raw = match.raw
                if raw[-1] in seps_no_groups and (len(raw) < 3 or raw[-3] not in seps_no_groups):
                    match.raw_end -= 1
|
|
|
|
|
|
def processors():
    """
    Builder for rebulk object.

    Registers the processing rules of this module, in declaration order, on a new Rebulk.

    :return: Created Rebulk object
    :rtype: Rebulk
    """
    rebulk = Rebulk()
    rules = (
        EnlargeGroupMatches,
        EquivalentHoles,
        RemoveLessSpecificSeasonEpisode('season'),
        RemoveLessSpecificSeasonEpisode('episode'),
        RemoveAmbiguous,
        SeasonYear,
        Processors,
        StripSeparators,
    )
    return rebulk.rules(*rules)
|