From 9ac3b0009b6db1dc74f8130dde830164323dbf89 Mon Sep 17 00:00:00 2001 From: panni Date: Wed, 28 Nov 2018 11:44:59 +0100 Subject: [PATCH] update subzero/subliminal_patch/fcache to current versions --- libs/fcache/cache.py | 55 ++++++++++++++----- libs/subliminal_patch/core.py | 3 +- libs/subliminal_patch/http.py | 11 +++- .../subliminal_patch/providers/hosszupuska.py | 9 +-- libs/subliminal_patch/providers/legendastv.py | 4 +- .../subliminal_patch/providers/napiprojekt.py | 2 + .../providers/opensubtitles.py | 8 ++- libs/subliminal_patch/providers/podnapisi.py | 11 ++-- libs/subliminal_patch/providers/subscenter.py | 2 + .../subliminal_patch/providers/tvsubtitles.py | 4 ++ libs/subliminal_patch/refiners/drone.py | 8 +-- libs/subliminal_patch/utils.py | 9 ++- libs/subliminal_patch/video.py | 2 + libs/subzero/language.py | 10 +++- .../subzero/modification/dictionaries/data.py | 32 +++++------ .../modification/dictionaries/make_data.py | 4 -- libs/subzero/modification/mods/common.py | 7 +-- libs/subzero/video.py | 12 +++- 18 files changed, 128 insertions(+), 65 deletions(-) diff --git a/libs/fcache/cache.py b/libs/fcache/cache.py index 3eaf62777..695f916c3 100644 --- a/libs/fcache/cache.py +++ b/libs/fcache/cache.py @@ -5,6 +5,7 @@ import pickle import shutil import tempfile import traceback +import hashlib import appdirs @@ -89,7 +90,7 @@ class FileCache(MutableMapping): """ def __init__(self, appname, flag='c', mode=0o666, keyencoding='utf-8', - serialize=True, app_cache_dir=None): + serialize=True, app_cache_dir=None, key_file_ext=".txt"): """Initialize a :class:`FileCache` object.""" if not isinstance(flag, str): raise TypeError("flag must be str not '{}'".format(type(flag))) @@ -130,6 +131,7 @@ class FileCache(MutableMapping): self._mode = mode self._keyencoding = keyencoding self._serialize = serialize + self.key_file_ext = key_file_ext def _parse_appname(self, appname): """Splits an appname into the appname and subcache components.""" @@ -188,6 +190,11 @@ class FileCache(MutableMapping): except: logger.error("Couldn't write content from %r to cache file: %r: %s", ekey, filename, traceback.format_exc()) + try: + self.__write_to_file(filename + self.key_file_ext, ekey) + except: + logger.error("Couldn't write content from %r to cache file: %r: %s", ekey, filename, + traceback.format_exc()) self._buffer.clear() self._sync = False @@ -196,8 +203,7 @@ class FileCache(MutableMapping): raise ValueError("invalid operation on closed cache") def _encode_key(self, key): - """Encode key using *hex_codec* for constructing a cache filename. - + """ Keys are implicitly converted to :class:`bytes` if passed as :class:`str`. @@ -206,16 +212,15 @@ class FileCache(MutableMapping): key = key.encode(self._keyencoding) elif not isinstance(key, bytes): raise TypeError("key must be bytes or str") - return codecs.encode(key, 'hex_codec').decode(self._keyencoding) + return key.decode(self._keyencoding) def _decode_key(self, key): - """Decode key using hex_codec to retrieve the original key. - + """ Keys are returned as :class:`str` if serialization is enabled. Keys are returned as :class:`bytes` if serialization is disabled. """ - bkey = codecs.decode(key.encode(self._keyencoding), 'hex_codec') + bkey = key.encode(self._keyencoding) return bkey.decode(self._keyencoding) if self._serialize else bkey def _dumps(self, value): @@ -226,18 +231,24 @@ class FileCache(MutableMapping): def _key_to_filename(self, key): """Convert an encoded key to an absolute cache filename.""" - return os.path.join(self.cache_dir, key) + if isinstance(key, unicode): + key = key.encode(self._keyencoding) + return os.path.join(self.cache_dir, hashlib.md5(key).hexdigest()) def _filename_to_key(self, absfilename): """Convert an absolute cache filename to a key name.""" - return os.path.split(absfilename)[1] + hkey_hdr_fn = absfilename + self.key_file_ext + if os.path.isfile(hkey_hdr_fn): + with open(hkey_hdr_fn, 'rb') as f: + key = f.read() + return key.decode(self._keyencoding) if self._serialize else key def _all_filenames(self, scandir_generic=True): """Return a list of absolute cache filenames""" _scandir = _scandir_generic if scandir_generic else scandir try: for entry in _scandir(self.cache_dir): - if entry.is_file(follow_symlinks=False): + if entry.is_file(follow_symlinks=False) and not entry.name.endswith(self.key_file_ext): yield os.path.join(self.cache_dir, entry.name) except (FileNotFoundError, OSError): raise StopIteration @@ -250,14 +261,17 @@ class FileCache(MutableMapping): else: return set(file_keys + list(self._buffer)) - def _write_to_file(self, filename, bytesvalue): + def __write_to_file(self, filename, value): """Write bytesvalue to filename.""" fh, tmp = tempfile.mkstemp() with os.fdopen(fh, self._flag) as f: - f.write(self._dumps(bytesvalue)) + f.write(value) rename(tmp, filename) os.chmod(filename, self._mode) + def _write_to_file(self, filename, bytesvalue): + self.__write_to_file(filename, self._dumps(bytesvalue)) + def _read_from_file(self, filename): """Read data from filename.""" try: @@ -274,6 +288,7 @@ class FileCache(MutableMapping): else: filename = self._key_to_filename(ekey) self._write_to_file(filename, value) + self.__write_to_file(filename + self.key_file_ext, ekey) def __getitem__(self, key): ekey = self._encode_key(key) @@ -283,8 +298,9 @@ class FileCache(MutableMapping): except KeyError: pass filename = self._key_to_filename(ekey) - if filename not in self._all_filenames(): + if not os.path.isfile(filename): raise KeyError(key) + return self._read_from_file(filename) def __delitem__(self, key): @@ -301,6 +317,11 @@ class FileCache(MutableMapping): except (IOError, OSError): pass + try: + os.remove(filename + self.key_file_ext) + except (IOError, OSError): + pass + def __iter__(self): for key in self._all_keys(): yield self._decode_key(key) @@ -310,4 +331,10 @@ class FileCache(MutableMapping): def __contains__(self, key): ekey = self._encode_key(key) - return ekey in self._all_keys() + if not self._sync: + try: + return ekey in self._buffer + except KeyError: + pass + filename = self._key_to_filename(ekey) + return os.path.isfile(filename) diff --git a/libs/subliminal_patch/core.py b/libs/subliminal_patch/core.py index 5f3158195..28e3df5ad 100644 --- a/libs/subliminal_patch/core.py +++ b/libs/subliminal_patch/core.py @@ -514,8 +514,7 @@ def scan_video(path, dont_use_actual_file=False, hints=None, providers=None, ski # guess hints["single_value"] = True - if video_type == "movie": - hints["expected_title"] = [hints["title"]] + hints["expected_title"] = [hints["title"]] guessed_result = guessit(guess_from, options=hints) logger.debug('GuessIt found: %s', json.dumps(guessed_result, cls=GuessitEncoder, indent=4, ensure_ascii=False)) diff --git a/libs/subliminal_patch/http.py b/libs/subliminal_patch/http.py index d7d1310ff..d6fddb358 100644 --- a/libs/subliminal_patch/http.py +++ b/libs/subliminal_patch/http.py @@ -16,9 +16,7 @@ from exceptions import APIThrottled from subzero.lib.io import get_viable_encoding logger = logging.getLogger(__name__) -pem_file = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(unicode(__file__, get_viable_encoding()))), - "..", "..", certifi.where())) - +pem_file = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(unicode(__file__, get_viable_encoding()))), "..", certifi.where())) try: default_ssl_context = ssl.create_default_context(cafile=pem_file) except AttributeError: @@ -33,10 +31,17 @@ custom_resolver.nameservers = ['8.8.8.8', '1.1.1.1'] class CertifiSession(Session): + timeout = 10 + def __init__(self): super(CertifiSession, self).__init__() self.verify = pem_file + def request(self, *args, **kwargs): + if kwargs.get('timeout') is None: + kwargs['timeout'] = self.timeout + return super(CertifiSession, self).request(*args, **kwargs) + class RetryingSession(CertifiSession): proxied_functions = ("get", "post") diff --git a/libs/subliminal_patch/providers/hosszupuska.py b/libs/subliminal_patch/providers/hosszupuska.py index 2552f1b78..96c5bce71 100644 --- a/libs/subliminal_patch/providers/hosszupuska.py +++ b/libs/subliminal_patch/providers/hosszupuska.py @@ -33,8 +33,9 @@ def fix_inconsistent_naming(title): :rtype: str """ - return _fix_inconsistent_naming(title, {"DC's Legends of Tomorrow": "Legends of Tomorrow", - "Marvel's Jessica Jones": "Jessica Jones"}) + return _fix_inconsistent_naming(title, {"Stargate Origins": "Stargate: Origins", + "Marvel's Agents of S.H.I.E.L.D.": "Marvels+Agents+of+S.H.I.E.L.D", + "Mayans M.C.": "Mayans MC"}, True ) logger = logging.getLogger(__name__) @@ -89,7 +90,7 @@ class HosszupuskaSubtitle(Subtitle): def get_matches(self, video): matches = set() # series - if video.series and sanitize(self.series) == sanitize(video.series): + if video.series and ( sanitize(self.series) == sanitize(fix_inconsistent_naming(video.series)) or sanitize(self.series) == sanitize(video.series)): matches.add('series') # season if video.season and self.season == video.season: @@ -150,7 +151,7 @@ class HosszupuskaProvider(Provider, ProviderSubtitleArchiveMixin): seasona = "%02d" % season episodea = "%02d" % episode series = fix_inconsistent_naming(series) - seriesa = series.replace(' ', '+').replace('\'', '') + seriesa = series.replace(' ', '+') # get the episode page logger.info('Getting the page for episode %s', episode) diff --git a/libs/subliminal_patch/providers/legendastv.py b/libs/subliminal_patch/providers/legendastv.py index cffde9064..0661b5f4d 100644 --- a/libs/subliminal_patch/providers/legendastv.py +++ b/libs/subliminal_patch/providers/legendastv.py @@ -7,7 +7,8 @@ from subliminal.exceptions import ConfigurationError from subliminal.providers.legendastv import LegendasTVSubtitle as _LegendasTVSubtitle, \ LegendasTVProvider as _LegendasTVProvider, Episode, Movie, guess_matches, guessit, sanitize, region, type_map, \ raise_for_status, json, SHOW_EXPIRATION_TIME, title_re, season_re, datetime, pytz, NO_VALUE, releases_key, \ - SUBTITLE_EXTENSIONS + SUBTITLE_EXTENSIONS, language_converters +from subzero.language import Language logger = logging.getLogger(__name__) @@ -63,6 +64,7 @@ class LegendasTVSubtitle(_LegendasTVSubtitle): class LegendasTVProvider(_LegendasTVProvider): + languages = {Language(*l) for l in language_converters['legendastv'].to_legendastv.keys()} subtitle_class = LegendasTVSubtitle def __init__(self, username=None, password=None): diff --git a/libs/subliminal_patch/providers/napiprojekt.py b/libs/subliminal_patch/providers/napiprojekt.py index e4fefae46..0b80f821d 100644 --- a/libs/subliminal_patch/providers/napiprojekt.py +++ b/libs/subliminal_patch/providers/napiprojekt.py @@ -3,6 +3,7 @@ import logging from subliminal.providers.napiprojekt import NapiProjektProvider as _NapiProjektProvider, \ NapiProjektSubtitle as _NapiProjektSubtitle, get_subhash +from subzero.language import Language logger = logging.getLogger(__name__) @@ -18,6 +19,7 @@ class NapiProjektSubtitle(_NapiProjektSubtitle): class NapiProjektProvider(_NapiProjektProvider): + languages = {Language.fromalpha2(l) for l in ['pl']} subtitle_class = NapiProjektSubtitle def query(self, language, hash): diff --git a/libs/subliminal_patch/providers/opensubtitles.py b/libs/subliminal_patch/providers/opensubtitles.py index 23e82c833..376c8ce19 100644 --- a/libs/subliminal_patch/providers/opensubtitles.py +++ b/libs/subliminal_patch/providers/opensubtitles.py @@ -163,12 +163,13 @@ class OpenSubtitlesProvider(ProviderRetryMixin, _OpenSubtitlesProvider): token = region.get("os_token", expiration_time=3600) if token is not NO_VALUE: try: - logger.debug('Trying previous token') + logger.debug('Trying previous token: %r', token[:10]+"X"*(len(token)-10)) checked(lambda: self.server.NoOperation(token)) self.token = token - logger.debug("Using previous login token: %s", self.token) + logger.debug("Using previous login token: %r", token[:10]+"X"*(len(token)-10)) return except: + logger.debug('Token not valid.') pass try: @@ -299,6 +300,9 @@ class OpenSubtitlesProvider(ProviderRetryMixin, _OpenSubtitlesProvider): elif also_foreign and foreign_parts_only: language = Language.rebuild(language, forced=True) + if language not in languages: + continue + query_parameters = _subtitle_item.get("QueryParameters") subtitle = self.subtitle_class(language, hearing_impaired, page_link, subtitle_id, matched_by, diff --git a/libs/subliminal_patch/providers/podnapisi.py b/libs/subliminal_patch/providers/podnapisi.py index 128973ea7..f55d08429 100644 --- a/libs/subliminal_patch/providers/podnapisi.py +++ b/libs/subliminal_patch/providers/podnapisi.py @@ -175,7 +175,7 @@ class PodnapisiProvider(_PodnapisiProvider, ProviderSubtitleArchiveMixin): if pid in pids: continue - language = Language.fromietf(subtitle_xml.find('language').text) + _language = Language.fromietf(subtitle_xml.find('language').text) hearing_impaired = 'n' in (subtitle_xml.find('flags').text or '') foreign = 'f' in (subtitle_xml.find('flags').text or '') if only_foreign and not foreign: @@ -185,7 +185,10 @@ class PodnapisiProvider(_PodnapisiProvider, ProviderSubtitleArchiveMixin): continue elif also_foreign and foreign: - language = Language.rebuild(language, forced=True) + _language = Language.rebuild(_language, forced=True) + + if language != _language: + continue page_link = subtitle_xml.find('url').text releases = [] @@ -198,12 +201,12 @@ class PodnapisiProvider(_PodnapisiProvider, ProviderSubtitleArchiveMixin): r_year = int(subtitle_xml.find('year').text) if is_episode: - subtitle = self.subtitle_class(language, hearing_impaired, page_link, pid, releases, title, + subtitle = self.subtitle_class(_language, hearing_impaired, page_link, pid, releases, title, season=r_season, episode=r_episode, year=r_year, asked_for_release_group=video.release_group, asked_for_episode=episode) else: - subtitle = self.subtitle_class(language, hearing_impaired, page_link, pid, releases, title, + subtitle = self.subtitle_class(_language, hearing_impaired, page_link, pid, releases, title, year=r_year, asked_for_release_group=video.release_group) diff --git a/libs/subliminal_patch/providers/subscenter.py b/libs/subliminal_patch/providers/subscenter.py index a8b9844b4..5626c9935 100644 --- a/libs/subliminal_patch/providers/subscenter.py +++ b/libs/subliminal_patch/providers/subscenter.py @@ -2,6 +2,7 @@ from subliminal.providers.subscenter import SubsCenterProvider as _SubsCenterProvider, \ SubsCenterSubtitle as _SubsCenterSubtitle +from subzero.language import Language class SubsCenterSubtitle(_SubsCenterSubtitle): @@ -21,6 +22,7 @@ class SubsCenterSubtitle(_SubsCenterSubtitle): class SubsCenterProvider(_SubsCenterProvider): + languages = {Language.fromalpha2(l) for l in ['he']} subtitle_class = SubsCenterSubtitle hearing_impaired_verifiable = True server_url = 'http://www.subscenter.info/he/' diff --git a/libs/subliminal_patch/providers/tvsubtitles.py b/libs/subliminal_patch/providers/tvsubtitles.py index d09a6adc5..7f359772d 100644 --- a/libs/subliminal_patch/providers/tvsubtitles.py +++ b/libs/subliminal_patch/providers/tvsubtitles.py @@ -21,6 +21,10 @@ class TVsubtitlesSubtitle(_TVsubtitlesSubtitle): class TVsubtitlesProvider(_TVsubtitlesProvider): + languages = {Language('por', 'BR')} | {Language(l) for l in [ + 'ara', 'bul', 'ces', 'dan', 'deu', 'ell', 'eng', 'fin', 'fra', 'hun', 'ita', 'jpn', 'kor', 'nld', 'pol', 'por', + 'ron', 'rus', 'spa', 'swe', 'tur', 'ukr', 'zho' + ]} subtitle_class = TVsubtitlesSubtitle @region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME) diff --git a/libs/subliminal_patch/refiners/drone.py b/libs/subliminal_patch/refiners/drone.py index d0a5cb628..ae96e26c9 100644 --- a/libs/subliminal_patch/refiners/drone.py +++ b/libs/subliminal_patch/refiners/drone.py @@ -63,12 +63,12 @@ class DroneAPIClient(object): out[key] = quote(value) return out - def get(self, endpoint, **params): + def get(self, endpoint, requests_kwargs=None, **params): url = urljoin(self.api_url, endpoint) params = self.build_params(params) # perform the request - r = self.session.get(url, params=params) + r = self.session.get(url, params=params, **(requests_kwargs or {})) r.raise_for_status() # get the response as json @@ -79,8 +79,8 @@ class DroneAPIClient(object): return j return [] - def status(self): - return self.get("system/status") + def status(self, **kwargs): + return self.get("system/status", requests_kwargs=kwargs) def update_video(self, video, scene_name): """ diff --git a/libs/subliminal_patch/utils.py b/libs/subliminal_patch/utils.py index d439272b0..17720334e 100644 --- a/libs/subliminal_patch/utils.py +++ b/libs/subliminal_patch/utils.py @@ -35,11 +35,12 @@ def sanitize(string, ignore_characters=None, default_characters={'-', ':', '(', return string.strip().lower() -def fix_inconsistent_naming(title, inconsistent_titles_dict=None): +def fix_inconsistent_naming(title, inconsistent_titles_dict=None, no_sanitize=False): """Fix titles with inconsistent naming using dictionary and sanitize them. :param str title: original title. :param dict inconsistent_titles_dict: dictionary of titles with inconsistent naming. + :param bool no_sanitize: indication to not sanitize title. :return: new title. :rtype: str @@ -54,5 +55,9 @@ def fix_inconsistent_naming(title, inconsistent_titles_dict=None): pattern = re.compile('|'.join(re.escape(key) for key in inconsistent_titles_dict.keys())) title = pattern.sub(lambda x: inconsistent_titles_dict[x.group()], title) + if no_sanitize: + return title + else: + return sanitize(title) # return fixed and sanitized title - return sanitize(title) + return title diff --git a/libs/subliminal_patch/video.py b/libs/subliminal_patch/video.py index df615c849..a53b09ead 100644 --- a/libs/subliminal_patch/video.py +++ b/libs/subliminal_patch/video.py @@ -12,6 +12,7 @@ class Video(Video_): hints = None season_fully_aired = None audio_languages = None + external_subtitle_languages = None def __init__(self, name, format=None, release_group=None, resolution=None, video_codec=None, audio_codec=None, imdb_id=None, hashes=None, size=None, subtitle_languages=None, audio_languages=None): @@ -22,3 +23,4 @@ class Video(Video_): self.plexapi_metadata = {} self.hints = {} self.audio_languages = audio_languages or set() + self.external_subtitle_languages = set() diff --git a/libs/subzero/language.py b/libs/subzero/language.py index af486c4ed..f520f6dad 100644 --- a/libs/subzero/language.py +++ b/libs/subzero/language.py @@ -1,6 +1,7 @@ # coding=utf-8 -from babelfish.exceptions import LanguageError +import types +from babelfish.exceptions import LanguageError from babelfish import Language as Language_, basestr @@ -34,7 +35,12 @@ def wrap_forced(f): cls = args[0] args = args[1:] s = args.pop(0) - base, forced = s.split(":") if ":" in s else (s, False) + forced = None + if isinstance(s, types.StringTypes): + base, forced = s.split(":") if ":" in s else (s, False) + else: + base = s + instance = f(cls, base, *args, **kwargs) if isinstance(instance, Language): instance.forced = forced == "forced" diff --git a/libs/subzero/modification/dictionaries/data.py b/libs/subzero/modification/dictionaries/data.py index 2ec9253f7..7524cbc04 100644 --- a/libs/subzero/modification/dictionaries/data.py +++ b/libs/subzero/modification/dictionaries/data.py @@ -6,7 +6,7 @@ data = {'bos': {'BeginLines': {'data': OrderedDict(), 'pattern': None}, 'PartialLines': {'data': OrderedDict([(u'da nadjem', u'na\u0107i'), (u'da nadjes', u'na\u0107i'), (u'da budes', u'biti'), (u'da ides', u'i\u0107i'), (u'da prodemo', u'pro\u0107i'), (u'da udem', u'u\u0107i'), (u'gdje ides', u'kamo ide\u0161'), (u'Gdje ides', u'Kamo ide\u0161'), (u'hocu da budem', u'\u017eelim biti'), (u'Hocu da budem', u'\u017delim biti'), (u'hocu da kazem', u'\u017eelim re\u0107i'), (u'hoces da kazes', u'\u017eeli\u0161 re\u0107i'), (u'hoce da kaze', u'\u017eeli re\u0107i'), (u'kao sto sam', u'kao \u0161to sam'), (u'me leda', u'me le\u0111a'), (u'medu nama', u'me\u0111u nama'), (u'moramo da idemo', u'moramo i\u0107i'), (u'moras da ides', u'mora\u0161 i\u0107i'), (u'na vecer', u'nave\u010der'), (u'Na vecer', u'Nave\u010der'), (u'ne cu', u'ne\u0107u'), (u'ne ces', u'ne\u0107e\u0161'), (u'ne\u0161to sto', u'ne\u0161to \u0161to'), (u'ono sto', u'ono \u0161to'), (u'Ono sto', u'Ono \u0161to'), (u'reci \u0107u', u're\u0107i \u0107u'), (u'sto ti se ne', u'\u0161to ti se ne'), (u'sto vise', u'\u0161to vi\u0161e'), (u'sve sto', u'sve \u0161to'), (u'Zao mi', u'\u017dao mi'), (u'zao mi', u'\u017eao mi'), (u'Zato sto', u'Zato \u0161to'), (u'zato sto', u'zato \u0161to'), (u'znas sto', u'zna\u0161 \u0161to'), (u'zna\u0161 sto', u'zna\u0161 \u0161to')]), 'pattern': u'(?um)(?:(?<=\\s)|(?<=^)|(?<=\\b))(?:da\\ nadjem|da\\ nadjes|da\\ budes|da\\ ides|da\\ prodemo|da\\ udem|gdje\\ ides|Gdje\\ ides|hocu\\ da\\ budem|Hocu\\ da\\ budem|hocu\\ da\\ kazem|hoces\\ da\\ kazes|hoce\\ da\\ kaze|kao\\ sto\\ sam|me\\ leda|medu\\ nama|moramo\\ da\\ idemo|moras\\ da\\ ides|na\\ vecer|Na\\ vecer|ne\\ cu|ne\\ ces|ne\\\u0161to\\ sto|ono\\ sto|Ono\\ sto|reci\\ \\\u0107u|sto\\ ti\\ se\\ ne|sto\\ vise|sve\\ sto|Zao\\ mi|zao\\ mi|Zato\\ sto|zato\\ sto|znas\\ sto|zna\\\u0161\\ sto)(?:(?=\\s)|(?=$)|(?=\\b))'}, - 'PartialWordsAlways': {'data': OrderedDict([(u'\xb6\xb6', u'\u266b'), (u'\xb6', u'\u266a')]), + 'PartialWordsAlways': {'data': OrderedDict(), 'pattern': None}, 'WholeLines': {'data': OrderedDict(), 'pattern': None}, @@ -18,7 +18,7 @@ data = {'bos': {'BeginLines': {'data': OrderedDict(), 'pattern': None}, 'PartialLines': {'data': OrderedDict(), 'pattern': None}, - 'PartialWordsAlways': {'data': OrderedDict([(u'\xa4', u'o'), (u'IVI', u'M'), (u'lVI', u'M'), (u'IVl', u'M'), (u'lVl', u'M'), (u'\xb6\xb6', u'\u266b'), (u'\xb6', u'\u266a')]), + 'PartialWordsAlways': {'data': OrderedDict([(u'\xa4', u'o'), (u'IVI', u'M'), (u'lVI', u'M'), (u'IVl', u'M'), (u'lVl', u'M')]), 'pattern': None}, 'WholeLines': {'data': OrderedDict(), 'pattern': None}, @@ -30,7 +30,7 @@ data = {'bos': {'BeginLines': {'data': OrderedDict(), 'pattern': None}, 'PartialLines': {'data': OrderedDict(), 'pattern': None}, - 'PartialWordsAlways': {'data': OrderedDict([(u'IVI', u'M'), (u'IVl', u'M'), (u'I\\/I', u'M'), (u'I\\/l', u'M'), (u'lVI', u'M'), (u'lVl', u'M'), (u'l\\/I', u'M'), (u'l\\/l', u'M'), (u'\xa4', u'o'), (u'\xb6\xb6', u'\u266b'), (u'\xb6', u'\u266a')]), + 'PartialWordsAlways': {'data': OrderedDict([(u'IVI', u'M'), (u'IVl', u'M'), (u'I\\/I', u'M'), (u'I\\/l', u'M'), (u'lVI', u'M'), (u'lVl', u'M'), (u'l\\/I', u'M'), (u'l\\/l', u'M'), (u'\xa4', u'o')]), 'pattern': None}, 'WholeLines': {'data': OrderedDict(), 'pattern': None}, @@ -42,7 +42,7 @@ data = {'bos': {'BeginLines': {'data': OrderedDict(), 'pattern': u"(?um)(?:\\,\\ sin|\\ mothen|\\ can\\'t\\_|\\ openiL|\\ of\\\ufb02|pshycol|\\ i\\.\\.\\.|\\ L\\.)$"}, 'PartialLines': {'data': OrderedDict([(u' /be ', u' I be '), (u" aren '1'", u" aren't"), (u" aren'tyou", u" aren't you"), (u" doesn '1'", u" doesn't"), (u" fr/eno'", u' friend'), (u" fr/eno'.", u' friend.'), (u" haven 'z' ", u" haven't "), (u" haven 'z'.", u" haven't."), (u' I ha ve ', u' I have '), (u" I']I ", u" I'll "), (u' L am', u' I am'), (u' L can', u' I can'), (u" L don't ", u" I don't "), (u' L hate ', u' I hate '), (u' L have ', u' I have '), (u' L like ', u' I like '), (u' L will', u' I will'), (u' L would', u' I would'), (u" L'll ", u" I'll "), (u" L've ", u" I've "), (u' m y family', u' my family'), (u" 's ", u"'s "), (u" shou/dn '1 ", u" shouldn't "), (u" won 'z' ", u" won't "), (u" won 'z'.", u" won't."), (u" wou/c/n 'z' ", u" wouldn't "), (u" wou/c/n 'z'.", u" wouldn't."), (u" wou/dn 'z' ", u" wouldn't "), (u" wou/dn 'z'.", u" wouldn't."), (u'/ did', u'I did'), (u'/ have ', u'I have '), (u'/ just ', u'I just '), (u'/ loved ', u'I loved '), (u'/ need', u'I need'), (u'|was11', u'I was 11'), (u'at Hrst', u'at first'), (u"B ullshiz'", u'Bullshit'), (u'big lunk', u'love you'), (u"can 't", u"can't"), (u"can' t ", u"can't "), (u"can 't ", u"can't "), (u'CHA TTERING', u'CHATTERING'), (u'come 0n', u'come on'), (u'Come 0n', u'Come on'), (u"couldn 't", u"couldn't"), (u"couldn' t ", u"couldn't "), (u"couldn 't ", u"couldn't "), (u"Destin y's", u"Destiny's"), (u"didn 't", u"didn't"), (u"didn' t ", u"didn't "), (u"didn 't ", u"didn't "), (u"Doesn '1'", u"Doesn't"), (u"doesn '1' ", u"doesn't "), (u"doesn '1\u2018 ", u"doesn't "), (u"doesn 't", u"doesn't"), (u"doesn'1' ", u"doesn't "), (u"doesn'1\u2018 ", u"doesn't "), (u"don '1' ", u"don't "), (u"don '1\u2018 ", u"don't "), (u"don '2' ", u"don't "), (u" aren '2'", u" aren't"), (u"aren '2' ", u"aren't "), (u"don '2\u2018 ", u"don't "), (u"don 't", u"don't"), (u"Don' t ", u"Don't "), (u"Don 't ", u"Don't "), (u"don'1' ", u"don't "), (u"don'1\u2018 ", u"don't "), (u"there '5 ", u"there's "), (u'E very', u'Every'), (u'get 0n', u'get on'), (u'go 0n', u'go on'), (u'Go 0n', u'Go on'), (u"H3993' birthday", u'Happy birthday'), (u"hadn 't", u"hadn't"), (u"he 's", u"he's"), (u"He 's", u"He's"), (u'He y', u'Hey'), (u'he)/', u'hey'), (u'He)/', u'Hey'), (u'HEA VY', u'HEAVY'), (u'Henry ll', u'Henry II'), (u'Henry lll', u'Henry III'), (u'Henry Vlll', u'Henry VIII'), (u'Henry Vll', u'Henry VII'), (u'Henry Vl', u'Henry VI'), (u'Hold 0n', u'Hold on'), (u'I am. ls', u'I am. Is'), (u'I d0', u'I do'), (u"I 'm", u"I'm"), (u"I 'rn ", u"I'm "), (u"I 've", u"I've"), (u'I0 ve her', u'love her'), (u'I0 ve you', u'love you'), (u"I02'", u'lot'), (u"I'm sony", u"I'm sorry"), (u"isn' t ", u"isn't "), (u"isn 't ", u"isn't "), (u'K)/le', u'Kyle'), (u'L ook', u'Look'), (u'let me 90', u'let me go'), (u'Let me 90', u'Let me go'), (u"let's 90", u"let's go"), (u"Let's 90", u"Let's go"), (u'lfl had', u'If I had'), (u'lova you', u'love you'), (u'Lova you', u'love you'), (u'lovo you', u'love you'), (u'Lovo you', u'love you'), (u'ls anyone', u'Is anyone'), (u'ls he', u'Is he'), (u'-ls he', u'- Is he'), (u'ls it', u'Is it'), (u'-ls it', u'- Is it'), (u'ls she', u'Is she'), (u'-ls she', u'- Is she'), (u'ls that', u'Is that'), (u'-ls that', u'- Is that'), (u'ls this', u'Is this'), (u'-ls this', u'- Is this'), (u'Maze] tov', u'Mazel tov'), (u"N02' ", u'Not '), (u' of 0ur ', u' of our '), (u' ot mine ', u' of mine '), (u'PLA YING', u'PLAYING'), (u'REPEA TING ', u'REPEATING '), (u'Sa y', u'Say'), (u"she 's", u"she's"), (u"She 's", u"She's"), (u"shouldn 't", u"shouldn't"), (u'sta y', u'stay'), (u'Sta y', u'Stay'), (u'SWO rd', u'Sword'), (u'taka care', u'take care'), (u'Taka care', u'Take care'), (u'the Hrst', u'the first'), (u'toc late', u'too late'), (u'uf me', u'of me'), (u'uf our', u'of our'), (u'wa y', u'way'), (u'Wal-I\\/Iart', u'Wal-Mart'), (u"wasn '1' ", u"wasn't "), (u"Wasn '1' ", u"Wasn't "), (u"wasn '1\u2018 ", u"wasn't "), (u"Wasn '1\u2018 ", u"Wasn't "), (u"wasn 't", u"wasn't"), (u"Wasn 't", u"Wasn't"), (u"we 've", u"we've"), (u"We 've", u"We've"), (u"wem' off", u'went off'), (u"weren 't", u"weren't"), (u"who 's", u"who's"), (u"won 't", u"won't"), (u'would ha ve', u'would have '), (u"wouldn 't", u"wouldn't"), (u"Wouldn 't", u"Wouldn't"), (u'y()u', u'you'), (u'you QUYS', u'you guys'), (u"you' re ", u"you're "), (u"you 're ", u"you're "), (u"you 've", u"you've"), (u"You 've", u"You've"), (u"you' ve ", u"you've "), (u"you 've ", u"you've "), (u'aftera while', u'after a while'), (u'Aftera while', u'After a while'), (u'THUN DERCLAPS', u'THUNDERCLAPS'), (u'(BUZZI N G)', u'(BUZZING)'), (u'[BUZZI N G]', u'[BUZZING]'), (u'(G RU NTING', u'(GRUNTING'), (u'[G RU NTING', u'[GRUNTING'), (u'(G ROWLING', u'(GROWLING'), (u'[G ROWLING', u'[GROWLING'), (u' WAI LS)', u'WAILS)'), (u' WAI LS]', u'WAILS]'), (u'(scu RRYING)', u'(SCURRYING)'), (u'[scu RRYING]', u'[SCURRYING]'), (u'(GRUNT5)', u'(GRUNTS)'), (u'[GRUNT5]', u'[GRUNTS]'), (u'NARRA TOR:', u'NARRATOR:'), (u'(GROAN ING', u'(GROANING'), (u'[GROAN ING', u'[GROANING'), (u'GROAN ING)', u'GROANING)'), (u'GROAN ING]', u'GROANING]'), (u'(LAUGH ING', u'(LAUGHING'), (u'[LAUGH ING', u'[LAUGHING'), (u'LAUGH ING)', u'LAUGHING)'), (u'LAUGH ING]', u'LAUGHING]'), (u'(BU BBLING', u'(BUBBLING'), (u'[BU BBLING', u'[BUBBLING'), (u'BU BBLING)', u'BUBBLING)'), (u'BU BBLING]', u'BUBBLING]'), (u'(SH USHING', u'(SHUSHING'), (u'[SH USHING', u'[SHUSHING'), (u'SH USHING)', u'SHUSHING)'), (u'SH USHING]', u'SHUSHING]'), (u'(CH ILDREN', u'(CHILDREN'), (u'[CH ILDREN', u'[CHILDREN'), (u'CH ILDREN)', u'CHILDREN)'), (u'CH ILDREN]', u'CHILDREN]'), (u'(MURMU RING', u'(MURMURING'), (u'[MURMU RING', u'[MURMURING'), (u'MURMU RING)', u'MURMURING)'), (u'MURMU RING]', u'MURMURING]'), (u'(GU N ', u'(GUN '), (u'[GU N ', u'[GUN '), (u'GU N)', u'GUN)'), (u'GU N]', u'GUN]'), (u'CH ILDREN:', u'CHILDREN:'), (u'STU DENTS:', u'STUDENTS:'), (u'(WH ISTLE', u'(WHISTLE'), (u'[WH ISTLE', u'[WHISTLE'), (u'WH ISTLE)', u'WHISTLE)'), (u'WH ISTLE]', u'WHISTLE]'), (u'U LU LATING', u'ULULATING'), (u'AU DIENCE:', u'AUDIENCE:'), (u'HA WAIIAN', u'HAWAIIAN'), (u'(ARTH UR', u'(ARTHUR'), (u'[ARTH UR', u'[ARTHUR'), (u'ARTH UR)', u'ARTHUR)'), (u'ARTH UR]', u'ARTHUR]'), (u'J EREMY:', u'JEREMY:'), (u'(ELEVA TOR', u'(ELEVATOR'), (u'[ELEVA TOR', u'[ELEVATOR'), (u'ELEVA TOR)', u'ELEVATOR)'), (u'ELEVA TOR]', u'ELEVATOR]'), (u'CONTIN U ES', u'CONTINUES'), (u'WIN D HOWLING', u'WIND HOWLING'), (u'telis me', u'tells me'), (u'Telis me', u'Tells me'), (u'. Ls ', u'. Is '), (u'! Ls ', u'! Is '), (u'? Ls ', u'? Is '), (u'. Lt ', u'. It '), (u'! Lt ', u'! It '), (u'? Lt ', u'? It '), (u'SQMEWH ERE ELSE', u'SOMEWHERE ELSE'), (u' I,m ', u" I'm "), (u' I,ve ', u" I've "), (u' you,re ', u" you're "), (u' you,ll ', u" you'll "), (u' doesn,t ', u" doesn't "), (u' let,s ', u" let's "), (u' he,s ', u" he's "), (u' it,s ', u" it's "), (u' can,t ', u" can't "), (u' Can,t ', u" Can't "), (u' don,t ', u" don't "), (u' Don,t ', u" Don't "), (u"wouldn 'tyou", u"wouldn't you"), (u' lgot it', u' I got it'), (u' you,ve ', u" you've "), (u' I ve ', u" I've "), (u' I ii ', u" I'll "), (u' I m ', u" I'm "), (u' why d ', u" why'd "), (u' couldn t ', u" couldn't "), (u' that s ', u" that's "), (u' i... ', u' I... '), (u"L don't", u"I don't"), (u"L won't", u"I won't"), (u'L should', u'I should'), (u'L had', u'I had'), (u'L happen', u'I happen'), (u"L wasn't", u"I wasnt't"), (u'H i', u'Hi'), (u"L didn't", u"I didn't"), (u'L do', u'I do'), (u'L could', u'I could'), (u'L will', u'I will'), (u'L suggest', u'I suggest'), (u'L reckon', u'I reckon'), (u'L am', u'I am'), (u"L couldn't", u"I couldn't"), (u'L might', u'I might'), (u'L would', u'I would'), (u'L was', u'I was'), (u'L know', u'I know'), (u'L think', u'I think'), (u"L haven't", u"I haven't"), (u'L have ', u'I have'), (u'L want', u'I want'), (u'L can', u'I can'), (u'L love', u'I love'), (u'L like', u'I like')]), 'pattern': u"(?um)(?:(?<=\\s)|(?<=^)|(?<=\\b))(?:\\ \\/be\\ |\\ aren\\ \\'1\\'|\\ aren\\'tyou|\\ doesn\\ \\'1\\'|\\ fr\\/eno\\'|\\ fr\\/eno\\'\\.|\\ haven\\ \\'z\\'\\ |\\ haven\\ \\'z\\'\\.|\\ I\\ ha\\ ve\\ |\\ I\\'\\]I\\ |\\ L\\ am|\\ L\\ can|\\ L\\ don\\'t\\ |\\ L\\ hate\\ |\\ L\\ have\\ |\\ L\\ like\\ |\\ L\\ will|\\ L\\ would|\\ L\\'ll\\ |\\ L\\'ve\\ |\\ m\\ y\\ family|\\ \\'s\\ |\\ shou\\/dn\\ \\'1\\ |\\ won\\ \\'z\\'\\ |\\ won\\ \\'z\\'\\.|\\ wou\\/c\\/n\\ \\'z\\'\\ |\\ wou\\/c\\/n\\ \\'z\\'\\.|\\ wou\\/dn\\ \\'z\\'\\ |\\ wou\\/dn\\ \\'z\\'\\.|\\/\\ did|\\/\\ have\\ |\\/\\ just\\ |\\/\\ loved\\ |\\/\\ need|\\|was11|at\\ Hrst|B\\ ullshiz\\'|big\\ lunk|can\\ \\'t|can\\'\\ t\\ |can\\ \\'t\\ |CHA\\ TTERING|come\\ 0n|Come\\ 0n|couldn\\ \\'t|couldn\\'\\ t\\ |couldn\\ \\'t\\ |Destin\\ y\\'s|didn\\ \\'t|didn\\'\\ t\\ |didn\\ \\'t\\ |Doesn\\ \\'1\\'|doesn\\ \\'1\\'\\ |doesn\\ \\'1\\\u2018\\ |doesn\\ \\'t|doesn\\'1\\'\\ |doesn\\'1\\\u2018\\ |don\\ \\'1\\'\\ |don\\ \\'1\\\u2018\\ |don\\ \\'2\\'\\ |\\ aren\\ \\'2\\'|aren\\ \\'2\\'\\ |don\\ \\'2\\\u2018\\ |don\\ \\'t|Don\\'\\ t\\ |Don\\ \\'t\\ |don\\'1\\'\\ |don\\'1\\\u2018\\ |there\\ \\'5\\ |E\\ very|get\\ 0n|go\\ 0n|Go\\ 0n|H3993\\'\\ birthday|hadn\\ \\'t|he\\ \\'s|He\\ \\'s|He\\ y|he\\)\\/|He\\)\\/|HEA\\ VY|Henry\\ ll|Henry\\ lll|Henry\\ Vlll|Henry\\ Vll|Henry\\ Vl|Hold\\ 0n|I\\ am\\.\\ ls|I\\ d0|I\\ \\'m|I\\ \\'rn\\ |I\\ \\'ve|I0\\ ve\\ her|I0\\ ve\\ you|I02\\'|I\\'m\\ sony|isn\\'\\ t\\ |isn\\ \\'t\\ |K\\)\\/le|L\\ ook|let\\ me\\ 90|Let\\ me\\ 90|let\\'s\\ 90|Let\\'s\\ 90|lfl\\ had|lova\\ you|Lova\\ you|lovo\\ you|Lovo\\ you|ls\\ anyone|ls\\ he|\\-ls\\ he|ls\\ it|\\-ls\\ it|ls\\ she|\\-ls\\ she|ls\\ that|\\-ls\\ that|ls\\ this|\\-ls\\ this|Maze\\]\\ tov|N02\\'\\ |\\ of\\ 0ur\\ |\\ ot\\ mine\\ |PLA\\ YING|REPEA\\ TING\\ |Sa\\ y|she\\ \\'s|She\\ \\'s|shouldn\\ \\'t|sta\\ y|Sta\\ y|SWO\\ rd|taka\\ care|Taka\\ care|the\\ Hrst|toc\\ late|uf\\ me|uf\\ our|wa\\ y|Wal\\-I\\\\\\/Iart|wasn\\ \\'1\\'\\ |Wasn\\ \\'1\\'\\ |wasn\\ \\'1\\\u2018\\ |Wasn\\ \\'1\\\u2018\\ |wasn\\ \\'t|Wasn\\ \\'t|we\\ \\'ve|We\\ \\'ve|wem\\'\\ off|weren\\ \\'t|who\\ \\'s|won\\ \\'t|would\\ ha\\ ve|wouldn\\ \\'t|Wouldn\\ \\'t|y\\(\\)u|you\\ QUYS|you\\'\\ re\\ |you\\ \\'re\\ |you\\ \\'ve|You\\ \\'ve|you\\'\\ ve\\ |you\\ \\'ve\\ |aftera\\ while|Aftera\\ while|THUN\\ DERCLAPS|\\(BUZZI\\ N\\ G\\)|\\[BUZZI\\ N\\ G\\]|\\(G\\ RU\\ NTING|\\[G\\ RU\\ NTING|\\(G\\ ROWLING|\\[G\\ ROWLING|\\ WAI\\ LS\\)|\\ WAI\\ LS\\]|\\(scu\\ RRYING\\)|\\[scu\\ RRYING\\]|\\(GRUNT5\\)|\\[GRUNT5\\]|NARRA\\ TOR\\:|\\(GROAN\\ ING|\\[GROAN\\ ING|GROAN\\ ING\\)|GROAN\\ ING\\]|\\(LAUGH\\ ING|\\[LAUGH\\ ING|LAUGH\\ ING\\)|LAUGH\\ ING\\]|\\(BU\\ BBLING|\\[BU\\ BBLING|BU\\ BBLING\\)|BU\\ BBLING\\]|\\(SH\\ USHING|\\[SH\\ USHING|SH\\ USHING\\)|SH\\ USHING\\]|\\(CH\\ ILDREN|\\[CH\\ ILDREN|CH\\ ILDREN\\)|CH\\ ILDREN\\]|\\(MURMU\\ RING|\\[MURMU\\ RING|MURMU\\ RING\\)|MURMU\\ RING\\]|\\(GU\\ N\\ |\\[GU\\ N\\ |GU\\ N\\)|GU\\ N\\]|CH\\ ILDREN\\:|STU\\ DENTS\\:|\\(WH\\ ISTLE|\\[WH\\ ISTLE|WH\\ ISTLE\\)|WH\\ ISTLE\\]|U\\ LU\\ LATING|AU\\ DIENCE\\:|HA\\ WAIIAN|\\(ARTH\\ UR|\\[ARTH\\ UR|ARTH\\ UR\\)|ARTH\\ UR\\]|J\\ EREMY\\:|\\(ELEVA\\ TOR|\\[ELEVA\\ TOR|ELEVA\\ TOR\\)|ELEVA\\ TOR\\]|CONTIN\\ U\\ ES|WIN\\ D\\ HOWLING|telis\\ me|Telis\\ me|\\.\\ Ls\\ |\\!\\ Ls\\ |\\?\\ Ls\\ |\\.\\ Lt\\ |\\!\\ Lt\\ |\\?\\ Lt\\ |SQMEWH\\ ERE\\ ELSE|\\ I\\,m\\ |\\ I\\,ve\\ |\\ you\\,re\\ |\\ you\\,ll\\ |\\ doesn\\,t\\ |\\ let\\,s\\ |\\ he\\,s\\ |\\ it\\,s\\ |\\ can\\,t\\ |\\ Can\\,t\\ |\\ don\\,t\\ |\\ Don\\,t\\ |wouldn\\ \\'tyou|\\ lgot\\ it|\\ you\\,ve\\ |\\ I\\ ve\\ |\\ I\\ ii\\ |\\ I\\ m\\ |\\ why\\ d\\ |\\ couldn\\ t\\ |\\ that\\ s\\ |\\ i\\.\\.\\.\\ |L\\ don\\'t|L\\ won\\'t|L\\ should|L\\ had|L\\ happen|L\\ wasn\\'t|H\\ i|L\\ didn\\'t|L\\ do|L\\ could|L\\ will|L\\ suggest|L\\ reckon|L\\ am|L\\ couldn\\'t|L\\ might|L\\ would|L\\ was|L\\ know|L\\ think|L\\ haven\\'t|L\\ have\\ |L\\ want|L\\ can|L\\ love|L\\ like)(?:(?=\\s)|(?=$)|(?=\\b))"}, - 'PartialWordsAlways': {'data': OrderedDict([(u'\xa4', u'o'), (u'lVI', u'M'), (u'IVl', u'M'), (u'lVl', u'M'), (u'I\\/I', u'M'), (u'l\\/I', u'M'), (u'I\\/l', u'M'), (u'l\\/l', u'M'), (u'IVIa', u'Ma'), (u'IVIe', u'Me'), (u'IVIi', u'Mi'), (u'IVIo', u'Mo'), (u'IVIu', u'Mu'), (u'IVIy', u'My'), (u' l ', u' I '), (u'l/an', u'lian'), (u'\xb0x\xb0', u'%'), (u'\xc3\xc2s', u"'s"), (u'at/on', u'ation'), (u'lljust', u'll just'), (u"'sjust", u"'s just"), (u'compiete', u'complete'), (u' L ', u' I '), (u'a/ion', u'ation'), (u'\xc2s', u"'s"), (u"'tjust", u"'t just"), (u'\xb6\xb6', u'\u266b'), (u'\xb6', u'\u266a')]), + 'PartialWordsAlways': {'data': OrderedDict([(u'\xa4', u'o'), (u'lVI', u'M'), (u'IVl', u'M'), (u'lVl', u'M'), (u'I\\/I', u'M'), (u'l\\/I', u'M'), (u'I\\/l', u'M'), (u'l\\/l', u'M'), (u'IVIa', u'Ma'), (u'IVIe', u'Me'), (u'IVIi', u'Mi'), (u'IVIo', u'Mo'), (u'IVIu', u'Mu'), (u'IVIy', u'My'), (u' l ', u' I '), (u'l/an', u'lian'), (u'\xb0x\xb0', u'%'), (u'\xc3\xc2s', u"'s"), (u'at/on', u'ation'), (u'lljust', u'll just'), (u"'sjust", u"'s just"), (u'compiete', u'complete'), (u' L ', u' I '), (u'a/ion', u'ation'), (u'\xc2s', u"'s"), (u"'tjust", u"'t just")]), 'pattern': None}, 'WholeLines': {'data': OrderedDict([(u'H ey.', u'Hey.'), (u'He)\u2019-', u'Hey.'), (u'N0.', u'No.'), (u'-N0.', u'-No.'), (u'Noll', u'No!!'), (u'(G ROANS)', u'(GROANS)'), (u'[G ROANS]', u'[GROANS]'), (u'(M EOWS)', u'(MEOWS)'), (u'[M EOWS]', u'[MEOWS]'), (u'Uaughs]', u'[laughs]'), (u'[chitte rs]', u'[chitters]'), (u'Hil\u2018 it!', u'Hit it!'), (u'Hil\u2018 it!', u'Hit it!'), (u'ISIGHS]', u'[SIGHS]')]), 'pattern': None}, @@ -54,7 +54,7 @@ data = {'bos': {'BeginLines': {'data': OrderedDict(), 'pattern': None}, 'PartialLines': {'data': OrderedDict(), 'pattern': None}, - 'PartialWordsAlways': {'data': OrderedDict([(u'\xb6\xb6', u'\u266b'), (u'\xb6', u'\u266a')]), + 'PartialWordsAlways': {'data': OrderedDict(), 'pattern': None}, 'WholeLines': {'data': OrderedDict([(u'Katsokaa pa.', u'Katsokaapa.'), (u'Mik!\r\n""e\u201c9ir\xe4\u0131', u'Mik!\r\n-Hengit\xe4!'), (u'Tu lta!', u'Tulta!'), (u'...0I1...', u'...on...'), (u'Ken g\xe4n nauh oja?', u'Keng\xe4nnauhoja?'), (u'k\xe3mmott\xe3V\xe3ll\xe3 mUiStoll\xe3.', u'kammottavana muistona.'), (u'H\xe4n n\xe4ki naisen menev\xe4n\r\nkellarikerroksen ksen asun to o ns\xe4.', u'H\xe4n n\xe4ki naisen menev\xe4n\r\nkellarikerroksen asuntoonsa.'), (u'Min\xe4 etsin k\xe4siini miehen, joka\r\non aurtanutiestradea ja minua:', u'Min\xe4 etsin k\xe4siini miehen, joka\r\non auttanut Lestradea ja minua:'), (u'Huomaa erityisesti\r\npunaisella merkitty "kitoris.', u'Huomaa erityisesti\r\npunaisella merkitty "klitoris".'), (u'Tulkaa, meill\xe4 on\r\nHa va\u0131ji-bileet suihkussa', u'Tulkaa, meill\xe4 on\r\nHavaiji-bileet suihkussa'), (u'Ta rkoitatko ett\xe4\r\nh\xe4net myrkytettiin?', u'Tarkoitatko ett\xe4\r\nh\xe4net myrkytettiin?'), (u'Odotta kaa soittoani\r\nIev\xe4hdyspaikalla.', u'Odottakaa soittoani\r\nlev\xe4hdyspaikalla.'), (u'Nyt kuuntelet, perska rva.', u'Nyt kuuntelet, perskarva.'), (u'Ta patko h\xe4net sitten?', u'Tapatko h\xe4net sitten?'), (u'Seuraa vissa va/oissa.', u'Seuraavissa valoissa.'), (u'A\u0131o\u0131t rapattaa minut.\r\n- En.', u'Aioit tapattaa minut.\r\n- En.'), (u'Todella vaku uttavaa.', u'Todella vakuuttavaa.'), (u'I-le ovat tuolla alhaalla', u'He ovat tuolla alhaalla'), (u'Nainen kuuluu minulle.\r\n- I-I\xe0ivy siit\xe4, ylikasvuinen hyttynen!', u'Nainen kuuluu minulle.\r\n- H\xe4ivy siit\xe4, ylikasvuinen hyttynen!'), (u'I-Ialuatte k\xe4ytt\xe4\xe4 pyh\xe0\xe0 kive\xe4\r\ndynastian aarteen l\xf6yt\xe4miseksi.', u'Haluatte k\xe4ytt\xe4\xe4 pyh\xe0\xe0 kive\xe4\r\ndynastian aarteen l\xf6yt\xe4miseksi.'), (u'Mit\xe4f?', u'Mit\xe4.. ?'), (u'Kuuluuko Hiru ko-klaanista mit\xe4\xe4n?\r\n- Ninjasoturit ovat edell\xe4mme.', u'Kuuluuko Hiruko-klaanista mit\xe4\xe4n?\r\n- Ninjasoturit ovat edell\xe4mme.'), (u'Anteeks\u0131} painoin kai... -', u'Anteeksi, painoin kai... -'), (u'ja Rea! Ho usew\xedves.', u'ja Real Housewives.'), (u'Et halu n n ut Julkkistansseihinkaan.', u'Et halunnut Julkkistansseihinkaan.'), (u'Laard i k\xe4si?', u'Laardik\xe4si?'), (u'Varo kaa!', u'Varokaa!'), (u'N\xe4ytt\xe4v\xe4t k\xf6 tyt\xf6t v\xe4h \xe4n\r\nh uorahtavi m m i lta?', u'N\xe4ytt\xe4v\xe4tk\xf6 tyt\xf6t v\xe4h\xe4n\r\nhuorahtavimmilta?'), (u'Stif... Ier.', u'Stif... ler.'), (u'J u mantsu kka! M it\xe4?', u'Jumantsukka! Mit\xe4?'), (u'Varasin Ch u m bawam ban,', u'Varasin Chumbawamban,'), (u'J u malavita!', u'Jumalavita!'), (u'S', u'Isi...'), (u'Haluan kertoa jotai n', u'Haluan kertoa jotain'), (u'I-Ialuatte', u'Haluatte')]), 'pattern': None}, @@ -66,7 +66,7 @@ data = {'bos': {'BeginLines': {'data': OrderedDict(), 'pattern': None}, 'PartialLines': {'data': OrderedDict([(u" I'", u" l'"), (u" |'", u" l'")]), 'pattern': u"(?um)(?:(?<=\\s)|(?<=^)|(?<=\\b))(?:\\ I\\'|\\ \\|\\')(?:(?=\\s)|(?=$)|(?=\\b))"}, - 'PartialWordsAlways': {'data': OrderedDict([(u'\xb6\xb6', u'\u266b'), (u'\xb6', u'\u266a')]), + 'PartialWordsAlways': {'data': OrderedDict(), 'pattern': None}, 'WholeLines': {'data': OrderedDict([(u'"D\'ac:c:ord."', u'"D\'accord."'), (u'\u201ci QU\xce gagne, qui perd,', u'ni qui gagne, qui perd,'), (u"L'ac:c:ent est mis \r\n \r\n sur son trajet jusqu'en Suisse.", u"L'accent est mis \r\n \r\n sur son trajet jusqu'en Suisse."), (u"C'est la plus gentille chose \r\n \r\n qu'Hitchc:oc:k m'ait jamais dite.", u"C'est la plus gentille chose \r\n \r\n qu'Hitchcock m'ait jamais dite."), (u"Tout le monde, en revanche, qualifie \r\n \r\n Goldfinger d'aventu re structur\xe9e,", u"Tout le monde, en revanche, qualifie \r\n \r\n Goldfinger d'aventure structur\xe9e,"), (u'et le film Shadow of a man \r\n \r\n a lanc\xe9 sa carri\xe8re au cin\xe9ma.', u'et le film Shadow of a man \r\n \r\n a lanc\xe9 sa carri\xe8re au cin\xe9ma.'), (u'En 1948, Young est pass\xe9 \xe0 la r\xe9alisation \r\n \r\n avec One night with you.', u'En 1948, Young est pass\xe9 \xe0 la r\xe9alisation \r\n \r\n avec One night with you.'), (u'Il a construit tous ces v\xe9hicules \r\n \r\n \xe0 C)c:ala, en Floride.', u'Il a construit tous ces v\xe9hicules \r\n \r\n \xe0 Ocala, en Floride.'), (u'Tokyo Pop et A Taxing Woman? Return.', u"Tokyo Pop et A Taxing Woman's Return."), (u'Peter H u nt.', u'Peter Hunt.'), (u'"C\'est bien mieux dans Peau. \r\n \r\n On peut s\ufb02\xe9clabousser, faire du bruit."', u'"C\'est bien mieux dans l\'eau. \r\n \r\n On peut s\'\xe9clabousser, faire du bruit."')]), 'pattern': None}, @@ -78,7 +78,7 @@ data = {'bos': {'BeginLines': {'data': OrderedDict(), 'pattern': None}, 'PartialLines': {'data': OrderedDict([(u'Ako ej', u'Ako je'), (u'ako ej', u'ako je'), (u'bez svesti', u'bez svijesti'), (u'Bi\u0107u uz', u'Bit \u0107u uz'), (u'bi ja', u'bih ja'), (u'bi la', u'bila'), (u'biti uredu', u'biti u redu'), (u'bi da bude', u'bi biti'), (u'Bi ste', u'Biste'), (u'Bilo ko', u'Bilo tko'), (u'bilo ko', u'bilo tko'), (u'\u0107e da do\u0111e', u'\u0107e do\u0107i'), (u'Da li \u0107e', u'Ho\u0107e li'), (u'Da li \u0107emo', u'Ho\u0107emo li'), (u'Da li \u0107u', u'Ho\u0107u li'), (u'da li \u0107u', u'ho\u0107u li'), (u'dali \u0107u', u'ho\u0107u li'), (u'Da li je', u'Je li'), (u'da li je', u'je li'), (u'dali je', u'je li'), (u'Da li ste', u'Jeste li'), (u'Da li si', u'Jesi li'), (u'dali si', u'jesi li'), (u'da li \u0107e', u'ho\u0107e li'), (u'dali \u0107e', u'ho\u0107e li'), (u'do srede', u'do srijede'), (u'Dobro ve\u010de', u'Dobra ve\u010der'), (u'Dobro ve\u010der', u'Dobra ve\u010der'), (u'Dobar ve\u010der', u'Dobra ve\u010der'), (u'gdje ide\u0161', u'kamo ide\u0161'), (u'Gdje ide\u0161', u'Kamo ide\u0161'), (u'Gdje sada', u'Kamo sada'), (u'gle ko', u'gle tko'), (u'ho\u0107u da budem', u'\u017eelim biti'), (u'Ho\u0107u da budem', u'\u017delim biti'), (u'ho\u0107u da ka\u017eem', u'\u017eelim re\u0107i'), (u'ho\u0107e\u0161 da ka\u017ee\u0161', u'\u017eeli\u0161 re\u0107i'), (u'ho\u0107e da ka\u017ee', u'\u017eeli re\u0107i'), (u'ho\u0107u da \u017eivim', u'\u017eelim \u017eivjeti'), (u'Izvini se', u'Ispri\u010daj se'), (u'izvini se', u'ispri\u010daj se'), (u'Izvinite me', u'Ispri\u010dajte me'), (u'Izvinite nas', u'Ispri\u010dajte nas'), (u'izvinite nas', u'ispri\u010dajte nas'), (u'Izvinjavamo se', u'Ispri\u010davamo se'), (u'ja bi', u'ja bih'), (u'Ja bi', u'Ja bih'), (u'Jel sam ti', u'Jesam li ti'), (u'Jeli se', u'Je li se'), (u'Jeli sve', u'Je li sve'), (u'Jeli ti', u'Je li ti'), (u'ko je', u'tko je'), (u'ko si', u'tko si'), (u'ko ti je', u'tko ti je'), (u'ko te je', u'tko te je'), (u'ko zna', u'tko zna'), (u'mo\u0107i da idemo', u'mo\u0107i i\u0107i'), (u'moglo da bude', u'moglo biti'), (u'moje sau\u010de\u0161\u0107e', u'moja su\u0107ut'), (u'mora da bude', u'mora biti'), (u'moram da budem', u'moram biti'), (u'Moram da idem', u'Moram i\u0107i'), (u'moram da idem', u'moram i\u0107i'), (u'Mora\u0161 da ide\u0161', u'Mora\u0161 i\u0107i'), (u'Moramo da idemo', u'Moramo i\u0107i'), (u'moram da vidim', u'moram vidjeti'), (u'moram da zaboravim', u'moram zaboraviti'), (u'mora\u0161 da zaboravi\u0161', u'mora\u0161 zaboraviti'), (u'mora da zna', u'mora znati'), (u'moram da znam', u'moram znati'), (u'Moram da znam', u'Moram znati'), (u'mora\u0161 da zna\u0161', u'mora\u0161 znati'), (u'mora\u0161 da ide\u0161', u'mora\u0161 i\u0107i'), (u'mo\u017ee da bude', u'mo\u017ee biti'), (u'mo\u017ee\u0161 da bude\u0161', u'mo\u017ee\u0161 biti'), (u'mo\u017ee da di\u0161e', u'mo\u017ee disati'), (u'mo\u017ee\u0161 da dobije\u0161', u'mo\u017ee\u0161 dobiti'), (u'mo\u017eemo da imamo', u'mo\u017eemo imati'), (u'na ve\u010der', u'nave\u010der'), (u'Na ve\u010der', u'Nave\u010der'), (u'ne\u0107e da bude', u'ne\u0107e biti'), (u'ne\u0107e\u0161 da bude\u0161', u'ne\u0107e\u0161 biti'), (u'ne\u0107e\u0161 da po\u017eali\u0161', u'ne\u0107e\u0161 po\u017ealiti'), (u'Neko ko', u'Netko tko'), (u'neko ko', u'netko tko'), (u'neko ne\u0161to', u'netko ne\u0161to'), (u'nedjelju dana', u'tjedan dana'), (u'Ne mogu da verujem', u'Ne mogu vjerovati'), (u'new yor\u0161ki', u'njujor\u0161ki'), (u'nju jor\u0161ki', u'njujor\u0161ki'), (u'od kako', u'otkako'), (u'Pla\u0161im se', u'Bojim se'), (u'pla\u0161im se', u'bojim se'), (u'pravo u o\u010di', u'ravno u o\u010di'), (u'sa njim', u's njim'), (u'sa njima', u's njima'), (u'sa njom', u's njom'), (u'sa tim', u's tim'), (u'sa tom', u's tom'), (u'sa tobom', u's tobom'), (u'sa vama', u's vama'), (u'sam da budem', u'sam biti'), (u'si\u0107i dolje', u'si\u0107i'), (u'Si dobro', u'Jesi li dobro'), (u'Svako ko', u'Svatko tko'), (u'Svo vreme', u'Sve vrijeme'), (u'Svo vrijeme', u'Sve vrijeme'), (u'smeo da', u'smio'), (u'smeli da', u'smjeli'), (u'\u0160to ej', u'\u0160to je'), (u'\u0161to ej', u'\u0161to je'), (u'to j', u'to je'), (u'to ej', u'to je'), (u'To ej', u'To je'), (u'tamo natrag', u'tamo iza'), (u'tamo je natrag', u'tamo je iza'), (u'Tamo je natrag', u'Tamo je iza'), (u'treba da bude', u'treba biti'), (u'u jutro', u'ujutro'), (u'u\u0107i unutra', u'u\u0107i'), (u'vas je lagao', u'vam je lagao'), (u'za uvijek', u'zauvijek'), (u'zato sto', u'zato \u0161to'), (u'zna da bude', u'zna biti'), (u'zna ko', u'zna tko'), (u'znati ko', u'znati tko'), (u'\u017eele da budu', u'\u017eele biti'), (u'\u017eeli da bude', u'\u017eeli biti'), (u'\u017eelio da budem', u'\u017eelio biti'), (u'\u017eelim da budem', u'\u017eelim biti'), (u'\u017delim da budem', u'\u017delim biti'), (u'\u017eeli\u0161 da bude\u0161', u'\u017eeli\u0161 biti'), (u'\u017eelim da idem', u'\u017eelim i\u0107i'), (u'\u017eelim da odem', u'\u017eelim oti\u0107i'), (u'\u017eeli\u0161 da ode\u0161', u'\u017eeli\u0161 oti\u0107i'), (u'\u017eeli\u0161 da u\u0111e\u0161', u'\u017eeli\u0161 u\u0107i'), (u'\u017eelim da umrem', u'\u017eelim umrijeti'), (u'\u017delim da znam', u'\u017delim znati'), (u'\u017eelim da znam', u'\u017eelim znati'), (u'\u017eeli\u0161 da zna\u0161', u'\u017eeli\u0161 znati')]), 'pattern': u'(?um)(?:(?<=\\s)|(?<=^)|(?<=\\b))(?:Ako\\ ej|ako\\ ej|bez\\ svesti|Bi\\\u0107u\\ uz|bi\\ ja|bi\\ la|biti\\ uredu|bi\\ da\\ bude|Bi\\ ste|Bilo\\ ko|bilo\\ ko|\\\u0107e\\ da\\ do\\\u0111e|Da\\ li\\ \\\u0107e|Da\\ li\\ \\\u0107emo|Da\\ li\\ \\\u0107u|da\\ li\\ \\\u0107u|dali\\ \\\u0107u|Da\\ li\\ je|da\\ li\\ je|dali\\ je|Da\\ li\\ ste|Da\\ li\\ si|dali\\ si|da\\ li\\ \\\u0107e|dali\\ \\\u0107e|do\\ srede|Dobro\\ ve\\\u010de|Dobro\\ ve\\\u010der|Dobar\\ ve\\\u010der|gdje\\ ide\\\u0161|Gdje\\ ide\\\u0161|Gdje\\ sada|gle\\ ko|ho\\\u0107u\\ da\\ budem|Ho\\\u0107u\\ da\\ budem|ho\\\u0107u\\ da\\ ka\\\u017eem|ho\\\u0107e\\\u0161\\ da\\ ka\\\u017ee\\\u0161|ho\\\u0107e\\ da\\ ka\\\u017ee|ho\\\u0107u\\ da\\ \\\u017eivim|Izvini\\ se|izvini\\ se|Izvinite\\ me|Izvinite\\ nas|izvinite\\ nas|Izvinjavamo\\ se|ja\\ bi|Ja\\ bi|Jel\\ sam\\ ti|Jeli\\ se|Jeli\\ sve|Jeli\\ ti|ko\\ je|ko\\ si|ko\\ ti\\ je|ko\\ te\\ je|ko\\ zna|mo\\\u0107i\\ da\\ idemo|moglo\\ da\\ bude|moje\\ sau\\\u010de\\\u0161\\\u0107e|mora\\ da\\ bude|moram\\ da\\ budem|Moram\\ da\\ idem|moram\\ da\\ idem|Mora\\\u0161\\ da\\ ide\\\u0161|Moramo\\ da\\ idemo|moram\\ da\\ vidim|moram\\ da\\ zaboravim|mora\\\u0161\\ da\\ zaboravi\\\u0161|mora\\ da\\ zna|moram\\ da\\ znam|Moram\\ da\\ znam|mora\\\u0161\\ da\\ zna\\\u0161|mora\\\u0161\\ da\\ ide\\\u0161|mo\\\u017ee\\ da\\ bude|mo\\\u017ee\\\u0161\\ da\\ bude\\\u0161|mo\\\u017ee\\ da\\ di\\\u0161e|mo\\\u017ee\\\u0161\\ da\\ dobije\\\u0161|mo\\\u017eemo\\ da\\ imamo|na\\ ve\\\u010der|Na\\ ve\\\u010der|ne\\\u0107e\\ da\\ bude|ne\\\u0107e\\\u0161\\ da\\ bude\\\u0161|ne\\\u0107e\\\u0161\\ da\\ po\\\u017eali\\\u0161|Neko\\ ko|neko\\ ko|neko\\ ne\\\u0161to|nedjelju\\ dana|Ne\\ mogu\\ da\\ verujem|new\\ yor\\\u0161ki|nju\\ jor\\\u0161ki|od\\ kako|Pla\\\u0161im\\ se|pla\\\u0161im\\ se|pravo\\ u\\ o\\\u010di|sa\\ njim|sa\\ njima|sa\\ njom|sa\\ tim|sa\\ tom|sa\\ tobom|sa\\ vama|sam\\ da\\ budem|si\\\u0107i\\ dolje|Si\\ dobro|Svako\\ ko|Svo\\ vreme|Svo\\ vrijeme|smeo\\ da|smeli\\ da|\\\u0160to\\ ej|\\\u0161to\\ ej|to\\ j|to\\ ej|To\\ ej|tamo\\ natrag|tamo\\ je\\ natrag|Tamo\\ je\\ natrag|treba\\ da\\ bude|u\\ jutro|u\\\u0107i\\ unutra|vas\\ je\\ lagao|za\\ uvijek|zato\\ sto|zna\\ da\\ bude|zna\\ ko|znati\\ ko|\\\u017eele\\ da\\ budu|\\\u017eeli\\ da\\ bude|\\\u017eelio\\ da\\ budem|\\\u017eelim\\ da\\ budem|\\\u017delim\\ da\\ budem|\\\u017eeli\\\u0161\\ da\\ bude\\\u0161|\\\u017eelim\\ da\\ idem|\\\u017eelim\\ da\\ odem|\\\u017eeli\\\u0161\\ da\\ ode\\\u0161|\\\u017eeli\\\u0161\\ da\\ u\\\u0111e\\\u0161|\\\u017eelim\\ da\\ umrem|\\\u017delim\\ da\\ znam|\\\u017eelim\\ da\\ znam|\\\u017eeli\\\u0161\\ da\\ zna\\\u0161)(?:(?=\\s)|(?=$)|(?=\\b))'}, - 'PartialWordsAlways': {'data': OrderedDict([(u'\xb6\xb6', u'\u266b'), (u'\xb6', u'\u266a')]), + 'PartialWordsAlways': {'data': OrderedDict(), 'pattern': None}, 'WholeLines': {'data': OrderedDict(), 'pattern': None}, @@ -90,7 +90,7 @@ data = {'bos': {'BeginLines': {'data': OrderedDict(), 'pattern': None}, 'PartialLines': {'data': OrderedDict(), 'pattern': None}, - 'PartialWordsAlways': {'data': OrderedDict([(u'\xb6\xb6', u'\u266b'), (u'\xb6', u'\u266a')]), + 'PartialWordsAlways': {'data': OrderedDict(), 'pattern': None}, 'WholeLines': {'data': OrderedDict(), 'pattern': None}, @@ -102,7 +102,7 @@ data = {'bos': {'BeginLines': {'data': OrderedDict(), 'pattern': None}, 'PartialLines': {'data': OrderedDict(), 'pattern': None}, - 'PartialWordsAlways': {'data': OrderedDict([(u'\u010d', u'\xe8'), (u'I\u05bb', u'I'), (u'\u05d0', u'\xe0'), (u'\u05d9', u'\xe9'), (u'\u05d8', u'\xe8'), (u'\u05db', u'\xeb'), (u'\u05dd', u'i'), (u'\u05df', u'\xef'), (u'\u05e3', u'\xf3'), (u'\u05e4', u'o'), (u'\u05e6', u'\xeb'), (u'\xb6\xb6', u'\u266b'), (u'\xb6', u'\u266a')]), + 'PartialWordsAlways': {'data': OrderedDict([(u'\u010d', u'\xe8'), (u'I\u05bb', u'I'), (u'\u05d0', u'\xe0'), (u'\u05d9', u'\xe9'), (u'\u05d8', u'\xe8'), (u'\u05db', u'\xeb'), (u'\u05dd', u'i'), (u'\u05df', u'\xef'), (u'\u05e3', u'\xf3'), (u'\u05e4', u'o'), (u'\u05e6', u'\xeb')]), 'pattern': None}, 'WholeLines': {'data': OrderedDict(), 'pattern': None}, @@ -114,7 +114,7 @@ data = {'bos': {'BeginLines': {'data': OrderedDict(), 'pattern': None}, 'PartialLines': {'data': OrderedDict(), 'pattern': None}, - 'PartialWordsAlways': {'data': OrderedDict([(u'\xb6\xb6', u'\u266b'), (u'\xb6', u'\u266a')]), + 'PartialWordsAlways': {'data': OrderedDict(), 'pattern': None}, 'WholeLines': {'data': OrderedDict(), 'pattern': None}, @@ -126,7 +126,7 @@ data = {'bos': {'BeginLines': {'data': OrderedDict(), 'pattern': None}, 'PartialLines': {'data': OrderedDict(), 'pattern': None}, - 'PartialWordsAlways': {'data': OrderedDict([(u'\xb6\xb6', u'\u266b'), (u'\xb6', u'\u266a')]), + 'PartialWordsAlways': {'data': OrderedDict(), 'pattern': None}, 'WholeLines': {'data': OrderedDict(), 'pattern': None}, @@ -138,7 +138,7 @@ data = {'bos': {'BeginLines': {'data': OrderedDict(), 'pattern': None}, 'PartialLines': {'data': OrderedDict([(u'IN 6-E', u'N 6 E'), (u'in tegrar-se', u'integrar-se'), (u'in teresse', u'interesse'), (u'in testinos', u'intestinos'), (u'indica \xe7\xe3o', u'indica\xe7\xe3o'), (u'inte tino', u'intestino'), (u'intes tinos', u'intestinos'), (u'L da', u'Lda'), (u'mal estar', u'mal-estar'), (u'mastiga \xe7\xe1o', u'mastiga\xe7\xe3o'), (u'm\xe9di cas', u'm\xe9dicas'), (u'mineo rais', u'minerais'), (u'mola res', u'molares'), (u'movi mentos', u'movimentos'), (u'movimen to', u'movimento'), (u'N 5-Estendido', u'N\xba 5 Estendido'), (u'oxig\xe9 nio', u'oxig\xe9nio'), (u'pod mos', u'podemos'), (u'poder-se ia', u'poder-se-ia'), (u'pos sibilidade', u'possibilidade'), (u'possibi lidades', u'possibilidades'), (u'pro duto', u'produto'), (u'procu rar', u'procurar'), (u'Q u e', u'Que'), (u'qualifi cam', u'qualificam'), (u'R egi\xe3o', u'Regi\xe3o'), (u'unsuficien temente', u'insuficientemente')]), 'pattern': u'(?um)(?:(?<=\\s)|(?<=^)|(?<=\\b))(?:IN\\ 6\\-E|in\\ tegrar\\-se|in\\ teresse|in\\ testinos|indica\\ \\\xe7\\\xe3o|inte\\ tino|intes\\ tinos|L\\ da|mal\\ estar|mastiga\\ \\\xe7\\\xe1o|m\\\xe9di\\ cas|mineo\\ rais|mola\\ res|movi\\ mentos|movimen\\ to|N\\ 5\\-Estendido|oxig\\\xe9\\ nio|pod\\ mos|poder\\-se\\ ia|pos\\ sibilidade|possibi\\ lidades|pro\\ duto|procu\\ rar|Q\\ u\\ e|qualifi\\ cam|R\\ egi\\\xe3o|unsuficien\\ temente)(?:(?=\\s)|(?=$)|(?=\\b))'}, - 'PartialWordsAlways': {'data': OrderedDict([(u'\xb6\xb6', u'\u266b'), (u'\xb6', u'\u266a')]), + 'PartialWordsAlways': {'data': OrderedDict(), 'pattern': None}, 'WholeLines': {'data': OrderedDict(), 'pattern': None}, @@ -150,7 +150,7 @@ data = {'bos': {'BeginLines': {'data': OrderedDict(), 'pattern': None}, 'PartialLines': {'data': OrderedDict(), 'pattern': None}, - 'PartialWordsAlways': {'data': OrderedDict([(u'\xb6\xb6', u'\u266b'), (u'\xb6', u'\u266a')]), + 'PartialWordsAlways': {'data': OrderedDict(), 'pattern': None}, 'WholeLines': {'data': OrderedDict(), 'pattern': None}, @@ -162,7 +162,7 @@ data = {'bos': {'BeginLines': {'data': OrderedDict(), 'pattern': u'(?um)(?:\\.\\\xbb\\.)$'}, 'PartialLines': {'data': OrderedDict([(u'de gratis', u'gratis'), (u'si quiera', u'siquiera'), (u'Cada una de los', u'Cada uno de los'), (u'Cada uno de las', u'Cada una de las'), (u'haber que', u'a ver qu\xe9'), (u'haber qu\xe9', u'a ver qu\xe9'), (u'Haber si', u'A ver si'), (u' que hora', u' qu\xe9 hora'), (u'yo que se', u'yo qu\xe9 s\xe9'), (u'Yo que se', u'Yo qu\xe9 s\xe9'), (u' tu!', u' t\xfa!'), (u' si!', u' s\xed!'), (u' mi!', u' m\xed!'), (u' el!', u' \xe9l!'), (u' tu?', u' t\xfa?'), (u' si?', u' s\xed?'), (u' mi?', u' m\xed?'), (u' el?', u' \xe9l?'), (u' aun?', u' a\xfan?'), (u' mas?', u' m\xe1s?'), (u' que?', u' qu\xe9?'), (u' paso?', u' pas\xf3?'), (u' cuando?', u' cu\xe1ndo?'), (u' cuanto?', u' cu\xe1nto?'), (u' cuanta?', u' cu\xe1nta?'), (u' cuantas?', u' cu\xe1ntas?'), (u' cuantos?', u' cu\xe1ntos?'), (u' donde?', u' d\xf3nde?'), (u' quien?', u' qui\xe9n?'), (u' como?', u' c\xf3mo?'), (u' adonde?', u' ad\xf3nde?'), (u' cual?', u' cu\xe1l?'), (u'\xbfSi?', u'\xbfS\xed?'), (u'\xbfesta bien?', u'\xbfest\xe1 bien?'), (u'\xbfPero qu\xe9 haces?', u'\xa1\xbfPero qu\xe9 haces?!'), (u'\xbfpero qu\xe9 haces?', u'\xa1\xbfpero qu\xe9 haces?!'), (u'\xbfEs que no me has escuchado?', u'\xa1\xbfEs que no me has escuchado?!'), (u'\xa1\xbfes que no me has escuchado?!', u'\xa1\xbfes que no me has escuchado?!'), (u'\xbfaun', u'\xbfa\xfan'), (u'\xbftu ', u'\xbft\xfa '), (u'\xbfque ', u'\xbfqu\xe9 '), (u'\xbfsabes que', u'\xbfsabes qu\xe9'), (u'\xbfsabes adonde', u'\xbfsabes ad\xf3nde'), (u'\xbfsabes cual', u'\xbfsabes cu\xe1l'), (u'\xbfsabes quien', u'\xbfsabes qui\xe9n'), (u'\xbfsabes como', u'\xbfsabes c\xf3mo'), (u'\xbfsabes cuan', u'\xbfsabes cu\xe1n'), (u'\xbfsabes cuanto', u'\xbfsabes cu\xe1nto'), (u'\xbfsabes cuanta', u'\xbfsabes cu\xe1nta'), (u'\xbfsabes cuantos', u'\xbfsabes cu\xe1ntos'), (u'\xbfsabes cuantas', u'\xbfsabes cu\xe1ntas'), (u'\xbfsabes cuando', u'\xbfsabes cu\xe1ndo'), (u'\xbfsabes donde', u'\xbfsabes d\xf3nde'), (u'\xbfsabe que', u'\xbfsabe qu\xe9'), (u'\xbfsabe adonde', u'\xbfsabe ad\xf3nde'), (u'\xbfsabe cual', u'\xbfsabe cu\xe1l'), (u'\xbfsabe quien', u'\xbfsabe qui\xe9n'), (u'\xbfsabe como', u'\xbfsabe c\xf3mo'), (u'\xbfsabe cuan', u'\xbfsabe cu\xe1n'), (u'\xbfsabe cuanto', u'\xbfsabe cu\xe1nto'), (u'\xbfsabe cuanta', u'\xbfsabe cu\xe1nta'), (u'\xbfsabe cuantos', u'\xbfsabe cu\xe1ntos'), (u'\xbfsabe cuantas', u'\xbfsabe cu\xe1ntas'), (u'\xbfsabe cuando', u'\xbfsabe cu\xe1ndo'), (u'\xbfsabe donde', u'\xbfsabe d\xf3nde'), (u'\xbfsaben que', u'\xbfsaben qu\xe9'), (u'\xbfsaben adonde', u'\xbfsaben ad\xf3nde'), (u'\xbfsaben cual', u'\xbfsaben cu\xe1l'), (u'\xbfsaben quien', u'\xbfsaben qui\xe9n'), (u'\xbfsaben como', u'\xbfsaben c\xf3mo'), (u'\xbfsaben cuan', u'\xbfsaben cu\xe1n'), (u'\xbfsaben cuanto', u'\xbfsaben cu\xe1nto'), (u'\xbfsaben cuanta', u'\xbfsaben cu\xe1nta'), (u'\xbfsaben cuantos', u'\xbfsaben cu\xe1ntos'), (u'\xbfsaben cuantas', u'\xbfsaben cu\xe1ntas'), (u'\xbfsaben cuando', u'\xbfsaben cu\xe1ndo'), (u'\xbfsaben donde', u'\xbfsaben d\xf3nde'), (u'\xbfde que', u'\xbfde qu\xe9'), (u'\xbfde donde', u'\xbfde d\xf3nde'), (u'\xbfde cual', u'\xbfde cu\xe1l'), (u'\xbfde quien', u'\xbfde qui\xe9n'), (u'\xbfde cuanto', u'\xbfde cu\xe1nto'), (u'\xbfde cuanta', u'\xbfde cu\xe1nta'), (u'\xbfde cuantos', u'\xbfde cu\xe1ntos'), (u'\xbfde cuantas', u'\xbfde cu\xe1ntas'), (u'\xbfde cuando', u'\xbfde cu\xe1ndo'), (u'\xbfsobre que', u'\xbfsobre qu\xe9'), (u'\xbfcomo ', u'\xbfc\xf3mo '), (u'\xbfcual ', u'\xbfcu\xe1l '), (u'\xbfen cual', u'\xbfen cu\xe1l'), (u'\xbfcuando', u'\xbfcu\xe1ndo'), (u'\xbfhasta cual', u'\xbfhasta cu\xe1l'), (u'\xbfhasta quien', u'\xbfhasta qui\xe9n'), (u'\xbfhasta cuanto', u'\xbfhasta cu\xe1nto'), (u'\xbfhasta cuantas', u'\xbfhasta cu\xe1ntas'), (u'\xbfhasta cuantos', u'\xbfhasta cu\xe1ntos'), (u'\xbfhasta cuando', u'\xbfhasta cu\xe1ndo'), (u'\xbfhasta donde', u'\xbfhasta d\xf3nde'), (u'\xbfhasta que', u'\xbfhasta qu\xe9'), (u'\xbfhasta adonde', u'\xbfhasta ad\xf3nde'), (u'\xbfdesde que', u'\xbfdesde qu\xe9'), (u'\xbfdesde cuando', u'\xbfdesde cu\xe1ndo'), (u'\xbfdesde quien', u'\xbfdesde qui\xe9n'), (u'\xbfdesde donde', u'\xbfdesde d\xf3nde'), (u'\xbfcuanto', u'\xbfcu\xe1nto'), (u'\xbfcuantos', u'\xbfcu\xe1ntos'), (u'\xbfdonde', u'\xbfd\xf3nde'), (u'\xbfadonde', u'\xbfad\xf3nde'), (u'\xbfcon que', u'\xbfcon qu\xe9'), (u'\xbfcon cual', u'\xbfcon cu\xe1l'), (u'\xbfcon quien', u'\xbfcon qui\xe9n'), (u'\xbfcon cuantos', u'\xbfcon cu\xe1ntos'), (u'\xbfcon cuantas', u'\xbfcon cu\xe1ntas'), (u'\xbfcon cuanta', u'\xbfcon cu\xe1nta'), (u'\xbfcon cuanto', u'\xbfcon cu\xe1nto'), (u'\xbfpara donde', u'\xbfpara d\xf3nde'), (u'\xbfpara adonde', u'\xbfpara ad\xf3nde'), (u'\xbfpara cuando', u'\xbfpara cu\xe1ndo'), (u'\xbfpara que', u'\xbfpara qu\xe9'), (u'\xbfpara quien', u'\xbfpara qui\xe9n'), (u'\xbfpara cuanto', u'\xbfpara cu\xe1nto'), (u'\xbfpara cuanta', u'\xbfpara cu\xe1nta'), (u'\xbfpara cuantos', u'\xbfpara cu\xe1ntos'), (u'\xbfpara cuantas', u'\xbfpara cu\xe1ntas'), (u'\xbfa donde', u'\xbfa d\xf3nde'), (u'\xbfa que', u'\xbfa qu\xe9'), (u'\xbfa cual', u'\xbfa cu\xe1l'), (u'\xbfa quien', u'\xbfa quien'), (u'\xbfa como', u'\xbfa c\xf3mo'), (u'\xbfa cuanto', u'\xbfa cu\xe1nto'), (u'\xbfa cuanta', u'\xbfa cu\xe1nta'), (u'\xbfa cuantos', u'\xbfa cu\xe1ntos'), (u'\xbfa cuantas', u'\xbfa cu\xe1ntas'), (u'\xbfpor que', u'\xbfpor qu\xe9'), (u'\xbfpor cual', u'\xbfpor cu\xe1l'), (u'\xbfpor quien', u'\xbfpor qui\xe9n'), (u'\xbfpor cuanto', u'\xbfpor cu\xe1nto'), (u'\xbfpor cuanta', u'\xbfpor cu\xe1nta'), (u'\xbfpor cuantos', u'\xbfpor cu\xe1ntos'), (u'\xbfpor cuantas', u'\xbfpor cu\xe1ntas'), (u'\xbfpor donde', u'\xbfpor d\xf3nde'), (u'\xbfporque', u'\xbfpor qu\xe9'), (u'\xbfporqu\xe9', u'\xbfpor qu\xe9'), (u'\xbfy que', u'\xbfy qu\xe9'), (u'\xbfy como', u'\xbfy c\xf3mo'), (u'\xbfy cuando', u'\xbfy cu\xe1ndo'), (u'\xbfy cual', u'\xbfy cu\xe1l'), (u'\xbfy quien', u'\xbfy qui\xe9n'), (u'\xbfy cuanto', u'\xbfy cu\xe1nto'), (u'\xbfy cuanta', u'\xbfy cu\xe1nta'), (u'\xbfy cuantos', u'\xbfy cu\xe1ntos'), (u'\xbfy cuantas', u'\xbfy cu\xe1ntas'), (u'\xbfy donde', u'\xbfy d\xf3nde'), (u'\xbfy adonde', u'\xbfy ad\xf3nde'), (u'\xbfquien ', u'\xbfqui\xe9n '), (u'\xbfesta ', u'\xbfest\xe1 '), (u'\xbfestas ', u'\xbfest\xe1s '), (u'\xbfAun', u'\xbfA\xfan'), (u'\xbfQue ', u'\xbfQu\xe9 '), (u'\xbfSabes que', u'\xbfSabes qu\xe9'), (u'\xbfSabes adonde', u'\xbfSabes ad\xf3nde'), (u'\xbfSabes cual', u'\xbfSabes cu\xe1l'), (u'\xbfSabes quien', u'\xbfSabes qui\xe9n'), (u'\xbfSabes como', u'\xbfSabes c\xf3mo'), (u'\xbfSabes cuan', u'\xbfSabes cu\xe1n'), (u'\xbfSabes cuanto', u'\xbfSabes cu\xe1nto'), (u'\xbfSabes cuanta', u'\xbfSabes cu\xe1nta'), (u'\xbfSabes cuantos', u'\xbfSabes cu\xe1ntos'), (u'\xbfSabes cuantas', u'\xbfSabes cu\xe1ntas'), (u'\xbfSabes cuando', u'\xbfSabes cu\xe1ndo'), (u'\xbfSabes donde', u'\xbfSabes d\xf3nde'), (u'\xbfSabe que', u'\xbfSabe qu\xe9'), (u'\xbfSabe adonde', u'\xbfSabe ad\xf3nde'), (u'\xbfSabe cual', u'\xbfSabe cu\xe1l'), (u'\xbfSabe quien', u'\xbfSabe qui\xe9n'), (u'\xbfSabe como', u'\xbfSabe c\xf3mo'), (u'\xbfSabe cuan', u'\xbfSabe cu\xe1n'), (u'\xbfSabe cuanto', u'\xbfSabe cu\xe1nto'), (u'\xbfSabe cuanta', u'\xbfSabe cu\xe1nta'), (u'\xbfSabe cuantos', u'\xbfSabe cu\xe1ntos'), (u'\xbfSabe cuantas', u'\xbfSabe cu\xe1ntas'), (u'\xbfSabe cuando', u'\xbfSabe cu\xe1ndo'), (u'\xbfSabe donde', u'\xbfSabe d\xf3nde'), (u'\xbfSaben que', u'\xbfSaben qu\xe9'), (u'\xbfSaben adonde', u'\xbfSaben ad\xf3nde'), (u'\xbfSaben cual', u'\xbfSaben cu\xe1l'), (u'\xbfSaben quien', u'\xbfSaben qui\xe9n'), (u'\xbfSaben como', u'\xbfSaben c\xf3mo'), (u'\xbfSaben cuan', u'\xbfSaben cu\xe1n'), (u'\xbfSaben cuanto', u'\xbfSaben cu\xe1nto'), (u'\xbfSaben cuanta', u'\xbfSaben cu\xe1nta'), (u'\xbfSaben cuantos', u'\xbfSaben cu\xe1ntos'), (u'\xbfSaben cuantas', u'\xbfSaben cu\xe1ntas'), (u'\xbfSaben cuando', u'\xbfSaben cu\xe1ndo'), (u'\xbfSaben donde', u'\xbfSaben d\xf3nde'), (u'\xbfDe que', u'\xbfDe qu\xe9'), (u'\xbfDe donde', u'\xbfDe d\xf3nde'), (u'\xbfDe cual', u'\xbfDe cu\xe1l'), (u'\xbfDe quien', u'\xbfDe qui\xe9n'), (u'\xbfDe cuanto', u'\xbfDe cu\xe1nto'), (u'\xbfDe cuanta', u'\xbfDe cu\xe1nta'), (u'\xbfDe cuantos', u'\xbfDe cu\xe1ntos'), (u'\xbfDe cuantas', u'\xbfDe cu\xe1ntas'), (u'\xbfDe cuando', u'\xbfDe cu\xe1ndo'), (u'\xbfDesde que', u'\xbfDesde qu\xe9'), (u'\xbfDesde cuando', u'\xbfDesde cu\xe1ndo'), (u'\xbfDesde quien', u'\xbfDesde qui\xe9n'), (u'\xbfDesde donde', u'\xbfDesde d\xf3nde'), (u'\xbfSobre que', u'\xbfSobre qu\xe9'), (u'\xbfComo ', u'\xbfC\xf3mo '), (u'\xbfCual ', u'\xbfCu\xe1l '), (u'\xbfEn cual', u'\xbfEn cu\xe1l'), (u'\xbfCuando', u'\xbfCu\xe1ndo'), (u'\xbfHasta cual', u'\xbfHasta cu\xe1l'), (u'\xbfHasta quien', u'\xbfHasta qui\xe9n'), (u'\xbfHasta cuanto', u'\xbfHasta cu\xe1nto'), (u'\xbfHasta cuantas', u'\xbfHasta cu\xe1ntas'), (u'\xbfHasta cuantos', u'\xbfHasta cu\xe1ntos'), (u'\xbfHasta cuando', u'\xbfHasta cu\xe1ndo'), (u'\xbfHasta donde', u'\xbfHasta d\xf3nde'), (u'\xbfHasta que', u'\xbfHasta qu\xe9'), (u'\xbfHasta adonde', u'\xbfHasta ad\xf3nde'), (u'\xbfCuanto', u'\xbfCu\xe1nto'), (u'\xbfCuantos', u'\xbfCu\xe1ntos'), (u'\xbfDonde', u'\xbfD\xf3nde'), (u'\xbfAdonde', u'\xbfAd\xf3nde'), (u'\xbfCon que', u'\xbfCon qu\xe9'), (u'\xbfCon cual', u'\xbfCon cu\xe1l'), (u'\xbfCon quien', u'\xbfCon qui\xe9n'), (u'\xbfCon cuantos', u'\xbfCon cu\xe1ntos'), (u'\xbfCon cuanta', u'\xbfCon cu\xe1nta'), (u'\xbfCon cuanto', u'\xbfCon cu\xe1nto'), (u'\xbfPara donde', u'\xbfPara d\xf3nde'), (u'\xbfPara adonde', u'\xbfPara ad\xf3nde'), (u'\xbfPara cuando', u'\xbfPara cu\xe1ndo'), (u'\xbfPara que', u'\xbfPara qu\xe9'), (u'\xbfPara quien', u'\xbfPara qui\xe9n'), (u'\xbfPara cuanto', u'\xbfPara cu\xe1nto'), (u'\xbfPara cuanta', u'\xbfPara cu\xe1nta'), (u'\xbfPara cuantos', u'\xbfPara cu\xe1ntos'), (u'\xbfPara cuantas', u'\xbfPara cu\xe1ntas'), (u'\xbfA donde', u'\xbfA d\xf3nde'), (u'\xbfA que', u'\xbfA qu\xe9'), (u'\xbfA cual', u'\xbfA cu\xe1l'), (u'\xbfA quien', u'\xbfA quien'), (u'\xbfA como', u'\xbfA c\xf3mo'), (u'\xbfA cuanto', u'\xbfA cu\xe1nto'), (u'\xbfA cuanta', u'\xbfA cu\xe1nta'), (u'\xbfA cuantos', u'\xbfA cu\xe1ntos'), (u'\xbfA cuantas', u'\xbfA cu\xe1ntas'), (u'\xbfPor que', u'\xbfPor qu\xe9'), (u'\xbfPor cual', u'\xbfPor cu\xe1l'), (u'\xbfPor quien', u'\xbfPor qui\xe9n'), (u'\xbfPor cuanto', u'\xbfPor cu\xe1nto'), (u'\xbfPor cuanta', u'\xbfPor cu\xe1nta'), (u'\xbfPor cuantos', u'\xbfPor cu\xe1ntos'), (u'\xbfPor cuantas', u'\xbfPor cu\xe1ntas'), (u'\xbfPor donde', u'\xbfPor d\xf3nde'), (u'\xbfPorque', u'\xbfPor qu\xe9'), (u'\xbfPorqu\xe9', u'\xbfPor qu\xe9'), (u'\xbfY que', u'\xbfY qu\xe9'), (u'\xbfY como', u'\xbfY c\xf3mo'), (u'\xbfY cuando', u'\xbfY cu\xe1ndo'), (u'\xbfY cual', u'\xbfY cu\xe1l'), (u'\xbfY quien', u'\xbfY qui\xe9n'), (u'\xbfY cuanto', u'\xbfY cu\xe1nto'), (u'\xbfY cuanta', u'\xbfY cu\xe1nta'), (u'\xbfY cuantos', u'\xbfY cu\xe1ntos'), (u'\xbfY cuantas', u'\xbfY cu\xe1ntas'), (u'\xbfY donde', u'\xbfY d\xf3nde'), (u'\xbfY adonde', u'\xbfY ad\xf3nde'), (u'\xbfQuien ', u'\xbfQui\xe9n '), (u'\xbfEsta ', u'\xbfEst\xe1 '), (u'el porque', u'el porqu\xe9'), (u'su porque', u'su porqu\xe9'), (u'los porqu\xe9s', u'los porqu\xe9s'), (u'aun,', u'a\xfan,'), (u'aun no', u'a\xfan no'), (u' de y ', u' d\xe9 y '), (u' nos de ', u' nos d\xe9 '), (u' tu ya ', u' t\xfa ya '), (u'Tu ya ', u'T\xfa ya '), (u' de, ', u' d\xe9,'), (u' mi, ', u' m\xed,'), (u' tu, ', u' t\xfa,'), (u' el, ', u' \xe9l,'), (u' te, ', u' t\xe9,'), (u' mas, ', u' m\xe1s,'), (u' quien, ', u' qui\xe9n,'), (u' cual,', u' cu\xe1l,'), (u'porque, ', u'porqu\xe9,'), (u'cuanto, ', u'cu\xe1nto,'), (u'cuando, ', u'cu\xe1ndo,'), (u' se,', u' s\xe9,'), (u'se donde', u's\xe9 d\xf3nde'), (u'se cuando', u's\xe9 cu\xe1ndo'), (u'se adonde', u's\xe9 ad\xf3nde'), (u'se como', u's\xe9 c\xf3mo'), (u'se cual', u's\xe9 cu\xe1l'), (u'se quien', u's\xe9 qui\xe9n'), (u'se cuanto', u's\xe9 cu\xe1nto'), (u'se cuanta', u's\xe9 cu\xe1nta'), (u'se cuantos', u's\xe9 cu\xe1ntos'), (u'se cuantas', u's\xe9 cu\xe1ntas'), (u'se cuan', u's\xe9 cu\xe1n'), (u' el si ', u' el s\xed '), (u'si mismo', u's\xed mismo'), (u'si misma', u's\xed misma'), (u' llegal', u' ilegal'), (u' lluminar', u' iluminar'), (u'sllbato', u'silbato'), (u'sllenclo', u'silencio'), (u'clemencla', u'clemencia'), (u'socledad', u'sociedad'), (u'tlene', u'tiene'), (u'tlempo', u'tiempo'), (u'equlvocaba', u'equivocaba'), (u'qulnce', u'quince'), (u'comlen', u'comien'), (u'historl', u'histori'), (u'misterl', u'misteri'), (u'vivencl', u'vivenci')]), 'pattern': u'(?um)(?:(?<=\\s)|(?<=^)|(?<=\\b))(?:de\\ gratis|si\\ quiera|Cada\\ una\\ de\\ los|Cada\\ uno\\ de\\ las|haber\\ que|haber\\ qu\\\xe9|Haber\\ si|\\ que\\ hora|yo\\ que\\ se|Yo\\ que\\ se|\\ tu\\!|\\ si\\!|\\ mi\\!|\\ el\\!|\\ tu\\?|\\ si\\?|\\ mi\\?|\\ el\\?|\\ aun\\?|\\ mas\\?|\\ que\\?|\\ paso\\?|\\ cuando\\?|\\ cuanto\\?|\\ cuanta\\?|\\ cuantas\\?|\\ cuantos\\?|\\ donde\\?|\\ quien\\?|\\ como\\?|\\ adonde\\?|\\ cual\\?|\\\xbfSi\\?|\\\xbfesta\\ bien\\?|\\\xbfPero\\ qu\\\xe9\\ haces\\?|\\\xbfpero\\ qu\\\xe9\\ haces\\?|\\\xbfEs\\ que\\ no\\ me\\ has\\ escuchado\\?|\\\xa1\\\xbfes\\ que\\ no\\ me\\ has\\ escuchado\\?\\!|\\\xbfaun|\\\xbftu\\ |\\\xbfque\\ |\\\xbfsabes\\ que|\\\xbfsabes\\ adonde|\\\xbfsabes\\ cual|\\\xbfsabes\\ quien|\\\xbfsabes\\ como|\\\xbfsabes\\ cuan|\\\xbfsabes\\ cuanto|\\\xbfsabes\\ cuanta|\\\xbfsabes\\ cuantos|\\\xbfsabes\\ cuantas|\\\xbfsabes\\ cuando|\\\xbfsabes\\ donde|\\\xbfsabe\\ que|\\\xbfsabe\\ adonde|\\\xbfsabe\\ cual|\\\xbfsabe\\ quien|\\\xbfsabe\\ como|\\\xbfsabe\\ cuan|\\\xbfsabe\\ cuanto|\\\xbfsabe\\ cuanta|\\\xbfsabe\\ cuantos|\\\xbfsabe\\ cuantas|\\\xbfsabe\\ cuando|\\\xbfsabe\\ donde|\\\xbfsaben\\ que|\\\xbfsaben\\ adonde|\\\xbfsaben\\ cual|\\\xbfsaben\\ quien|\\\xbfsaben\\ como|\\\xbfsaben\\ cuan|\\\xbfsaben\\ cuanto|\\\xbfsaben\\ cuanta|\\\xbfsaben\\ cuantos|\\\xbfsaben\\ cuantas|\\\xbfsaben\\ cuando|\\\xbfsaben\\ donde|\\\xbfde\\ que|\\\xbfde\\ donde|\\\xbfde\\ cual|\\\xbfde\\ quien|\\\xbfde\\ cuanto|\\\xbfde\\ cuanta|\\\xbfde\\ cuantos|\\\xbfde\\ cuantas|\\\xbfde\\ cuando|\\\xbfsobre\\ que|\\\xbfcomo\\ |\\\xbfcual\\ |\\\xbfen\\ cual|\\\xbfcuando|\\\xbfhasta\\ cual|\\\xbfhasta\\ quien|\\\xbfhasta\\ cuanto|\\\xbfhasta\\ cuantas|\\\xbfhasta\\ cuantos|\\\xbfhasta\\ cuando|\\\xbfhasta\\ donde|\\\xbfhasta\\ que|\\\xbfhasta\\ adonde|\\\xbfdesde\\ que|\\\xbfdesde\\ cuando|\\\xbfdesde\\ quien|\\\xbfdesde\\ donde|\\\xbfcuanto|\\\xbfcuantos|\\\xbfdonde|\\\xbfadonde|\\\xbfcon\\ que|\\\xbfcon\\ cual|\\\xbfcon\\ quien|\\\xbfcon\\ cuantos|\\\xbfcon\\ cuantas|\\\xbfcon\\ cuanta|\\\xbfcon\\ cuanto|\\\xbfpara\\ donde|\\\xbfpara\\ adonde|\\\xbfpara\\ cuando|\\\xbfpara\\ que|\\\xbfpara\\ quien|\\\xbfpara\\ cuanto|\\\xbfpara\\ cuanta|\\\xbfpara\\ cuantos|\\\xbfpara\\ cuantas|\\\xbfa\\ donde|\\\xbfa\\ que|\\\xbfa\\ cual|\\\xbfa\\ quien|\\\xbfa\\ como|\\\xbfa\\ cuanto|\\\xbfa\\ cuanta|\\\xbfa\\ cuantos|\\\xbfa\\ cuantas|\\\xbfpor\\ que|\\\xbfpor\\ cual|\\\xbfpor\\ quien|\\\xbfpor\\ cuanto|\\\xbfpor\\ cuanta|\\\xbfpor\\ cuantos|\\\xbfpor\\ cuantas|\\\xbfpor\\ donde|\\\xbfporque|\\\xbfporqu\\\xe9|\\\xbfy\\ que|\\\xbfy\\ como|\\\xbfy\\ cuando|\\\xbfy\\ cual|\\\xbfy\\ quien|\\\xbfy\\ cuanto|\\\xbfy\\ cuanta|\\\xbfy\\ cuantos|\\\xbfy\\ cuantas|\\\xbfy\\ donde|\\\xbfy\\ adonde|\\\xbfquien\\ |\\\xbfesta\\ |\\\xbfestas\\ |\\\xbfAun|\\\xbfQue\\ |\\\xbfSabes\\ que|\\\xbfSabes\\ adonde|\\\xbfSabes\\ cual|\\\xbfSabes\\ quien|\\\xbfSabes\\ como|\\\xbfSabes\\ cuan|\\\xbfSabes\\ cuanto|\\\xbfSabes\\ cuanta|\\\xbfSabes\\ cuantos|\\\xbfSabes\\ cuantas|\\\xbfSabes\\ cuando|\\\xbfSabes\\ donde|\\\xbfSabe\\ que|\\\xbfSabe\\ adonde|\\\xbfSabe\\ cual|\\\xbfSabe\\ quien|\\\xbfSabe\\ como|\\\xbfSabe\\ cuan|\\\xbfSabe\\ cuanto|\\\xbfSabe\\ cuanta|\\\xbfSabe\\ cuantos|\\\xbfSabe\\ cuantas|\\\xbfSabe\\ cuando|\\\xbfSabe\\ donde|\\\xbfSaben\\ que|\\\xbfSaben\\ adonde|\\\xbfSaben\\ cual|\\\xbfSaben\\ quien|\\\xbfSaben\\ como|\\\xbfSaben\\ cuan|\\\xbfSaben\\ cuanto|\\\xbfSaben\\ cuanta|\\\xbfSaben\\ cuantos|\\\xbfSaben\\ cuantas|\\\xbfSaben\\ cuando|\\\xbfSaben\\ donde|\\\xbfDe\\ que|\\\xbfDe\\ donde|\\\xbfDe\\ cual|\\\xbfDe\\ quien|\\\xbfDe\\ cuanto|\\\xbfDe\\ cuanta|\\\xbfDe\\ cuantos|\\\xbfDe\\ cuantas|\\\xbfDe\\ cuando|\\\xbfDesde\\ que|\\\xbfDesde\\ cuando|\\\xbfDesde\\ quien|\\\xbfDesde\\ donde|\\\xbfSobre\\ que|\\\xbfComo\\ |\\\xbfCual\\ |\\\xbfEn\\ cual|\\\xbfCuando|\\\xbfHasta\\ cual|\\\xbfHasta\\ quien|\\\xbfHasta\\ cuanto|\\\xbfHasta\\ cuantas|\\\xbfHasta\\ cuantos|\\\xbfHasta\\ cuando|\\\xbfHasta\\ donde|\\\xbfHasta\\ que|\\\xbfHasta\\ adonde|\\\xbfCuanto|\\\xbfCuantos|\\\xbfDonde|\\\xbfAdonde|\\\xbfCon\\ que|\\\xbfCon\\ cual|\\\xbfCon\\ quien|\\\xbfCon\\ cuantos|\\\xbfCon\\ cuanta|\\\xbfCon\\ cuanto|\\\xbfPara\\ donde|\\\xbfPara\\ adonde|\\\xbfPara\\ cuando|\\\xbfPara\\ que|\\\xbfPara\\ quien|\\\xbfPara\\ cuanto|\\\xbfPara\\ cuanta|\\\xbfPara\\ cuantos|\\\xbfPara\\ cuantas|\\\xbfA\\ donde|\\\xbfA\\ que|\\\xbfA\\ cual|\\\xbfA\\ quien|\\\xbfA\\ como|\\\xbfA\\ cuanto|\\\xbfA\\ cuanta|\\\xbfA\\ cuantos|\\\xbfA\\ cuantas|\\\xbfPor\\ que|\\\xbfPor\\ cual|\\\xbfPor\\ quien|\\\xbfPor\\ cuanto|\\\xbfPor\\ cuanta|\\\xbfPor\\ cuantos|\\\xbfPor\\ cuantas|\\\xbfPor\\ donde|\\\xbfPorque|\\\xbfPorqu\\\xe9|\\\xbfY\\ que|\\\xbfY\\ como|\\\xbfY\\ cuando|\\\xbfY\\ cual|\\\xbfY\\ quien|\\\xbfY\\ cuanto|\\\xbfY\\ cuanta|\\\xbfY\\ cuantos|\\\xbfY\\ cuantas|\\\xbfY\\ donde|\\\xbfY\\ adonde|\\\xbfQuien\\ |\\\xbfEsta\\ |el\\ porque|su\\ porque|los\\ porqu\\\xe9s|aun\\,|aun\\ no|\\ de\\ y\\ |\\ nos\\ de\\ |\\ tu\\ ya\\ |Tu\\ ya\\ |\\ de\\,\\ |\\ mi\\,\\ |\\ tu\\,\\ |\\ el\\,\\ |\\ te\\,\\ |\\ mas\\,\\ |\\ quien\\,\\ |\\ cual\\,|porque\\,\\ |cuanto\\,\\ |cuando\\,\\ |\\ se\\,|se\\ donde|se\\ cuando|se\\ adonde|se\\ como|se\\ cual|se\\ quien|se\\ cuanto|se\\ cuanta|se\\ cuantos|se\\ cuantas|se\\ cuan|\\ el\\ si\\ |si\\ mismo|si\\ misma|\\ llegal|\\ lluminar|sllbato|sllenclo|clemencla|socledad|tlene|tlempo|equlvocaba|qulnce|comlen|historl|misterl|vivencl)(?:(?=\\s)|(?=$)|(?=\\b))'}, - 'PartialWordsAlways': {'data': OrderedDict([(u'\xb6\xb6', u'\u266b'), (u'\xb6', u'\u266a')]), + 'PartialWordsAlways': {'data': OrderedDict(), 'pattern': None}, 'WholeLines': {'data': OrderedDict([(u'No', u'No.')]), 'pattern': None}, @@ -174,7 +174,7 @@ data = {'bos': {'BeginLines': {'data': OrderedDict(), 'pattern': None}, 'PartialLines': {'data': OrderedDict([(u'bi smo', u'bismo'), (u'dali je', u'da li je'), (u'dali si', u'da li si'), (u'Dali si', u'Da li si'), (u'Jel sam ti', u'Jesam li ti'), (u'Jel si', u'Jesi li'), (u"Jel' si", u'Jesi li'), (u"Je I'", u'Jesi li'), (u'Jel si to', u'Jesi li to'), (u"Jel' si to", u'Da li si to'), (u'jel si to', u'da li si to'), (u"jel' si to", u'jesi li to'), (u'Jel si ti', u'Da li si ti'), (u"Jel' si ti", u'Da li si ti'), (u'jel si ti', u'da li si ti'), (u"jel' si ti", u'da li si ti'), (u'jel ste ', u'jeste li '), (u'Jel ste', u'Jeste li'), (u"jel' ste ", u'jeste li '), (u"Jel' ste ", u'Jeste li '), (u'Jel su ', u'Jesu li '), (u'Jel da ', u'Zar ne'), (u'jel da ', u'zar ne'), (u"jel'da ", u'zar ne'), (u'Jeli sve ', u'Je li sve'), (u'Jeli on ', u'Je li on'), (u'Jeli ti ', u'Je li ti'), (u'jeli ti ', u'je li ti'), (u'Jeli to ', u'Je li to'), (u'Nebrini', u'Ne brini'), (u'ne \u0107u', u'ne\u0107u'), (u'od kako', u'otkako'), (u'Si dobro', u'Jesi li dobro'), (u'Svo vreme', u'Sve vrijeme'), (u'Svo vrijeme', u'Sve vrijeme'), (u'Cijelo vrijeme', u'Sve vrijeme')]), 'pattern': u"(?um)(?:(?<=\\s)|(?<=^)|(?<=\\b))(?:bi\\ smo|dali\\ je|dali\\ si|Dali\\ si|Jel\\ sam\\ ti|Jel\\ si|Jel\\'\\ si|Je\\ I\\'|Jel\\ si\\ to|Jel\\'\\ si\\ to|jel\\ si\\ to|jel\\'\\ si\\ to|Jel\\ si\\ ti|Jel\\'\\ si\\ ti|jel\\ si\\ ti|jel\\'\\ si\\ ti|jel\\ ste\\ |Jel\\ ste|jel\\'\\ ste\\ |Jel\\'\\ ste\\ |Jel\\ su\\ |Jel\\ da\\ |jel\\ da\\ |jel\\'da\\ |Jeli\\ sve\\ |Jeli\\ on\\ |Jeli\\ ti\\ |jeli\\ ti\\ |Jeli\\ to\\ |Nebrini|ne\\ \\\u0107u|od\\ kako|Si\\ dobro|Svo\\ vreme|Svo\\ vrijeme|Cijelo\\ vrijeme)(?:(?=\\s)|(?=$)|(?=\\b))"}, - 'PartialWordsAlways': {'data': OrderedDict([(u'\xb6\xb6', u'\u266b'), (u'\xb6', u'\u266a')]), + 'PartialWordsAlways': {'data': OrderedDict(), 'pattern': None}, 'WholeLines': {'data': OrderedDict(), 'pattern': None}, @@ -186,7 +186,7 @@ data = {'bos': {'BeginLines': {'data': OrderedDict(), 'pattern': None}, 'PartialLines': {'data': OrderedDict(), 'pattern': None}, - 'PartialWordsAlways': {'data': OrderedDict([(u'\u0139', u'\xc5'), (u'\u013a', u'\xe5'), (u'\xb6\xb6', u'\u266b'), (u'\xb6', u'\u266a')]), + 'PartialWordsAlways': {'data': OrderedDict([(u'\u0139', u'\xc5'), (u'\u013a', u'\xe5')]), 'pattern': None}, 'WholeLines': {'data': OrderedDict(), 'pattern': None}, diff --git a/libs/subzero/modification/dictionaries/make_data.py b/libs/subzero/modification/dictionaries/make_data.py index 1ac99b6e6..f6e6ac048 100644 --- a/libs/subzero/modification/dictionaries/make_data.py +++ b/libs/subzero/modification/dictionaries/make_data.py @@ -117,10 +117,6 @@ SZ_FIX_DATA = { } SZ_FIX_DATA_GLOBAL = { - "PartialWordsAlways": { - u"¶¶": u"♫", - u"¶": u"♪" - } } if __name__ == "__main__": diff --git a/libs/subzero/modification/mods/common.py b/libs/subzero/modification/mods/common.py index 896a1f2f8..b1d83c703 100644 --- a/libs/subzero/modification/mods/common.py +++ b/libs/subzero/modification/mods/common.py @@ -33,11 +33,10 @@ class CommonFixes(SubtitleTextModification): # line = : text NReProcessor(re.compile(r'(?u)(^\W*:\s*(?=\w+))'), "", name="CM_empty_colon_start"), - # multi space - NReProcessor(re.compile(r'(?u)(\s{2,})'), " ", name="CM_multi_space"), - # fix music symbols - NReProcessor(re.compile(ur'(?u)(?:^[-\s]*[*#¶]+(?![^\s\-*#¶]))|(?:[*#¶]+\s*$)'), u"♪", name="CM_music_symbols"), + NReProcessor(re.compile(ur'(?u)(^[-\s]*[*#¶]+\s*)|(\s*[*#¶]+\s*$)'), + lambda x: u"♪ " if x.group(1) else u" ♪", + name="CM_music_symbols"), # '' = " NReProcessor(re.compile(ur'(?u)([\'’ʼ❜‘‛][\'’ʼ❜‘‛]+)'), u'"', name="CM_double_apostrophe"), diff --git a/libs/subzero/video.py b/libs/subzero/video.py index cb5b8d172..fc6dc99de 100644 --- a/libs/subzero/video.py +++ b/libs/subzero/video.py @@ -17,17 +17,23 @@ def has_external_subtitle(part_id, stored_subs, language): def set_existing_languages(video, video_info, external_subtitles=False, embedded_subtitles=False, known_embedded=None, - stored_subs=None, languages=None, only_one=False): + stored_subs=None, languages=None, only_one=False, known_metadata_subs=None): logger.debug(u"Determining existing subtitles for %s", video.name) + external_langs_found = set() # scan for external subtitles - external_langs_found = set(search_external_subtitles(video.name, languages=languages, - only_one=only_one).values()) + if known_metadata_subs: + # existing metadata subtitles + external_langs_found = known_metadata_subs + + external_langs_found.update(set(search_external_subtitles(video.name, languages=languages, + only_one=only_one).values())) # found external subtitles should be considered? if external_subtitles: # |= is update, thanks plex video.subtitle_languages.update(external_langs_found) + video.external_subtitle_languages.update(external_langs_found) else: # did we already download subtitles for this?