Port make_data.py to unicode_literals: drop the u"..."/ur"..." prefixes and
wrap d.keys() in list(). The added lines also correct two PartialLines
replacements ("L wasn't" -> "I wasn't", and "L have " now keeps its trailing
space); the removed lines are left exactly as recorded.
NOTE: leading whitespace was reconstructed from a flattened copy of this patch,
and the post-image blob id on the index line predates the two corrections.

diff --git a/libs/subzero/modification/dictionaries/make_data.py b/libs/subzero/modification/dictionaries/make_data.py
index 726208d3d..4611b3c9c 100644
--- a/libs/subzero/modification/dictionaries/make_data.py
+++ b/libs/subzero/modification/dictionaries/make_data.py
@@ -1,6 +1,7 @@
 # coding=utf-8
-
 from __future__ import absolute_import
+from __future__ import unicode_literals
+
 import re
 import os
 import pprint
@@ -25,95 +26,95 @@ for lang, grps in data.iteritems():
 SZ_FIX_DATA = {
     "eng": {
         "PartialWordsAlways": {
-            u"°x°": u"%",
-            u"compiete": u"complete",
-            u"Âs": u"'s",
-            u"ÃÂs": u"'s",
-            u"a/ion": u"ation",
-            u"at/on": u"ation",
-            u"l/an": u"lian",
-            u"lljust": u"ll just",
-            u" L ": u" I ",
-            u" l ": u" I ",
-            u"'sjust": u"'s just",
-            u"'tjust": u"'t just",
-            u"\";": u"'s",
+            "°x°": "%",
+            "compiete": "complete",
+            "Âs": "'s",
+            "ÃÂs": "'s",
+            "a/ion": "ation",
+            "at/on": "ation",
+            "l/an": "lian",
+            "lljust": "ll just",
+            " L ": " I ",
+            " l ": " I ",
+            "'sjust": "'s just",
+            "'tjust": "'t just",
+            "\";": "'s",
         },
         "WholeWords": {
-            u"I'11": u"I'll",
-            u"III'll": u"I'll",
-            u"Tun": u"Run",
-            u"pan'": u"part",
-            u"al'": u"at",
-            u"a re": u"are",
-            u"wail'": u"wait",
-            u"he)'": u"hey",
-            u"he)\"": u"hey",
-            u"He)'": u"Hey",
-            u"He)\"": u"Hey",
-            u"He)’": u"Hey",
-            u"Yea h": u"Yeah",
-            u"yea h": u"yeah",
-            u"h is": u"his",
-            u" 're ": u"'re ",
-            u"LAst": u"Last",
-            u"forthis": u"for this",
-            u"Ls": u"Is",
-            u"Iam": u"I am",
-            u"Ican": u"I can",
+            "I'11": "I'll",
+            "III'll": "I'll",
+            "Tun": "Run",
+            "pan'": "part",
+            "al'": "at",
+            "a re": "are",
+            "wail'": "wait",
+            "he)'": "hey",
+            "he)\"": "hey",
+            "He)'": "Hey",
+            "He)\"": "Hey",
+            "He)’": "Hey",
+            "Yea h": "Yeah",
+            "yea h": "yeah",
+            "h is": "his",
+            " 're ": "'re ",
+            "LAst": "Last",
+            "forthis": "for this",
+            "Ls": "Is",
+            "Iam": "I am",
+            "Ican": "I can",
         },
         "PartialLines": {
-            u"L know": u"I know",
-            u"L should": u"I should",
-            u"L do": u"I do",
-            u"L would": u"I would",
-            u"L could": u"I could",
-            u"L can": u"I can",
-            u"L happen": u"I happen",
-            u"L might": u"I might",
-            u"L have ": u"I have",
-            u"L had": u"I had",
-            u"L want": u"I want",
-            u"L was": u"I was",
-            u"L am": u"I am",
-            u"L will": u"I will",
-            u"L suggest": u"I suggest",
-            u"L think": u"I think",
-            u"L reckon": u"I reckon",
-            u"L like": u"I like",
-            u"L love": u"I love",
-            u"L don't": u"I don't",
-            u"L didn't": u"I didn't",
-            u"L wasn't": u"I wasnt't",
-            u"L haven't": u"I haven't",
-            u"L couldn't": u"I couldn't",
-            u"L won't": u"I won't",
-            u"H i": u"Hi",
+            "L know": "I know",
+            "L should": "I should",
+            "L do": "I do",
+            "L would": "I would",
+            "L could": "I could",
+            "L can": "I can",
+            "L happen": "I happen",
+            "L might": "I might",
+            "L have ": "I have ",
+            "L had": "I had",
+            "L want": "I want",
+            "L was": "I was",
+            "L am": "I am",
+            "L will": "I will",
+            "L suggest": "I suggest",
+            "L think": "I think",
+            "L reckon": "I reckon",
+            "L like": "I like",
+            "L love": "I love",
+            "L don't": "I don't",
+            "L didn't": "I didn't",
+            "L wasn't": "I wasn't",
+            "L haven't": "I haven't",
+            "L couldn't": "I couldn't",
+            "L won't": "I won't",
+            "H i": "Hi",
         },
         "BeginLines": {
-            u"l ": u"I ",
-            u"L ": u"I ",
+            "l ": "I ",
+            "L ": "I ",
         }
     },
     "nld": {
         "PartialWordsAlways": {
-            u"ט": u"è",
-            u"י": u"é",
-            u"כ": u"ë",
-            u"צ": u"ë",
-            u"ן": u"ï",
-            u"ף": u"ó",
-            u"א": u"à",
-            u"Iֻ": u"I",
-            u"č": u"è",
-            u"פ": u"o",
-            u"ם": u"i",
+            "ט": "è",
+            "י": "é",
+            "כ": "ë",
+            "צ": "ë",
+            "ן": "ï",
+            "ף": "ó",
+            "א": "à",
+            "Iֻ": "I",
+            "č": "è",
+            "פ": "o",
+            "ם": "i",
         },
     },
     "swe": {
         "PartialWordsAlways": {
-            u"ĺ": u"å",
-            u"Ĺ": u"Å",
+            "ĺ": "å",
+            "Ĺ": "Å",
         }
     }
 }
@@ -136,16 +137,16 @@ if __name__ == "__main__":
     fetch_data = (
         # group, item_name, pattern
         ("WholeLines", "Line", None),
-        ("WholeWords", "Word", lambda d: (ur"(?um)(\b|^)(?:" + u"|".join([re.escape(k) for k in d.keys()])
-                                          + ur')(\b|$)') if d else None),
+        ("WholeWords", "Word", lambda d: (r"(?um)(\b|^)(?:" + "|".join([re.escape(k) for k in list(d.keys())])
+                                          + r')(\b|$)') if d else None),
         ("PartialWordsAlways", "WordPart", None),
-        ("PartialLines", "LinePart", lambda d: (ur"(?um)(?:(?<=\s)|(?<=^)|(?<=\b))(?:" +
-                                                u"|".join([re.escape(k) for k in d.keys()]) +
-                                                ur")(?:(?=\s)|(?=$)|(?=\b))") if d else None),
-        ("BeginLines", "Beginning", lambda d: (ur"(?um)^(?:"+u"|".join([re.escape(k) for k in d.keys()])
-                                               + ur')') if d else None),
-        ("EndLines", "Ending", lambda d: (ur"(?um)(?:" + u"|".join([re.escape(k) for k in d.keys()]) +
-                                          ur")$") if d else None,),
+        ("PartialLines", "LinePart", lambda d: (r"(?um)(?:(?<=\s)|(?<=^)|(?<=\b))(?:" +
+                                                "|".join([re.escape(k) for k in list(d.keys())]) +
+                                                r")(?:(?=\s)|(?=$)|(?=\b))") if d else None),
+        ("BeginLines", "Beginning", lambda d: (r"(?um)^(?:"+"|".join([re.escape(k) for k in list(d.keys())])
+                                               + r')') if d else None),
+        ("EndLines", "Ending", lambda d: (r"(?um)(?:" + "|".join([re.escape(k) for k in list(d.keys())]) +
+                                          r")$") if d else None,),
     )
 
     data[lang] = dict((grp, {"data": OrderedDict(), "pattern": None}) for grp, item_name, pattern in fetch_data)