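subzero make_data used explicit u""/ur"" unicode string literals (the ur"" prefix is a syntax error in Python 3). Replace them with `from __future__ import unicode_literals` and plain literals so the script works in both Python 2 and Python 3.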

pull/818/head
Michiel van Baak 5 years ago
parent ecefa41c93
commit eda8880357

@ -1,6 +1,7 @@
# coding=utf-8 # coding=utf-8
from __future__ import absolute_import from __future__ import absolute_import
from __future__ import unicode_literals
import re import re
import os import os
import pprint import pprint
@ -25,95 +26,95 @@ for lang, grps in data.iteritems():
SZ_FIX_DATA = { SZ_FIX_DATA = {
"eng": { "eng": {
"PartialWordsAlways": { "PartialWordsAlways": {
u"°x°": u"%", "°x°": "%",
u"compiete": u"complete", "compiete": "complete",
u"Âs": u"'s", "Âs": "'s",
u"ÃÂs": u"'s", "ÃÂs": "'s",
u"a/ion": u"ation", "a/ion": "ation",
u"at/on": u"ation", "at/on": "ation",
u"l/an": u"lian", "l/an": "lian",
u"lljust": u"ll just", "lljust": "ll just",
u" L ": u" I ", " L ": " I ",
u" l ": u" I ", " l ": " I ",
u"'sjust": u"'s just", "'sjust": "'s just",
u"'tjust": u"'t just", "'tjust": "'t just",
u"\";": u"'s", "\";": "'s",
}, },
"WholeWords": { "WholeWords": {
u"I'11": u"I'll", "I'11": "I'll",
u"III'll": u"I'll", "III'll": "I'll",
u"Tun": u"Run", "Tun": "Run",
u"pan'": u"part", "pan'": "part",
u"al'": u"at", "al'": "at",
u"a re": u"are", "a re": "are",
u"wail'": u"wait", "wail'": "wait",
u"he)'": u"hey", "he)'": "hey",
u"he)\"": u"hey", "he)\"": "hey",
u"He)'": u"Hey", "He)'": "Hey",
u"He)\"": u"Hey", "He)\"": "Hey",
u"He)": u"Hey", "He)": "Hey",
u"Yea h": u"Yeah", "Yea h": "Yeah",
u"yea h": u"yeah", "yea h": "yeah",
u"h is": u"his", "h is": "his",
u" 're ": u"'re ", " 're ": "'re ",
u"LAst": u"Last", "LAst": "Last",
u"forthis": u"for this", "forthis": "for this",
u"Ls": u"Is", "Ls": "Is",
u"Iam": u"I am", "Iam": "I am",
u"Ican": u"I can", "Ican": "I can",
}, },
"PartialLines": { "PartialLines": {
u"L know": u"I know", "L know": "I know",
u"L should": u"I should", "L should": "I should",
u"L do": u"I do", "L do": "I do",
u"L would": u"I would", "L would": "I would",
u"L could": u"I could", "L could": "I could",
u"L can": u"I can", "L can": "I can",
u"L happen": u"I happen", "L happen": "I happen",
u"L might": u"I might", "L might": "I might",
u"L have ": u"I have", "L have ": "I have",
u"L had": u"I had", "L had": "I had",
u"L want": u"I want", "L want": "I want",
u"L was": u"I was", "L was": "I was",
u"L am": u"I am", "L am": "I am",
u"L will": u"I will", "L will": "I will",
u"L suggest": u"I suggest", "L suggest": "I suggest",
u"L think": u"I think", "L think": "I think",
u"L reckon": u"I reckon", "L reckon": "I reckon",
u"L like": u"I like", "L like": "I like",
u"L love": u"I love", "L love": "I love",
u"L don't": u"I don't", "L don't": "I don't",
u"L didn't": u"I didn't", "L didn't": "I didn't",
u"L wasn't": u"I wasnt't", "L wasn't": "I wasnt't",
u"L haven't": u"I haven't", "L haven't": "I haven't",
u"L couldn't": u"I couldn't", "L couldn't": "I couldn't",
u"L won't": u"I won't", "L won't": "I won't",
u"H i": u"Hi", "H i": "Hi",
}, },
"BeginLines": { "BeginLines": {
u"l ": u"I ", "l ": "I ",
u"L ": u"I ", "L ": "I ",
} }
}, },
"nld": { "nld": {
"PartialWordsAlways": { "PartialWordsAlways": {
u"ט": u"è", "ט": "è",
u"י": u"é", "י": "é",
u"כ": u"ë", "כ": "ë",
u"צ": u"ë", "צ": "ë",
u"ן": u"ï", "ן": "ï",
u"ף": u"ó", "ף": "ó",
u"א": u"à", "א": "à",
u"": u"I", "": "I",
u"č": u"è", "č": "è",
u"פ": u"o", "פ": "o",
u"ם": u"i", "ם": "i",
}, },
}, },
"swe": { "swe": {
"PartialWordsAlways": { "PartialWordsAlways": {
u"ĺ": u"å", "ĺ": "å",
u"Ĺ": u"Å", "Ĺ": "Å",
} }
} }
} }
@ -136,16 +137,16 @@ if __name__ == "__main__":
fetch_data = ( fetch_data = (
# group, item_name, pattern # group, item_name, pattern
("WholeLines", "Line", None), ("WholeLines", "Line", None),
("WholeWords", "Word", lambda d: (ur"(?um)(\b|^)(?:" + u"|".join([re.escape(k) for k in d.keys()]) ("WholeWords", "Word", lambda d: (r"(?um)(\b|^)(?:" + "|".join([re.escape(k) for k in list(d.keys())])
+ ur')(\b|$)') if d else None), + r')(\b|$)') if d else None),
("PartialWordsAlways", "WordPart", None), ("PartialWordsAlways", "WordPart", None),
("PartialLines", "LinePart", lambda d: (ur"(?um)(?:(?<=\s)|(?<=^)|(?<=\b))(?:" + ("PartialLines", "LinePart", lambda d: (r"(?um)(?:(?<=\s)|(?<=^)|(?<=\b))(?:" +
u"|".join([re.escape(k) for k in d.keys()]) + "|".join([re.escape(k) for k in list(d.keys())]) +
ur")(?:(?=\s)|(?=$)|(?=\b))") if d else None), r")(?:(?=\s)|(?=$)|(?=\b))") if d else None),
("BeginLines", "Beginning", lambda d: (ur"(?um)^(?:"+u"|".join([re.escape(k) for k in d.keys()]) ("BeginLines", "Beginning", lambda d: (r"(?um)^(?:"+"|".join([re.escape(k) for k in list(d.keys())])
+ ur')') if d else None), + r')') if d else None),
("EndLines", "Ending", lambda d: (ur"(?um)(?:" + u"|".join([re.escape(k) for k in d.keys()]) + ("EndLines", "Ending", lambda d: (r"(?um)(?:" + "|".join([re.escape(k) for k in list(d.keys())]) +
ur")$") if d else None,), r")$") if d else None,),
) )
data[lang] = dict((grp, {"data": OrderedDict(), "pattern": None}) for grp, item_name, pattern in fetch_data) data[lang] = dict((grp, {"data": OrderedDict(), "pattern": None}) for grp, item_name, pattern in fetch_data)

Loading…
Cancel
Save