bazarr/libs/subzero/modification/dictionaries/make_data.py

# coding=utf-8

import re
import os
import pprint
from collections import OrderedDict

from bs4 import BeautifulSoup

TEMPLATE = """\
import re
from collections import OrderedDict
data = """

TEMPLATE_END = """\

for lang, grps in data.iteritems():
    for grp in grps.iterkeys():
        if data[lang][grp]["pattern"]:
            data[lang][grp]["pattern"] = re.compile(data[lang][grp]["pattern"])
"""


SZ_FIX_DATA = {
    "eng": {
        "PartialWordsAlways": {
            u"°x°": u"%",
            u"compiete": u"complete",
            u"Âs": u"'s",
            u"ÃÂs": u"'s",
            u"a/ion": u"ation",
            u"at/on": u"ation",
            u"l/an": u"lian",
            u"lljust": u"ll just",
            u" L ": u" I ",
            u" l ": u" I ",
            u"'sjust": u"'s just",
            u"'tjust": u"'t just",
        },
        "WholeWords": {
            u"I'11": u"I'll",
            u"III'll": u"I'll",
            u"Tun": u"Run",
            u"pan'": u"part",
            u"al'": u"at",
            u"a re": u"are",
            u"wail'": u"wait",
            u"he)'": u"hey",
            u"he)\"": u"hey",
            u"He)'": u"Hey",
            u"He)\"": u"Hey",
            u"He)’": u"Hey",
            u"Yea h": u"Yeah",
            u"yea h": u"yeah",
            u"h is": u"his",
            u" 're ": u"'re ",
            u"LAst": u"Last",
            u"forthis": u"for this",
            u"Ls": u"Is",
            u"Iam": u"I am",
            u"Ican": u"I can",
        },
        "PartialLines": {
            u"L know": u"I know",
            u"L should": u"I should",
            u"L do": u"I do",
            u"L would": u"I would",
            u"L could": u"I could",
            u"L can": u"I can",
            u"L happen": u"I happen",
            u"L might": u"I might",
            u"L have ": u"I have",
            u"L had": u"I had",
            u"L want": u"I want",
            u"L was": u"I was",
            u"L am": u"I am",
            u"L will": u"I will",
            u"L suggest": u"I suggest",
            u"L think": u"I think",
            u"L reckon": u"I reckon",
            u"L like": u"I like",
            u"L love": u"I love",
            u"L don't": u"I don't",
            u"L didn't": u"I didn't",
            u"L wasn't": u"I wasnt't",
            u"L haven't": u"I haven't",
            u"L couldn't": u"I couldn't",
            u"L won't": u"I won't",
            u"H i": u"Hi",
        },
        "BeginLines": {
            u"l ": u"I ",
            u"L ": u"I ",
        }
    },
    "nld": {
        "PartialWordsAlways": {
            u"ט": u"è",
            u"י": u"é",
            u"כ": u"ë",
            u"צ": u"ë",
            u"ן": u"ï",
            u"ף": u"ó",
            u"א": u"à",
            u"Iֻ": u"I",
            u"č": u"è",
            u"פ": u"o",
            u"ם": u"i",
        },
    },
    "swe": {
        "PartialWordsAlways": {
            u"ĺ": u"å",
            u"Ĺ": u"Å",
        }
    }
}

SZ_FIX_DATA_GLOBAL = {
}

if __name__ == "__main__":
    cur_dir = os.path.dirname(os.path.realpath(__file__))
    xml_dir = os.path.join(cur_dir, "xml")
    file_list = os.listdir(xml_dir)

    data = {}

    for fn in file_list:
        if fn.endswith("_OCRFixReplaceList.xml"):
            lang = fn.split("_")[0]
            soup = BeautifulSoup(open(os.path.join(xml_dir, fn)), "xml")

            fetch_data = (
                    # group, item_name, pattern
                    ("WholeLines", "Line", None),
                    ("WholeWords", "Word", lambda d: (ur"(?um)(\b|^)(?:" + u"|".join([re.escape(k) for k in d.keys()])
                                                      + ur')(\b|$)') if d else None),
                    ("PartialWordsAlways", "WordPart", None),
                    ("PartialLines", "LinePart", lambda d: (ur"(?um)(?:(?<=\s)|(?<=^)|(?<=\b))(?:" +
                                                            u"|".join([re.escape(k) for k in d.keys()]) +
                                                            ur")(?:(?=\s)|(?=$)|(?=\b))") if d else None),
                    ("BeginLines", "Beginning", lambda d: (ur"(?um)^(?:"+u"|".join([re.escape(k) for k in d.keys()])
                                                           + ur')') if d else None),
                    ("EndLines", "Ending", lambda d: (ur"(?um)(?:" + u"|".join([re.escape(k) for k in d.keys()]) +
                                                      ur")$") if d else None,),
            )

            data[lang] = dict((grp, {"data": OrderedDict(), "pattern": None}) for grp, item_name, pattern in fetch_data)

            for grp, item_name, pattern in fetch_data:
                for grp_data in soup.find_all(grp):
                    for line in grp_data.find_all(item_name):
                        data[lang][grp]["data"][line["from"]] = line["to"]

                # add our own dictionaries
                if lang in SZ_FIX_DATA and grp in SZ_FIX_DATA[lang]:
                    data[lang][grp]["data"].update(SZ_FIX_DATA[lang][grp])

                if grp in SZ_FIX_DATA_GLOBAL:
                    data[lang][grp]["data"].update(SZ_FIX_DATA_GLOBAL[grp])

                if pattern:
                    data[lang][grp]["pattern"] = pattern(data[lang][grp]["data"])

    f = open(os.path.join(cur_dir, "data.py"), "w+")
    f.write(TEMPLATE)
    f.write(pprint.pformat(data, width=1))
    f.write(TEMPLATE_END)
    f.close()