diff --git a/libs/inflect.py b/libs/inflect.py new file mode 100644 index 000000000..be61aa341 --- /dev/null +++ b/libs/inflect.py @@ -0,0 +1,3801 @@ +""" + inflect.py: correctly generate plurals, ordinals, indefinite articles; + convert numbers to words + Copyright (C) 2010 Paul Dyson + + Based upon the Perl module Lingua::EN::Inflect by Damian Conway. + + The original Perl module Lingua::EN::Inflect by Damian Conway is + available from http://search.cpan.org/~dconway/ + + This module can be downloaded at http://pypi.org/project/inflect + +methods: + classical inflect + plural plural_noun plural_verb plural_adj singular_noun no num a an + compare compare_nouns compare_verbs compare_adjs + present_participle + ordinal + number_to_words + join + defnoun defverb defadj defa defan + + INFLECTIONS: classical inflect + plural plural_noun plural_verb plural_adj singular_noun compare + no num a an present_participle + + PLURALS: classical inflect + plural plural_noun plural_verb plural_adj singular_noun no num + compare compare_nouns compare_verbs compare_adjs + + COMPARISONS: classical + compare compare_nouns compare_verbs compare_adjs + + ARTICLES: classical inflect num a an + + NUMERICAL: ordinal number_to_words + + USER_DEFINED: defnoun defverb defadj defa defan + +Exceptions: + UnknownClassicalModeError + BadNumValueError + BadChunkingOptionError + NumOutOfRangeError + BadUserDefinedPatternError + BadRcFileError + BadGenderError + +""" + +from __future__ import unicode_literals + +import ast +import sys +import re + + +class UnknownClassicalModeError(Exception): + pass + + +class BadNumValueError(Exception): + pass + + +class BadChunkingOptionError(Exception): + pass + + +class NumOutOfRangeError(Exception): + pass + + +class BadUserDefinedPatternError(Exception): + pass + + +class BadRcFileError(Exception): + pass + + +class BadGenderError(Exception): + pass + + +__version__ = "2.1.0" + + +STDOUT_ON = False + + +def print3(txt): + if STDOUT_ON: + print(txt) + + +def 
enclose(s): + return "(?:%s)" % s + + +def joinstem(cutpoint=0, words=""): + """ + join stem of each word in words into a string for regex + each word is truncated at cutpoint + cutpoint is usually negative indicating the number of letters to remove + from the end of each word + + e.g. + joinstem(-2, ["ephemeris", "iris", ".*itis"]) returns + (?:ephemer|ir|.*it) + + """ + return enclose("|".join(w[:cutpoint] for w in words)) + + +def bysize(words): + """ + take a list of words and return a dict of sets sorted by word length + e.g. + ret[3]=set(['ant', 'cat', 'dog', 'pig']) + ret[4]=set(['frog', 'goat']) + ret[5]=set(['horse']) + ret[8]=set(['elephant']) + """ + ret = {} + for w in words: + if len(w) not in ret: + ret[len(w)] = set() + ret[len(w)].add(w) + return ret + + +def make_pl_si_lists(lst, plending, siendingsize, dojoinstem=True): + """ + given a list of singular words: lst + an ending to append to make the plural: plending + the number of characters to remove from the singular + before appending plending: siendingsize + a flag whether to create a joinstem: dojoinstem + + return: + a list of pluralised words: si_list (called si because this is what you need to + look for to make the singular) + the pluralised words as a dict of sets sorted by word length: si_bysize + the singular words as a dict of sets sorted by word length: pl_bysize + if dojoinstem is True: a regular expression that matches any of the stems: stem + """ + if siendingsize is not None: + siendingsize = -siendingsize + si_list = [w[:siendingsize] + plending for w in lst] + pl_bysize = bysize(lst) + si_bysize = bysize(si_list) + if dojoinstem: + stem = joinstem(siendingsize, lst) + return si_list, si_bysize, pl_bysize, stem + else: + return si_list, si_bysize, pl_bysize + + +# 1. 
PLURALS + +pl_sb_irregular_s = { + "corpus": "corpuses|corpora", + "opus": "opuses|opera", + "genus": "genera", + "mythos": "mythoi", + "penis": "penises|penes", + "testis": "testes", + "atlas": "atlases|atlantes", + "yes": "yeses", +} + +pl_sb_irregular = { + "child": "children", + "brother": "brothers|brethren", + "loaf": "loaves", + "hoof": "hoofs|hooves", + "beef": "beefs|beeves", + "thief": "thiefs|thieves", + "money": "monies", + "mongoose": "mongooses", + "ox": "oxen", + "cow": "cows|kine", + "graffito": "graffiti", + "octopus": "octopuses|octopodes", + "genie": "genies|genii", + "ganglion": "ganglions|ganglia", + "trilby": "trilbys", + "turf": "turfs|turves", + "numen": "numina", + "atman": "atmas", + "occiput": "occiputs|occipita", + "sabretooth": "sabretooths", + "sabertooth": "sabertooths", + "lowlife": "lowlifes", + "flatfoot": "flatfoots", + "tenderfoot": "tenderfoots", + "romany": "romanies", + "jerry": "jerries", + "mary": "maries", + "talouse": "talouses", + "blouse": "blouses", + "rom": "roma", + "carmen": "carmina", +} + +pl_sb_irregular.update(pl_sb_irregular_s) +# pl_sb_irregular_keys = enclose('|'.join(pl_sb_irregular.keys())) + +pl_sb_irregular_caps = { + "Romany": "Romanies", + "Jerry": "Jerrys", + "Mary": "Marys", + "Rom": "Roma", +} + +pl_sb_irregular_compound = {"prima donna": "prima donnas|prime donne"} + +si_sb_irregular = {v: k for (k, v) in pl_sb_irregular.items()} +keys = list(si_sb_irregular.keys()) +for k in keys: + if "|" in k: + k1, k2 = k.split("|") + si_sb_irregular[k1] = si_sb_irregular[k2] = si_sb_irregular[k] + del si_sb_irregular[k] +si_sb_irregular_caps = {v: k for (k, v) in pl_sb_irregular_caps.items()} +si_sb_irregular_compound = {v: k for (k, v) in pl_sb_irregular_compound.items()} +keys = list(si_sb_irregular_compound.keys()) +for k in keys: + if "|" in k: + k1, k2 = k.split("|") + si_sb_irregular_compound[k1] = si_sb_irregular_compound[ + k2 + ] = si_sb_irregular_compound[k] + del si_sb_irregular_compound[k] + +# 
si_sb_irregular_keys = enclose('|'.join(si_sb_irregular.keys())) + +# Z's that don't double + +pl_sb_z_zes_list = ("quartz", "topaz") +pl_sb_z_zes_bysize = bysize(pl_sb_z_zes_list) + +pl_sb_ze_zes_list = ("snooze",) +pl_sb_ze_zes_bysize = bysize(pl_sb_ze_zes_list) + + +# CLASSICAL "..is" -> "..ides" + +pl_sb_C_is_ides_complete = [ + # GENERAL WORDS... + "ephemeris", + "iris", + "clitoris", + "chrysalis", + "epididymis", +] + +pl_sb_C_is_ides_endings = [ + # INFLAMATIONS... + "itis" +] + +pl_sb_C_is_ides = joinstem( + -2, pl_sb_C_is_ides_complete + [".*%s" % w for w in pl_sb_C_is_ides_endings] +) + +pl_sb_C_is_ides_list = pl_sb_C_is_ides_complete + pl_sb_C_is_ides_endings + +( + si_sb_C_is_ides_list, + si_sb_C_is_ides_bysize, + pl_sb_C_is_ides_bysize, +) = make_pl_si_lists(pl_sb_C_is_ides_list, "ides", 2, dojoinstem=False) + + +# CLASSICAL "..a" -> "..ata" + +pl_sb_C_a_ata_list = ( + "anathema", + "bema", + "carcinoma", + "charisma", + "diploma", + "dogma", + "drama", + "edema", + "enema", + "enigma", + "lemma", + "lymphoma", + "magma", + "melisma", + "miasma", + "oedema", + "sarcoma", + "schema", + "soma", + "stigma", + "stoma", + "trauma", + "gumma", + "pragma", +) + +( + si_sb_C_a_ata_list, + si_sb_C_a_ata_bysize, + pl_sb_C_a_ata_bysize, + pl_sb_C_a_ata, +) = make_pl_si_lists(pl_sb_C_a_ata_list, "ata", 1) + +# UNCONDITIONAL "..a" -> "..ae" + +pl_sb_U_a_ae_list = ("alumna", "alga", "vertebra", "persona") +( + si_sb_U_a_ae_list, + si_sb_U_a_ae_bysize, + pl_sb_U_a_ae_bysize, + pl_sb_U_a_ae, +) = make_pl_si_lists(pl_sb_U_a_ae_list, "e", None) + +# CLASSICAL "..a" -> "..ae" + +pl_sb_C_a_ae_list = ( + "amoeba", + "antenna", + "formula", + "hyperbola", + "medusa", + "nebula", + "parabola", + "abscissa", + "hydra", + "nova", + "lacuna", + "aurora", + "umbra", + "flora", + "fauna", +) +( + si_sb_C_a_ae_list, + si_sb_C_a_ae_bysize, + pl_sb_C_a_ae_bysize, + pl_sb_C_a_ae, +) = make_pl_si_lists(pl_sb_C_a_ae_list, "e", None) + + +# CLASSICAL "..en" -> "..ina" + 
+pl_sb_C_en_ina_list = ("stamen", "foramen", "lumen") + +( + si_sb_C_en_ina_list, + si_sb_C_en_ina_bysize, + pl_sb_C_en_ina_bysize, + pl_sb_C_en_ina, +) = make_pl_si_lists(pl_sb_C_en_ina_list, "ina", 2) + + +# UNCONDITIONAL "..um" -> "..a" + +pl_sb_U_um_a_list = ( + "bacterium", + "agendum", + "desideratum", + "erratum", + "stratum", + "datum", + "ovum", + "extremum", + "candelabrum", +) +( + si_sb_U_um_a_list, + si_sb_U_um_a_bysize, + pl_sb_U_um_a_bysize, + pl_sb_U_um_a, +) = make_pl_si_lists(pl_sb_U_um_a_list, "a", 2) + +# CLASSICAL "..um" -> "..a" + +pl_sb_C_um_a_list = ( + "maximum", + "minimum", + "momentum", + "optimum", + "quantum", + "cranium", + "curriculum", + "dictum", + "phylum", + "aquarium", + "compendium", + "emporium", + "enconium", + "gymnasium", + "honorarium", + "interregnum", + "lustrum", + "memorandum", + "millennium", + "rostrum", + "spectrum", + "speculum", + "stadium", + "trapezium", + "ultimatum", + "medium", + "vacuum", + "velum", + "consortium", + "arboretum", +) + +( + si_sb_C_um_a_list, + si_sb_C_um_a_bysize, + pl_sb_C_um_a_bysize, + pl_sb_C_um_a, +) = make_pl_si_lists(pl_sb_C_um_a_list, "a", 2) + + +# UNCONDITIONAL "..us" -> "i" + +pl_sb_U_us_i_list = ( + "alumnus", + "alveolus", + "bacillus", + "bronchus", + "locus", + "nucleus", + "stimulus", + "meniscus", + "sarcophagus", +) +( + si_sb_U_us_i_list, + si_sb_U_us_i_bysize, + pl_sb_U_us_i_bysize, + pl_sb_U_us_i, +) = make_pl_si_lists(pl_sb_U_us_i_list, "i", 2) + +# CLASSICAL "..us" -> "..i" + +pl_sb_C_us_i_list = ( + "focus", + "radius", + "genius", + "incubus", + "succubus", + "nimbus", + "fungus", + "nucleolus", + "stylus", + "torus", + "umbilicus", + "uterus", + "hippopotamus", + "cactus", +) + +( + si_sb_C_us_i_list, + si_sb_C_us_i_bysize, + pl_sb_C_us_i_bysize, + pl_sb_C_us_i, +) = make_pl_si_lists(pl_sb_C_us_i_list, "i", 2) + + +# CLASSICAL "..us" -> "..us" (ASSIMILATED 4TH DECLENSION LATIN NOUNS) + +pl_sb_C_us_us = ( + "status", + "apparatus", + "prospectus", + "sinus", + 
"hiatus", + "impetus", + "plexus", +) +pl_sb_C_us_us_bysize = bysize(pl_sb_C_us_us) + +# UNCONDITIONAL "..on" -> "a" + +pl_sb_U_on_a_list = ( + "criterion", + "perihelion", + "aphelion", + "phenomenon", + "prolegomenon", + "noumenon", + "organon", + "asyndeton", + "hyperbaton", +) +( + si_sb_U_on_a_list, + si_sb_U_on_a_bysize, + pl_sb_U_on_a_bysize, + pl_sb_U_on_a, +) = make_pl_si_lists(pl_sb_U_on_a_list, "a", 2) + +# CLASSICAL "..on" -> "..a" + +pl_sb_C_on_a_list = ("oxymoron",) + +( + si_sb_C_on_a_list, + si_sb_C_on_a_bysize, + pl_sb_C_on_a_bysize, + pl_sb_C_on_a, +) = make_pl_si_lists(pl_sb_C_on_a_list, "a", 2) + + +# CLASSICAL "..o" -> "..i" (BUT NORMALLY -> "..os") + +pl_sb_C_o_i = [ + "solo", + "soprano", + "basso", + "alto", + "contralto", + "tempo", + "piano", + "virtuoso", +] # list not tuple so can concat for pl_sb_U_o_os + +pl_sb_C_o_i_bysize = bysize(pl_sb_C_o_i) +si_sb_C_o_i_bysize = bysize(["%si" % w[:-1] for w in pl_sb_C_o_i]) + +pl_sb_C_o_i_stems = joinstem(-1, pl_sb_C_o_i) + +# ALWAYS "..o" -> "..os" + +pl_sb_U_o_os_complete = {"ado", "ISO", "NATO", "NCO", "NGO", "oto"} +si_sb_U_o_os_complete = {"%ss" % w for w in pl_sb_U_o_os_complete} + + +pl_sb_U_o_os_endings = [ + "aficionado", + "aggro", + "albino", + "allegro", + "ammo", + "Antananarivo", + "archipelago", + "armadillo", + "auto", + "avocado", + "Bamako", + "Barquisimeto", + "bimbo", + "bingo", + "Biro", + "bolero", + "Bolzano", + "bongo", + "Boto", + "burro", + "Cairo", + "canto", + "cappuccino", + "casino", + "cello", + "Chicago", + "Chimango", + "cilantro", + "cochito", + "coco", + "Colombo", + "Colorado", + "commando", + "concertino", + "contango", + "credo", + "crescendo", + "cyano", + "demo", + "ditto", + "Draco", + "dynamo", + "embryo", + "Esperanto", + "espresso", + "euro", + "falsetto", + "Faro", + "fiasco", + "Filipino", + "flamenco", + "furioso", + "generalissimo", + "Gestapo", + "ghetto", + "gigolo", + "gizmo", + "Greensboro", + "gringo", + "Guaiabero", + "guano", + "gumbo", + 
"gyro", + "hairdo", + "hippo", + "Idaho", + "impetigo", + "inferno", + "info", + "intermezzo", + "intertrigo", + "Iquico", + "jumbo", + "junto", + "Kakapo", + "kilo", + "Kinkimavo", + "Kokako", + "Kosovo", + "Lesotho", + "libero", + "libido", + "libretto", + "lido", + "Lilo", + "limbo", + "limo", + "lineno", + "lingo", + "lino", + "livedo", + "loco", + "logo", + "lumbago", + "macho", + "macro", + "mafioso", + "magneto", + "magnifico", + "Majuro", + "Malabo", + "manifesto", + "Maputo", + "Maracaibo", + "medico", + "memo", + "metro", + "Mexico", + "micro", + "Milano", + "Monaco", + "mono", + "Montenegro", + "Morocco", + "Muqdisho", + "myo", + "neutrino", + "Ningbo", + "octavo", + "oregano", + "Orinoco", + "Orlando", + "Oslo", + "panto", + "Paramaribo", + "Pardusco", + "pedalo", + "photo", + "pimento", + "pinto", + "pleco", + "Pluto", + "pogo", + "polo", + "poncho", + "Porto-Novo", + "Porto", + "pro", + "psycho", + "pueblo", + "quarto", + "Quito", + "rhino", + "risotto", + "rococo", + "rondo", + "Sacramento", + "saddo", + "sago", + "salvo", + "Santiago", + "Sapporo", + "Sarajevo", + "scherzando", + "scherzo", + "silo", + "sirocco", + "sombrero", + "staccato", + "sterno", + "stucco", + "stylo", + "sumo", + "Taiko", + "techno", + "terrazzo", + "testudo", + "timpano", + "tiro", + "tobacco", + "Togo", + "Tokyo", + "torero", + "Torino", + "Toronto", + "torso", + "tremolo", + "typo", + "tyro", + "ufo", + "UNESCO", + "vaquero", + "vermicello", + "verso", + "vibrato", + "violoncello", + "Virgo", + "weirdo", + "WHO", + "WTO", + "Yamoussoukro", + "yo-yo", + "zero", + "Zibo", +] + pl_sb_C_o_i + +pl_sb_U_o_os_bysize = bysize(pl_sb_U_o_os_endings) +si_sb_U_o_os_bysize = bysize(["%ss" % w for w in pl_sb_U_o_os_endings]) + + +# UNCONDITIONAL "..ch" -> "..chs" + +pl_sb_U_ch_chs_list = ("czech", "eunuch", "stomach") + +( + si_sb_U_ch_chs_list, + si_sb_U_ch_chs_bysize, + pl_sb_U_ch_chs_bysize, + pl_sb_U_ch_chs, +) = make_pl_si_lists(pl_sb_U_ch_chs_list, "s", None) + + +# UNCONDITIONAL 
"..[ei]x" -> "..ices" + +pl_sb_U_ex_ices_list = ("codex", "murex", "silex") +( + si_sb_U_ex_ices_list, + si_sb_U_ex_ices_bysize, + pl_sb_U_ex_ices_bysize, + pl_sb_U_ex_ices, +) = make_pl_si_lists(pl_sb_U_ex_ices_list, "ices", 2) + +pl_sb_U_ix_ices_list = ("radix", "helix") +( + si_sb_U_ix_ices_list, + si_sb_U_ix_ices_bysize, + pl_sb_U_ix_ices_bysize, + pl_sb_U_ix_ices, +) = make_pl_si_lists(pl_sb_U_ix_ices_list, "ices", 2) + +# CLASSICAL "..[ei]x" -> "..ices" + +pl_sb_C_ex_ices_list = ( + "vortex", + "vertex", + "cortex", + "latex", + "pontifex", + "apex", + "index", + "simplex", +) + +( + si_sb_C_ex_ices_list, + si_sb_C_ex_ices_bysize, + pl_sb_C_ex_ices_bysize, + pl_sb_C_ex_ices, +) = make_pl_si_lists(pl_sb_C_ex_ices_list, "ices", 2) + + +pl_sb_C_ix_ices_list = ("appendix",) + +( + si_sb_C_ix_ices_list, + si_sb_C_ix_ices_bysize, + pl_sb_C_ix_ices_bysize, + pl_sb_C_ix_ices, +) = make_pl_si_lists(pl_sb_C_ix_ices_list, "ices", 2) + + +# ARABIC: ".." -> "..i" + +pl_sb_C_i_list = ("afrit", "afreet", "efreet") + +(si_sb_C_i_list, si_sb_C_i_bysize, pl_sb_C_i_bysize, pl_sb_C_i) = make_pl_si_lists( + pl_sb_C_i_list, "i", None +) + + +# HEBREW: ".." 
-> "..im" + +pl_sb_C_im_list = ("goy", "seraph", "cherub") + +(si_sb_C_im_list, si_sb_C_im_bysize, pl_sb_C_im_bysize, pl_sb_C_im) = make_pl_si_lists( + pl_sb_C_im_list, "im", None +) + + +# UNCONDITIONAL "..man" -> "..mans" + +pl_sb_U_man_mans_list = """ + ataman caiman cayman ceriman + desman dolman farman harman hetman + human leman ottoman shaman talisman +""".split() +pl_sb_U_man_mans_caps_list = """ + Alabaman Bahaman Burman German + Hiroshiman Liman Nakayaman Norman Oklahoman + Panaman Roman Selman Sonaman Tacoman Yakiman + Yokohaman Yuman +""".split() + +( + si_sb_U_man_mans_list, + si_sb_U_man_mans_bysize, + pl_sb_U_man_mans_bysize, +) = make_pl_si_lists(pl_sb_U_man_mans_list, "s", None, dojoinstem=False) +( + si_sb_U_man_mans_caps_list, + si_sb_U_man_mans_caps_bysize, + pl_sb_U_man_mans_caps_bysize, +) = make_pl_si_lists(pl_sb_U_man_mans_caps_list, "s", None, dojoinstem=False) + + +pl_sb_uninflected_s_complete = [ + # PAIRS OR GROUPS SUBSUMED TO A SINGULAR... + "breeches", + "britches", + "pajamas", + "pyjamas", + "clippers", + "gallows", + "hijinks", + "headquarters", + "pliers", + "scissors", + "testes", + "herpes", + "pincers", + "shears", + "proceedings", + "trousers", + # UNASSIMILATED LATIN 4th DECLENSION + "cantus", + "coitus", + "nexus", + # RECENT IMPORTS... + "contretemps", + "corps", + "debris", + "siemens", + # DISEASES + "mumps", + # MISCELLANEOUS OTHERS... + "diabetes", + "jackanapes", + "series", + "species", + "subspecies", + "rabies", + "chassis", + "innings", + "news", + "mews", + "haggis", +] + +pl_sb_uninflected_s_endings = [ + # RECENT IMPORTS... 
+ "ois", + # DISEASES + "measles", +] + +pl_sb_uninflected_s = pl_sb_uninflected_s_complete + [ + ".*%s" % w for w in pl_sb_uninflected_s_endings +] + +pl_sb_uninflected_herd = ( + # DON'T INFLECT IN CLASSICAL MODE, OTHERWISE NORMAL INFLECTION + "wildebeest", + "swine", + "eland", + "bison", + "buffalo", + "elk", + "rhinoceros", + "zucchini", + "caribou", + "dace", + "grouse", + "guinea fowl", + "guinea-fowl", + "haddock", + "hake", + "halibut", + "herring", + "mackerel", + "pickerel", + "pike", + "roe", + "seed", + "shad", + "snipe", + "teal", + "turbot", + "water fowl", + "water-fowl", +) + +pl_sb_uninflected_complete = [ + # SOME FISH AND HERD ANIMALS + "tuna", + "salmon", + "mackerel", + "trout", + "bream", + "sea-bass", + "sea bass", + "carp", + "cod", + "flounder", + "whiting", + "moose", + # OTHER ODDITIES + "graffiti", + "djinn", + "samuri", + "offspring", + "pence", + "quid", + "hertz", +] + pl_sb_uninflected_s_complete +# SOME WORDS ENDING IN ...s (OFTEN PAIRS TAKEN AS A WHOLE) + +pl_sb_uninflected_caps = [ + # ALL NATIONALS ENDING IN -ese + "Portuguese", + "Amoyese", + "Borghese", + "Congoese", + "Faroese", + "Foochowese", + "Genevese", + "Genoese", + "Gilbertese", + "Hottentotese", + "Kiplingese", + "Kongoese", + "Lucchese", + "Maltese", + "Nankingese", + "Niasese", + "Pekingese", + "Piedmontese", + "Pistoiese", + "Sarawakese", + "Shavese", + "Vermontese", + "Wenchowese", + "Yengeese", +] + + +pl_sb_uninflected_endings = [ + # SOME FISH AND HERD ANIMALS + "fish", + "deer", + "sheep", + # ALL NATIONALS ENDING IN -ese + "nese", + "rese", + "lese", + "mese", + # DISEASES + "pox", + # OTHER ODDITIES + "craft", +] + pl_sb_uninflected_s_endings +# SOME WORDS ENDING IN ...s (OFTEN PAIRS TAKEN AS A WHOLE) + + +pl_sb_uninflected_bysize = bysize(pl_sb_uninflected_endings) + + +# SINGULAR WORDS ENDING IN ...s (ALL INFLECT WITH ...es) + +pl_sb_singular_s_complete = [ + "acropolis", + "aegis", + "alias", + "asbestos", + "bathos", + "bias", + "bronchitis", + 
"bursitis", + "caddis", + "cannabis", + "canvas", + "chaos", + "cosmos", + "dais", + "digitalis", + "epidermis", + "ethos", + "eyas", + "gas", + "glottis", + "hubris", + "ibis", + "lens", + "mantis", + "marquis", + "metropolis", + "pathos", + "pelvis", + "polis", + "rhinoceros", + "sassafras", + "trellis", +] + pl_sb_C_is_ides_complete + + +pl_sb_singular_s_endings = ["ss", "us"] + pl_sb_C_is_ides_endings + +pl_sb_singular_s_bysize = bysize(pl_sb_singular_s_endings) + +si_sb_singular_s_complete = ["%ses" % w for w in pl_sb_singular_s_complete] +si_sb_singular_s_endings = ["%ses" % w for w in pl_sb_singular_s_endings] +si_sb_singular_s_bysize = bysize(si_sb_singular_s_endings) + +pl_sb_singular_s_es = ["[A-Z].*es"] + +pl_sb_singular_s = enclose( + "|".join( + pl_sb_singular_s_complete + + [".*%s" % w for w in pl_sb_singular_s_endings] + + pl_sb_singular_s_es + ) +) + + +# PLURALS ENDING IN uses -> use + + +si_sb_ois_oi_case = ("Bolshois", "Hanois") + +si_sb_uses_use_case = ("Betelgeuses", "Duses", "Meuses", "Syracuses", "Toulouses") + +si_sb_uses_use = ( + "abuses", + "applauses", + "blouses", + "carouses", + "causes", + "chartreuses", + "clauses", + "contuses", + "douses", + "excuses", + "fuses", + "grouses", + "hypotenuses", + "masseuses", + "menopauses", + "misuses", + "muses", + "overuses", + "pauses", + "peruses", + "profuses", + "recluses", + "reuses", + "ruses", + "souses", + "spouses", + "suffuses", + "transfuses", + "uses", +) + +si_sb_ies_ie_case = ( + "Addies", + "Aggies", + "Allies", + "Amies", + "Angies", + "Annies", + "Annmaries", + "Archies", + "Arties", + "Aussies", + "Barbies", + "Barries", + "Basies", + "Bennies", + "Bernies", + "Berties", + "Bessies", + "Betties", + "Billies", + "Blondies", + "Bobbies", + "Bonnies", + "Bowies", + "Brandies", + "Bries", + "Brownies", + "Callies", + "Carnegies", + "Carries", + "Cassies", + "Charlies", + "Cheries", + "Christies", + "Connies", + "Curies", + "Dannies", + "Debbies", + "Dixies", + "Dollies", + "Donnies", 
+ "Drambuies", + "Eddies", + "Effies", + "Ellies", + "Elsies", + "Eries", + "Ernies", + "Essies", + "Eugenies", + "Fannies", + "Flossies", + "Frankies", + "Freddies", + "Gillespies", + "Goldies", + "Gracies", + "Guthries", + "Hallies", + "Hatties", + "Hetties", + "Hollies", + "Jackies", + "Jamies", + "Janies", + "Jannies", + "Jeanies", + "Jeannies", + "Jennies", + "Jessies", + "Jimmies", + "Jodies", + "Johnies", + "Johnnies", + "Josies", + "Julies", + "Kalgoorlies", + "Kathies", + "Katies", + "Kellies", + "Kewpies", + "Kristies", + "Laramies", + "Lassies", + "Lauries", + "Leslies", + "Lessies", + "Lillies", + "Lizzies", + "Lonnies", + "Lories", + "Lorries", + "Lotties", + "Louies", + "Mackenzies", + "Maggies", + "Maisies", + "Mamies", + "Marcies", + "Margies", + "Maries", + "Marjories", + "Matties", + "McKenzies", + "Melanies", + "Mickies", + "Millies", + "Minnies", + "Mollies", + "Mounties", + "Nannies", + "Natalies", + "Nellies", + "Netties", + "Ollies", + "Ozzies", + "Pearlies", + "Pottawatomies", + "Reggies", + "Richies", + "Rickies", + "Robbies", + "Ronnies", + "Rosalies", + "Rosemaries", + "Rosies", + "Roxies", + "Rushdies", + "Ruthies", + "Sadies", + "Sallies", + "Sammies", + "Scotties", + "Selassies", + "Sherries", + "Sophies", + "Stacies", + "Stefanies", + "Stephanies", + "Stevies", + "Susies", + "Sylvies", + "Tammies", + "Terries", + "Tessies", + "Tommies", + "Tracies", + "Trekkies", + "Valaries", + "Valeries", + "Valkyries", + "Vickies", + "Virgies", + "Willies", + "Winnies", + "Wylies", + "Yorkies", +) + +si_sb_ies_ie = ( + "aeries", + "baggies", + "belies", + "biggies", + "birdies", + "bogies", + "bonnies", + "boogies", + "bookies", + "bourgeoisies", + "brownies", + "budgies", + "caddies", + "calories", + "camaraderies", + "cockamamies", + "collies", + "cookies", + "coolies", + "cooties", + "coteries", + "crappies", + "curies", + "cutesies", + "dogies", + "eyrie", + "floozies", + "footsies", + "freebies", + "genies", + "goalies", + "groupies", + 
"hies", + "jalousies", + "junkies", + "kiddies", + "laddies", + "lassies", + "lies", + "lingeries", + "magpies", + "menageries", + "mommies", + "movies", + "neckties", + "newbies", + "nighties", + "oldies", + "organdies", + "overlies", + "pies", + "pinkies", + "pixies", + "potpies", + "prairies", + "quickies", + "reveries", + "rookies", + "rotisseries", + "softies", + "sorties", + "species", + "stymies", + "sweeties", + "ties", + "underlies", + "unties", + "veggies", + "vies", + "yuppies", + "zombies", +) + + +si_sb_oes_oe_case = ( + "Chloes", + "Crusoes", + "Defoes", + "Faeroes", + "Ivanhoes", + "Joes", + "McEnroes", + "Moes", + "Monroes", + "Noes", + "Poes", + "Roscoes", + "Tahoes", + "Tippecanoes", + "Zoes", +) + +si_sb_oes_oe = ( + "aloes", + "backhoes", + "canoes", + "does", + "floes", + "foes", + "hoes", + "mistletoes", + "oboes", + "pekoes", + "roes", + "sloes", + "throes", + "tiptoes", + "toes", + "woes", +) + +si_sb_z_zes = ("quartzes", "topazes") + +si_sb_zzes_zz = ("buzzes", "fizzes", "frizzes", "razzes") + +si_sb_ches_che_case = ( + "Andromaches", + "Apaches", + "Blanches", + "Comanches", + "Nietzsches", + "Porsches", + "Roches", +) + +si_sb_ches_che = ( + "aches", + "avalanches", + "backaches", + "bellyaches", + "caches", + "cloches", + "creches", + "douches", + "earaches", + "fiches", + "headaches", + "heartaches", + "microfiches", + "niches", + "pastiches", + "psyches", + "quiches", + "stomachaches", + "toothaches", +) + +si_sb_xes_xe = ("annexes", "axes", "deluxes", "pickaxes") + +si_sb_sses_sse_case = ("Hesses", "Jesses", "Larousses", "Matisses") +si_sb_sses_sse = ( + "bouillabaisses", + "crevasses", + "demitasses", + "impasses", + "mousses", + "posses", +) + +si_sb_ves_ve_case = ( + # *[nwl]ives -> [nwl]live + "Clives", + "Palmolives", +) +si_sb_ves_ve = ( + # *[^d]eaves -> eave + "interweaves", + "weaves", + # *[nwl]ives -> [nwl]live + "olives", + # *[eoa]lves -> [eoa]lve + "bivalves", + "dissolves", + "resolves", + "salves", + "twelves", + 
"valves", +) + + +plverb_special_s = enclose( + "|".join( + [pl_sb_singular_s] + + pl_sb_uninflected_s + + list(pl_sb_irregular_s.keys()) + + ["(.*[csx])is", "(.*)ceps", "[A-Z].*s"] + ) +) + +pl_sb_postfix_adj = { + "general": [r"(?!major|lieutenant|brigadier|adjutant|.*star)\S+"], + "martial": ["court"], + "force": ["pound"], +} + +for k in list(pl_sb_postfix_adj.keys()): + pl_sb_postfix_adj[k] = enclose( + enclose("|".join(pl_sb_postfix_adj[k])) + "(?=(?:-|\\s+)%s)" % k + ) + +pl_sb_postfix_adj_stems = "(" + "|".join(list(pl_sb_postfix_adj.values())) + ")(.*)" + + +# PLURAL WORDS ENDING IS es GO TO SINGULAR is + +si_sb_es_is = ( + "amanuenses", + "amniocenteses", + "analyses", + "antitheses", + "apotheoses", + "arterioscleroses", + "atheroscleroses", + "axes", + # 'bases', # bases -> basis + "catalyses", + "catharses", + "chasses", + "cirrhoses", + "cocces", + "crises", + "diagnoses", + "dialyses", + "diereses", + "electrolyses", + "emphases", + "exegeses", + "geneses", + "halitoses", + "hydrolyses", + "hypnoses", + "hypotheses", + "hystereses", + "metamorphoses", + "metastases", + "misdiagnoses", + "mitoses", + "mononucleoses", + "narcoses", + "necroses", + "nemeses", + "neuroses", + "oases", + "osmoses", + "osteoporoses", + "paralyses", + "parentheses", + "parthenogeneses", + "periphrases", + "photosyntheses", + "probosces", + "prognoses", + "prophylaxes", + "prostheses", + "preces", + "psoriases", + "psychoanalyses", + "psychokineses", + "psychoses", + "scleroses", + "scolioses", + "sepses", + "silicoses", + "symbioses", + "synopses", + "syntheses", + "taxes", + "telekineses", + "theses", + "thromboses", + "tuberculoses", + "urinalyses", +) + +pl_prep_list = """ + about above across after among around at athwart before behind + below beneath beside besides between betwixt beyond but by + during except for from in into near of off on onto out over + since till to under until unto upon with""".split() + +pl_prep_list_da = pl_prep_list + ["de", "du", "da"] + 
+pl_prep_bysize = bysize(pl_prep_list_da) + +pl_prep = enclose("|".join(pl_prep_list_da)) + +pl_sb_prep_dual_compound = ( + r"(.*?)((?:-|\s+)(?:" + pl_prep + r")(?:-|\s+))a(?:-|\s+)(.*)" +) + + +singular_pronoun_genders = { + "neuter", + "feminine", + "masculine", + "gender-neutral", + "feminine or masculine", + "masculine or feminine", +} + +pl_pron_nom = { + # NOMINATIVE REFLEXIVE + "i": "we", + "myself": "ourselves", + "you": "you", + "yourself": "yourselves", + "she": "they", + "herself": "themselves", + "he": "they", + "himself": "themselves", + "it": "they", + "itself": "themselves", + "they": "they", + "themself": "themselves", + # POSSESSIVE + "mine": "ours", + "yours": "yours", + "hers": "theirs", + "his": "theirs", + "its": "theirs", + "theirs": "theirs", +} + +si_pron = {} +si_pron["nom"] = {v: k for (k, v) in pl_pron_nom.items()} +si_pron["nom"]["we"] = "I" + + +pl_pron_acc = { + # ACCUSATIVE REFLEXIVE + "me": "us", + "myself": "ourselves", + "you": "you", + "yourself": "yourselves", + "her": "them", + "herself": "themselves", + "him": "them", + "himself": "themselves", + "it": "them", + "itself": "themselves", + "them": "them", + "themself": "themselves", +} + +pl_pron_acc_keys = enclose("|".join(list(pl_pron_acc.keys()))) +pl_pron_acc_keys_bysize = bysize(list(pl_pron_acc.keys())) + +si_pron["acc"] = {v: k for (k, v) in pl_pron_acc.items()} + +for thecase, plur, gend, sing in ( + ("nom", "they", "neuter", "it"), + ("nom", "they", "feminine", "she"), + ("nom", "they", "masculine", "he"), + ("nom", "they", "gender-neutral", "they"), + ("nom", "they", "feminine or masculine", "she or he"), + ("nom", "they", "masculine or feminine", "he or she"), + ("nom", "themselves", "neuter", "itself"), + ("nom", "themselves", "feminine", "herself"), + ("nom", "themselves", "masculine", "himself"), + ("nom", "themselves", "gender-neutral", "themself"), + ("nom", "themselves", "feminine or masculine", "herself or himself"), + ("nom", "themselves", "masculine or 
feminine", "himself or herself"), + ("nom", "theirs", "neuter", "its"), + ("nom", "theirs", "feminine", "hers"), + ("nom", "theirs", "masculine", "his"), + ("nom", "theirs", "gender-neutral", "theirs"), + ("nom", "theirs", "feminine or masculine", "hers or his"), + ("nom", "theirs", "masculine or feminine", "his or hers"), + ("acc", "them", "neuter", "it"), + ("acc", "them", "feminine", "her"), + ("acc", "them", "masculine", "him"), + ("acc", "them", "gender-neutral", "them"), + ("acc", "them", "feminine or masculine", "her or him"), + ("acc", "them", "masculine or feminine", "him or her"), + ("acc", "themselves", "neuter", "itself"), + ("acc", "themselves", "feminine", "herself"), + ("acc", "themselves", "masculine", "himself"), + ("acc", "themselves", "gender-neutral", "themself"), + ("acc", "themselves", "feminine or masculine", "herself or himself"), + ("acc", "themselves", "masculine or feminine", "himself or herself"), +): + try: + si_pron[thecase][plur][gend] = sing + except TypeError: + si_pron[thecase][plur] = {} + si_pron[thecase][plur][gend] = sing + + +si_pron_acc_keys = enclose("|".join(list(si_pron["acc"].keys()))) +si_pron_acc_keys_bysize = bysize(list(si_pron["acc"].keys())) + + +def get_si_pron(thecase, word, gender): + try: + sing = si_pron[thecase][word] + except KeyError: + raise # not a pronoun + try: + return sing[gender] # has several types due to gender + except TypeError: + return sing # answer independent of gender + + +plverb_irregular_pres = { + # 1st PERS. SING. 2ND PERS. SING. 3RD PERS. SINGULAR + # 3RD PERS. (INDET.) + "am": "are", + "are": "are", + "is": "are", + "was": "were", + "were": "were", + "was": "were", + "have": "have", + "have": "have", + "has": "have", + "do": "do", + "do": "do", + "does": "do", +} + +plverb_ambiguous_pres = { + # 1st PERS. SING. 2ND PERS. SING. 3RD PERS. SINGULAR + # 3RD PERS. (INDET.) 
+ "act": "act", + "act": "act", + "acts": "act", + "blame": "blame", + "blame": "blame", + "blames": "blame", + "can": "can", + "can": "can", + "can": "can", + "must": "must", + "must": "must", + "must": "must", + "fly": "fly", + "fly": "fly", + "flies": "fly", + "copy": "copy", + "copy": "copy", + "copies": "copy", + "drink": "drink", + "drink": "drink", + "drinks": "drink", + "fight": "fight", + "fight": "fight", + "fights": "fight", + "fire": "fire", + "fire": "fire", + "fires": "fire", + "like": "like", + "like": "like", + "likes": "like", + "look": "look", + "look": "look", + "looks": "look", + "make": "make", + "make": "make", + "makes": "make", + "reach": "reach", + "reach": "reach", + "reaches": "reach", + "run": "run", + "run": "run", + "runs": "run", + "sink": "sink", + "sink": "sink", + "sinks": "sink", + "sleep": "sleep", + "sleep": "sleep", + "sleeps": "sleep", + "view": "view", + "view": "view", + "views": "view", +} + +plverb_ambiguous_pres_keys = enclose("|".join(list(plverb_ambiguous_pres.keys()))) + + +plverb_irregular_non_pres = ( + "did", + "had", + "ate", + "made", + "put", + "spent", + "fought", + "sank", + "gave", + "sought", + "shall", + "could", + "ought", + "should", +) + +plverb_ambiguous_non_pres = enclose( + "|".join(("thought", "saw", "bent", "will", "might", "cut")) +) + +# "..oes" -> "..oe" (the rest are "..oes" -> "o") + +pl_v_oes_oe = ("canoes", "floes", "oboes", "roes", "throes", "woes") +pl_v_oes_oe_endings_size4 = ("hoes", "toes") +pl_v_oes_oe_endings_size5 = ("shoes",) + + +pl_count_zero = ("0", "no", "zero", "nil") + + +pl_count_one = ("1", "a", "an", "one", "each", "every", "this", "that") + +pl_adj_special = {"a": "some", "an": "some", "this": "these", "that": "those"} + +pl_adj_special_keys = enclose("|".join(list(pl_adj_special.keys()))) + +pl_adj_poss = { + "my": "our", + "your": "your", + "its": "their", + "her": "their", + "his": "their", + "their": "their", +} + +pl_adj_poss_keys = 
enclose("|".join(list(pl_adj_poss.keys()))) + + +# 2. INDEFINITE ARTICLES + +# THIS PATTERN MATCHES STRINGS OF CAPITALS STARTING WITH A "VOWEL-SOUND" +# CONSONANT FOLLOWED BY ANOTHER CONSONANT, AND WHICH ARE NOT LIKELY +# TO BE REAL WORDS (OH, ALL RIGHT THEN, IT'S JUST MAGIC!) + +A_abbrev = r""" +(?! FJO | [HLMNS]Y. | RY[EO] | SQU + | ( F[LR]? | [HL] | MN? | N | RH? | S[CHKLMNPTVW]? | X(YL)?) [AEIOU]) +[FHLMNRSX][A-Z] +""" + +# THIS PATTERN CODES THE BEGINNINGS OF ALL ENGLISH WORDS BEGINING WITH A +# 'y' FOLLOWED BY A CONSONANT. ANY OTHER Y-CONSONANT PREFIX THEREFORE +# IMPLIES AN ABBREVIATION. + +A_y_cons = "y(b[lor]|cl[ea]|fere|gg|p[ios]|rou|tt)" + +# EXCEPTIONS TO EXCEPTIONS + +A_explicit_a = enclose("|".join(("unabomber", "unanimous", "US"))) + +A_explicit_an = enclose( + "|".join(("euler", "hour(?!i)", "heir", "honest", "hono[ur]", "mpeg")) +) + +A_ordinal_an = enclose("|".join(("[aefhilmnorsx]-?th",))) + +A_ordinal_a = enclose("|".join(("[bcdgjkpqtuvwyz]-?th",))) + + +# NUMERICAL INFLECTIONS + +nth = { + 0: "th", + 1: "st", + 2: "nd", + 3: "rd", + 4: "th", + 5: "th", + 6: "th", + 7: "th", + 8: "th", + 9: "th", + 11: "th", + 12: "th", + 13: "th", +} + +ordinal = dict( + ty="tieth", + one="first", + two="second", + three="third", + five="fifth", + eight="eighth", + nine="ninth", + twelve="twelfth", +) + +ordinal_suff = "|".join(list(ordinal.keys())) + + +# NUMBERS + +unit = ["", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"] +teen = [ + "ten", + "eleven", + "twelve", + "thirteen", + "fourteen", + "fifteen", + "sixteen", + "seventeen", + "eighteen", + "nineteen", +] +ten = [ + "", + "", + "twenty", + "thirty", + "forty", + "fifty", + "sixty", + "seventy", + "eighty", + "ninety", +] +mill = [ + " ", + " thousand", + " million", + " billion", + " trillion", + " quadrillion", + " quintillion", + " sextillion", + " septillion", + " octillion", + " nonillion", + " decillion", +] + + +# SUPPORT CLASSICAL PLURALIZATIONS + +def_classical = dict( 
    all=False, zero=False, herd=False, names=True, persons=False, ancient=False
)

# Fully-on / fully-off variants of the classical-mode flag set above.
all_classical = {k: True for k in list(def_classical.keys())}
no_classical = {k: False for k in list(def_classical.keys())}


# Maps strings to built-in constant types
string_to_constant = {"True": True, "False": False, "None": None}


class engine:
    def __init__(self):

        # per-instance inflection state; user-defined pattern lists are kept
        # as flat [pattern, replacement, ...] sequences (see ud_match)
        self.classical_dict = def_classical.copy()
        self.persistent_count = None
        self.mill_count = 0
        self.pl_sb_user_defined = []
        self.pl_v_user_defined = []
        self.pl_adj_user_defined = []
        self.si_sb_user_defined = []
        self.A_a_user_defined = []
        self.thegender = "neuter"

    # old method name -> current method name, used by __getattr__ to point
    # callers of the pre-2.x API at the renamed methods
    deprecated_methods = dict(
        pl="plural",
        plnoun="plural_noun",
        plverb="plural_verb",
        pladj="plural_adj",
        sinoun="single_noun",
        prespart="present_participle",
        numwords="number_to_words",
        plequal="compare",
        plnounequal="compare_nouns",
        plverbequal="compare_verbs",
        pladjequal="compare_adjs",
        wordlist="join",
    )

    def __getattr__(self, meth):
        # deliberately raises rather than forwarding: deprecated names are
        # hard errors, with a hint printed when STDOUT_ON is set
        if meth in self.deprecated_methods:
            print3(
                "{}() deprecated, use {}()".format(meth, self.deprecated_methods[meth])
            )
            raise DeprecationWarning
        raise AttributeError

    def defnoun(self, singular, plural):
        """
        Set the noun plural of singular to plural.

        """
        self.checkpat(singular)
        self.checkpatplural(plural)
        # stored pairwise; singular_noun consults the reversed mapping
        self.pl_sb_user_defined.extend((singular, plural))
        self.si_sb_user_defined.extend((plural, singular))
        return 1

    def defverb(self, s1, p1, s2, p2, s3, p3):
        """
        Set the verb plurals for s1, s2 and s3 to p1, p2 and p3 respectively.

        Where 1, 2 and 3 represent the 1st, 2nd and 3rd person forms of the verb.

        """
        self.checkpat(s1)
        self.checkpat(s2)
        self.checkpat(s3)
        self.checkpatplural(p1)
        self.checkpatplural(p2)
        self.checkpatplural(p3)
        self.pl_v_user_defined.extend((s1, p1, s2, p2, s3, p3))
        return 1

    def defadj(self, singular, plural):
        """
        Set the adjective plural of singular to plural.
+ + """ + self.checkpat(singular) + self.checkpatplural(plural) + self.pl_adj_user_defined.extend((singular, plural)) + return 1 + + def defa(self, pattern): + """ + Define the indefinate article as 'a' for words matching pattern. + + """ + self.checkpat(pattern) + self.A_a_user_defined.extend((pattern, "a")) + return 1 + + def defan(self, pattern): + """ + Define the indefinate article as 'an' for words matching pattern. + + """ + self.checkpat(pattern) + self.A_a_user_defined.extend((pattern, "an")) + return 1 + + def checkpat(self, pattern): + """ + check for errors in a regex pattern + """ + if pattern is None: + return + try: + re.match(pattern, "") + except re.error: + print3("\nBad user-defined singular pattern:\n\t%s\n" % pattern) + raise BadUserDefinedPatternError + + def checkpatplural(self, pattern): + """ + check for errors in a regex replace pattern + """ + return + + def ud_match(self, word, wordlist): + for i in range(len(wordlist) - 2, -2, -2): # backwards through even elements + mo = re.search(r"^%s$" % wordlist[i], word, re.IGNORECASE) + if mo: + if wordlist[i + 1] is None: + return None + pl = re.sub( + r"\$(\d+)", r"\\1", wordlist[i + 1] + ) # change $n to \n for expand + return mo.expand(pl) + return None + + def classical(self, **kwargs): + """ + turn classical mode on and off for various categories + + turn on all classical modes: + classical() + classical(all=True) + + turn on or off specific claassical modes: + e.g. + classical(herd=True) + classical(names=False) + + By default all classical modes are off except names. 
+ + unknown value in args or key in kwargs rasies + exception: UnknownClasicalModeError + + """ + classical_mode = list(def_classical.keys()) + if not kwargs: + self.classical_dict = all_classical.copy() + return + if "all" in kwargs: + if kwargs["all"]: + self.classical_dict = all_classical.copy() + else: + self.classical_dict = no_classical.copy() + + for k, v in list(kwargs.items()): + if k in classical_mode: + self.classical_dict[k] = v + else: + raise UnknownClassicalModeError + + def num(self, count=None, show=None): # (;$count,$show) + """ + Set the number to be used in other method calls. + + Returns count. + + Set show to False to return '' instead. + + """ + if count is not None: + try: + self.persistent_count = int(count) + except ValueError: + raise BadNumValueError + if (show is None) or show: + return str(count) + else: + self.persistent_count = None + return "" + + def gender(self, gender): + """ + set the gender for the singular of plural pronouns + + can be one of: + 'neuter' ('they' -> 'it') + 'feminine' ('they' -> 'she') + 'masculine' ('they' -> 'he') + 'gender-neutral' ('they' -> 'they') + 'feminine or masculine' ('they' -> 'she or he') + 'masculine or feminine' ('they' -> 'he or she') + """ + if gender in singular_pronoun_genders: + self.thegender = gender + else: + raise BadGenderError + + def _get_value_from_ast(self, obj): + """ + Return the value of the ast object. + """ + if isinstance(obj, ast.Num): + return obj.n + elif isinstance(obj, ast.Str): + return obj.s + elif isinstance(obj, ast.List): + return [self._get_value_from_ast(e) for e in obj.elts] + elif isinstance(obj, ast.Tuple): + return tuple([self._get_value_from_ast(e) for e in obj.elts]) + + # None, True and False are NameConstants in Py3.4 and above. 
+ elif sys.version_info.major >= 3 and isinstance(obj, ast.NameConstant): + return obj.value + + # For python versions below 3.4 + elif isinstance(obj, ast.Name) and (obj.id in ["True", "False", "None"]): + return string_to_constant[obj.id] + + # Probably passed a variable name. + # Or passed a single word without wrapping it in quotes as an argument + # ex: p.inflect("I plural(see)") instead of p.inflect("I plural('see')") + raise NameError("name '%s' is not defined" % obj.id) + + def _string_to_substitute(self, mo, methods_dict): + """ + Return the string to be substituted for the match. + """ + matched_text, f_name = mo.groups() + # matched_text is the complete match string. e.g. plural_noun(cat) + # f_name is the function name. e.g. plural_noun + + # Return matched_text if function name is not in methods_dict + if f_name not in methods_dict: + return matched_text + + # Parse the matched text + a_tree = ast.parse(matched_text) + + # get the args and kwargs from ast objects + args_list = [self._get_value_from_ast(a) for a in a_tree.body[0].value.args] + kwargs_list = { + kw.arg: self._get_value_from_ast(kw.value) + for kw in a_tree.body[0].value.keywords + } + + # Call the corresponding function + return methods_dict[f_name](*args_list, **kwargs_list) + + # 0. PERFORM GENERAL INFLECTIONS IN A STRING + + def inflect(self, text): + """ + Perform inflections in a string. + + e.g. 
inflect('The plural of cat is plural(cat)') returns + 'The plural of cat is cats' + + can use plural, plural_noun, plural_verb, plural_adj, + singular_noun, a, an, no, ordinal, number_to_words, + and prespart + + """ + save_persistent_count = self.persistent_count + + # Dictionary of allowed methods + methods_dict = { + "plural": self.plural, + "plural_adj": self.plural_adj, + "plural_noun": self.plural_noun, + "plural_verb": self.plural_verb, + "singular_noun": self.singular_noun, + "a": self.a, + "an": self.a, + "no": self.no, + "ordinal": self.ordinal, + "number_to_words": self.number_to_words, + "present_participle": self.present_participle, + "num": self.num, + } + + # Regular expression to find Python's function call syntax + functions_re = re.compile(r"((\w+)\([^)]*\)*)", re.IGNORECASE) + output = functions_re.sub( + lambda mo: self._string_to_substitute(mo, methods_dict), text + ) + self.persistent_count = save_persistent_count + return output + + # ## PLURAL SUBROUTINES + + def postprocess(self, orig, inflected): + if "|" in inflected: + inflected = inflected.split("|")[self.classical_dict["all"]] + result = inflected.split(" ") + # Try to fix word wise capitalization + for index, word in enumerate(orig.split(" ")): + if word == "I": + # Is this the only word for exceptions like this + # Where the original is fully capitalized + # without 'meaning' capitalization? + # Also this fails to handle a capitalizaion in context + continue + if word.capitalize() == word: + result[index] = result[index].capitalize() + if word == word.upper(): + result[index] = result[index].upper() + return " ".join(result) + + def partition_word(self, text): + mo = re.search(r"\A(\s*)(.+?)(\s*)\Z", text) + try: + return mo.group(1), mo.group(2), mo.group(3) + except AttributeError: # empty string + return "", "", "" + + def plural(self, text, count=None): + """ + Return the plural of text. 
+ + If count supplied, then return text if count is one of: + 1, a, an, one, each, every, this, that + otherwise return the plural. + + Whitespace at the start and end is preserved. + + """ + pre, word, post = self.partition_word(text) + if not word: + return text + plural = self.postprocess( + word, + self._pl_special_adjective(word, count) + or self._pl_special_verb(word, count) + or self._plnoun(word, count), + ) + return "{}{}{}".format(pre, plural, post) + + def plural_noun(self, text, count=None): + """ + Return the plural of text, where text is a noun. + + If count supplied, then return text if count is one of: + 1, a, an, one, each, every, this, that + otherwise return the plural. + + Whitespace at the start and end is preserved. + + """ + pre, word, post = self.partition_word(text) + if not word: + return text + plural = self.postprocess(word, self._plnoun(word, count)) + return "{}{}{}".format(pre, plural, post) + + def plural_verb(self, text, count=None): + """ + Return the plural of text, where text is a verb. + + If count supplied, then return text if count is one of: + 1, a, an, one, each, every, this, that + otherwise return the plural. + + Whitespace at the start and end is preserved. + + """ + pre, word, post = self.partition_word(text) + if not word: + return text + plural = self.postprocess( + word, + self._pl_special_verb(word, count) or self._pl_general_verb(word, count), + ) + return "{}{}{}".format(pre, plural, post) + + def plural_adj(self, text, count=None): + """ + Return the plural of text, where text is an adjective. + + If count supplied, then return text if count is one of: + 1, a, an, one, each, every, this, that + otherwise return the plural. + + Whitespace at the start and end is preserved. 
+ + """ + pre, word, post = self.partition_word(text) + if not word: + return text + plural = self.postprocess(word, self._pl_special_adjective(word, count) or word) + return "{}{}{}".format(pre, plural, post) + + def compare(self, word1, word2): + """ + compare word1 and word2 for equality regardless of plurality + + return values: + eq - the strings are equal + p:s - word1 is the plural of word2 + s:p - word2 is the plural of word1 + p:p - word1 and word2 are two different plural forms of the one word + False - otherwise + + """ + return ( + self._plequal(word1, word2, self.plural_noun) + or self._plequal(word1, word2, self.plural_verb) + or self._plequal(word1, word2, self.plural_adj) + ) + + def compare_nouns(self, word1, word2): + """ + compare word1 and word2 for equality regardless of plurality + word1 and word2 are to be treated as nouns + + return values: + eq - the strings are equal + p:s - word1 is the plural of word2 + s:p - word2 is the plural of word1 + p:p - word1 and word2 are two different plural forms of the one word + False - otherwise + + """ + return self._plequal(word1, word2, self.plural_noun) + + def compare_verbs(self, word1, word2): + """ + compare word1 and word2 for equality regardless of plurality + word1 and word2 are to be treated as verbs + + return values: + eq - the strings are equal + p:s - word1 is the plural of word2 + s:p - word2 is the plural of word1 + p:p - word1 and word2 are two different plural forms of the one word + False - otherwise + + """ + return self._plequal(word1, word2, self.plural_verb) + + def compare_adjs(self, word1, word2): + """ + compare word1 and word2 for equality regardless of plurality + word1 and word2 are to be treated as adjectives + + return values: + eq - the strings are equal + p:s - word1 is the plural of word2 + s:p - word2 is the plural of word1 + p:p - word1 and word2 are two different plural forms of the one word + False - otherwise + + """ + return self._plequal(word1, word2, 
self.plural_adj) + + def singular_noun(self, text, count=None, gender=None): + """ + Return the singular of text, where text is a plural noun. + + If count supplied, then return the singular if count is one of: + 1, a, an, one, each, every, this, that or if count is None + otherwise return text unchanged. + + Whitespace at the start and end is preserved. + + """ + pre, word, post = self.partition_word(text) + if not word: + return text + sing = self._sinoun(word, count=count, gender=gender) + if sing is not False: + plural = self.postprocess( + word, self._sinoun(word, count=count, gender=gender) + ) + return "{}{}{}".format(pre, plural, post) + return False + + def _plequal(self, word1, word2, pl): + classval = self.classical_dict.copy() + self.classical_dict = all_classical.copy() + if word1 == word2: + return "eq" + if word1 == pl(word2): + return "p:s" + if pl(word1) == word2: + return "s:p" + self.classical_dict = no_classical.copy() + if word1 == pl(word2): + return "p:s" + if pl(word1) == word2: + return "s:p" + self.classical_dict = classval.copy() + + if pl == self.plural or pl == self.plural_noun: + if self._pl_check_plurals_N(word1, word2): + return "p:p" + if self._pl_check_plurals_N(word2, word1): + return "p:p" + if pl == self.plural or pl == self.plural_adj: + if self._pl_check_plurals_adj(word1, word2): + return "p:p" + return False + + def _pl_reg_plurals(self, pair, stems, end1, end2): + pattern = r"({})({}\|\1{}|{}\|\1{})".format(stems, end1, end2, end2, end1) + return bool(re.search(pattern, pair)) + + def _pl_check_plurals_N(self, word1, word2): + stem_endings = ( + (pl_sb_C_a_ata, "as", "ata"), + (pl_sb_C_is_ides, "is", "ides"), + (pl_sb_C_a_ae, "s", "e"), + (pl_sb_C_en_ina, "ens", "ina"), + (pl_sb_C_um_a, "ums", "a"), + (pl_sb_C_us_i, "uses", "i"), + (pl_sb_C_on_a, "ons", "a"), + (pl_sb_C_o_i_stems, "os", "i"), + (pl_sb_C_ex_ices, "exes", "ices"), + (pl_sb_C_ix_ices, "ixes", "ices"), + (pl_sb_C_i, "s", "i"), + (pl_sb_C_im, "s", "im"), + 
(".*eau", "s", "x"), + (".*ieu", "s", "x"), + (".*tri", "xes", "ces"), + (".{2,}[yia]n", "xes", "ges"), + ) + pair = "{}|{}".format(word1, word2) + + return ( + pair in pl_sb_irregular_s.values() + or pair in pl_sb_irregular.values() + or pair in pl_sb_irregular_caps.values() + or any( + self._pl_reg_plurals(pair, stems, end1, end2) + for stems, end1, end2 in stem_endings + ) + ) + + def _pl_check_plurals_adj(self, word1, word2): + word1a = word1[: word1.rfind("'")] if word1.endswith(("'s", "'")) else "" + word2a = word2[: word2.rfind("'")] if word2.endswith(("'s", "'")) else "" + + return ( + word1a + and word2a + and ( + self._pl_check_plurals_N(word1a, word2a) + or self._pl_check_plurals_N(word2a, word1a) + ) + ) + + def get_count(self, count=None): + if count is None and self.persistent_count is not None: + count = self.persistent_count + + if count is not None: + count = ( + 1 + if ( + (str(count) in pl_count_one) + or ( + self.classical_dict["zero"] + and str(count).lower() in pl_count_zero + ) + ) + else 2 + ) + else: + count = "" + return count + + # @profile + def _plnoun(self, word, count=None): + count = self.get_count(count) + + # DEFAULT TO PLURAL + + if count == 1: + return word + + # HANDLE USER-DEFINED NOUNS + + value = self.ud_match(word, self.pl_sb_user_defined) + if value is not None: + return value + + # HANDLE EMPTY WORD, SINGULAR COUNT AND UNINFLECTED PLURALS + + if word == "": + return word + + lowerword = word.lower() + + if lowerword in pl_sb_uninflected_complete: + return word + + if word in pl_sb_uninflected_caps: + return word + + for k, v in pl_sb_uninflected_bysize.items(): + if lowerword[-k:] in v: + return word + + if self.classical_dict["herd"] and lowerword in pl_sb_uninflected_herd: + return word + + # HANDLE COMPOUNDS ("Governor General", "mother-in-law", "aide-de-camp", ETC.) 
+ + mo = re.search(r"^(?:%s)$" % pl_sb_postfix_adj_stems, word, re.IGNORECASE) + if mo and mo.group(2) != "": + return "{}{}".format(self._plnoun(mo.group(1), 2), mo.group(2)) + + if " a " in lowerword or "-a-" in lowerword: + mo = re.search(r"^(?:%s)$" % pl_sb_prep_dual_compound, word, re.IGNORECASE) + if mo and mo.group(2) != "" and mo.group(3) != "": + return "{}{}{}".format( + self._plnoun(mo.group(1), 2), mo.group(2), self._plnoun(mo.group(3)) + ) + + lowersplit = lowerword.split(" ") + if len(lowersplit) >= 3: + for numword in range(1, len(lowersplit) - 1): + if lowersplit[numword] in pl_prep_list_da: + return " ".join( + lowersplit[: numword - 1] + + [self._plnoun(lowersplit[numword - 1], 2)] + + lowersplit[numword:] + ) + + # only pluralize denominators in units + mo = re.search( + r"(?P.+)( (%s) .+)" % "|".join(["per", "a"]), lowerword + ) + if mo: + index = len(mo.group("denominator")) + return "{}{}".format(self._plnoun(word[:index]), word[index:]) + + # handle units given in degrees (only accept if + # there is no more than one word following) + # degree Celsius => degrees Celsius but degree + # fahrenheit hour => degree fahrenheit hours + if len(lowersplit) >= 2 and lowersplit[-2] in ["degree"]: + return " ".join([self._plnoun(lowersplit[0])] + lowersplit[1:]) + + lowersplit = lowerword.split("-") + if len(lowersplit) >= 3: + for numword in range(1, len(lowersplit) - 1): + if lowersplit[numword] in pl_prep_list_da: + return " ".join( + lowersplit[: numword - 1] + + [ + self._plnoun(lowersplit[numword - 1], 2) + + "-" + + lowersplit[numword] + + "-" + ] + ) + " ".join(lowersplit[(numword + 1) :]) + + # HANDLE PRONOUNS + + for k, v in pl_pron_acc_keys_bysize.items(): + if lowerword[-k:] in v: # ends with accusivate pronoun + for pk, pv in pl_prep_bysize.items(): + if lowerword[:pk] in pv: # starts with a prep + if lowerword.split() == [lowerword[:pk], lowerword[-k:]]: + # only whitespace in between + return lowerword[:-k] + pl_pron_acc[lowerword[-k:]] + 
+ try: + return pl_pron_nom[word.lower()] + except KeyError: + pass + + try: + return pl_pron_acc[word.lower()] + except KeyError: + pass + + # HANDLE ISOLATED IRREGULAR PLURALS + + wordsplit = word.split() + wordlast = wordsplit[-1] + lowerwordlast = wordlast.lower() + + if wordlast in list(pl_sb_irregular_caps.keys()): + llen = len(wordlast) + return "{}{}".format(word[:-llen], pl_sb_irregular_caps[wordlast]) + + if lowerwordlast in list(pl_sb_irregular.keys()): + llen = len(lowerwordlast) + return "{}{}".format(word[:-llen], pl_sb_irregular[lowerwordlast]) + + if (" ".join(wordsplit[-2:])).lower() in list(pl_sb_irregular_compound.keys()): + llen = len( + " ".join(wordsplit[-2:]) + ) # TODO: what if 2 spaces between these words? + return "{}{}".format( + word[:-llen], + pl_sb_irregular_compound[(" ".join(wordsplit[-2:])).lower()], + ) + + if lowerword[-3:] == "quy": + return word[:-1] + "ies" + + if lowerword[-6:] == "person": + if self.classical_dict["persons"]: + return word + "s" + else: + return word[:-4] + "ople" + + # HANDLE FAMILIES OF IRREGULAR PLURALS + + if lowerword[-3:] == "man": + for k, v in pl_sb_U_man_mans_bysize.items(): + if lowerword[-k:] in v: + return word + "s" + for k, v in pl_sb_U_man_mans_caps_bysize.items(): + if word[-k:] in v: + return word + "s" + return word[:-3] + "men" + if lowerword[-5:] == "mouse": + return word[:-5] + "mice" + if lowerword[-5:] == "louse": + return word[:-5] + "lice" + if lowerword[-5:] == "goose": + return word[:-5] + "geese" + if lowerword[-5:] == "tooth": + return word[:-5] + "teeth" + if lowerword[-4:] == "foot": + return word[:-4] + "feet" + if lowerword[-4:] == "taco": + return word[:-5] + "tacos" + + if lowerword == "die": + return "dice" + + # HANDLE UNASSIMILATED IMPORTS + + if lowerword[-4:] == "ceps": + return word + if lowerword[-4:] == "zoon": + return word[:-2] + "a" + if lowerword[-3:] in ("cis", "sis", "xis"): + return word[:-2] + "es" + + for lastlet, d, numend, post in ( + ("h", 
pl_sb_U_ch_chs_bysize, None, "s"), + ("x", pl_sb_U_ex_ices_bysize, -2, "ices"), + ("x", pl_sb_U_ix_ices_bysize, -2, "ices"), + ("m", pl_sb_U_um_a_bysize, -2, "a"), + ("s", pl_sb_U_us_i_bysize, -2, "i"), + ("n", pl_sb_U_on_a_bysize, -2, "a"), + ("a", pl_sb_U_a_ae_bysize, None, "e"), + ): + if lowerword[-1] == lastlet: # this test to add speed + for k, v in d.items(): + if lowerword[-k:] in v: + return word[:numend] + post + + # HANDLE INCOMPLETELY ASSIMILATED IMPORTS + + if self.classical_dict["ancient"]: + if lowerword[-4:] == "trix": + return word[:-1] + "ces" + if lowerword[-3:] in ("eau", "ieu"): + return word + "x" + if lowerword[-3:] in ("ynx", "inx", "anx") and len(word) > 4: + return word[:-1] + "ges" + + for lastlet, d, numend, post in ( + ("n", pl_sb_C_en_ina_bysize, -2, "ina"), + ("x", pl_sb_C_ex_ices_bysize, -2, "ices"), + ("x", pl_sb_C_ix_ices_bysize, -2, "ices"), + ("m", pl_sb_C_um_a_bysize, -2, "a"), + ("s", pl_sb_C_us_i_bysize, -2, "i"), + ("s", pl_sb_C_us_us_bysize, None, ""), + ("a", pl_sb_C_a_ae_bysize, None, "e"), + ("a", pl_sb_C_a_ata_bysize, None, "ta"), + ("s", pl_sb_C_is_ides_bysize, -1, "des"), + ("o", pl_sb_C_o_i_bysize, -1, "i"), + ("n", pl_sb_C_on_a_bysize, -2, "a"), + ): + if lowerword[-1] == lastlet: # this test to add speed + for k, v in d.items(): + if lowerword[-k:] in v: + return word[:numend] + post + + for d, numend, post in ( + (pl_sb_C_i_bysize, None, "i"), + (pl_sb_C_im_bysize, None, "im"), + ): + for k, v in d.items(): + if lowerword[-k:] in v: + return word[:numend] + post + + # HANDLE SINGULAR NOUNS ENDING IN ...s OR OTHER SILIBANTS + + if lowerword in pl_sb_singular_s_complete: + return word + "es" + + for k, v in pl_sb_singular_s_bysize.items(): + if lowerword[-k:] in v: + return word + "es" + + if lowerword[-2:] == "es" and word[0] == word[0].upper(): + return word + "es" + + if lowerword[-1] == "z": + for k, v in pl_sb_z_zes_bysize.items(): + if lowerword[-k:] in v: + return word + "es" + + if lowerword[-2:-1] != "z": + 
return word + "zes" + + if lowerword[-2:] == "ze": + for k, v in pl_sb_ze_zes_bysize.items(): + if lowerword[-k:] in v: + return word + "s" + + if lowerword[-2:] in ("ch", "sh", "zz", "ss") or lowerword[-1] == "x": + return word + "es" + + # HANDLE ...f -> ...ves + + if lowerword[-3:] in ("elf", "alf", "olf"): + return word[:-1] + "ves" + if lowerword[-3:] == "eaf" and lowerword[-4:-3] != "d": + return word[:-1] + "ves" + if lowerword[-4:] in ("nife", "life", "wife"): + return word[:-2] + "ves" + if lowerword[-3:] == "arf": + return word[:-1] + "ves" + + # HANDLE ...y + + if lowerword[-1] == "y": + if lowerword[-2:-1] in "aeiou" or len(word) == 1: + return word + "s" + + if self.classical_dict["names"]: + if lowerword[-1] == "y" and word[0] == word[0].upper(): + return word + "s" + + return word[:-1] + "ies" + + # HANDLE ...o + + if lowerword in pl_sb_U_o_os_complete: + return word + "s" + + for k, v in pl_sb_U_o_os_bysize.items(): + if lowerword[-k:] in v: + return word + "s" + + if lowerword[-2:] in ("ao", "eo", "io", "oo", "uo"): + return word + "s" + + if lowerword[-1] == "o": + return word + "es" + + # OTHERWISE JUST ADD ...s + + return "%ss" % word + + def _pl_special_verb(self, word, count=None): + if self.classical_dict["zero"] and str(count).lower() in pl_count_zero: + return False + count = self.get_count(count) + + if count == 1: + return word + + # HANDLE USER-DEFINED VERBS + + value = self.ud_match(word, self.pl_v_user_defined) + if value is not None: + return value + + # HANDLE IRREGULAR PRESENT TENSE (SIMPLE AND COMPOUND) + + lowerword = word.lower() + try: + firstword = lowerword.split()[0] + except IndexError: + return False # word is '' + + if firstword in list(plverb_irregular_pres.keys()): + return "{}{}".format( + plverb_irregular_pres[firstword], word[len(firstword) :] + ) + + # HANDLE IRREGULAR FUTURE, PRETERITE AND PERFECT TENSES + + if firstword in plverb_irregular_non_pres: + return word + + # HANDLE PRESENT NEGATIONS (SIMPLE AND COMPOUND) 
+ + if firstword.endswith("n't") and firstword[:-3] in list( + plverb_irregular_pres.keys() + ): + return "{}n't{}".format( + plverb_irregular_pres[firstword[:-3]], word[len(firstword) :] + ) + + if firstword.endswith("n't"): + return word + + # HANDLE SPECIAL CASES + + mo = re.search(r"^(%s)$" % plverb_special_s, word) + if mo: + return False + if re.search(r"\s", word): + return False + if lowerword == "quizzes": + return "quiz" + + # HANDLE STANDARD 3RD PERSON (CHOP THE ...(e)s OFF SINGLE WORDS) + + if ( + lowerword[-4:] in ("ches", "shes", "zzes", "sses") + or lowerword[-3:] == "xes" + ): + return word[:-2] + + if lowerword[-3:] == "ies" and len(word) > 3: + return lowerword[:-3] + "y" + + if ( + lowerword in pl_v_oes_oe + or lowerword[-4:] in pl_v_oes_oe_endings_size4 + or lowerword[-5:] in pl_v_oes_oe_endings_size5 + ): + return word[:-1] + + if lowerword.endswith("oes") and len(word) > 3: + return lowerword[:-2] + + mo = re.search(r"^(.*[^s])s$", word, re.IGNORECASE) + if mo: + return mo.group(1) + + # OTHERWISE, A REGULAR VERB (HANDLE ELSEWHERE) + + return False + + def _pl_general_verb(self, word, count=None): + count = self.get_count(count) + + if count == 1: + return word + + # HANDLE AMBIGUOUS PRESENT TENSES (SIMPLE AND COMPOUND) + + mo = re.search( + r"^(%s)((\s.*)?)$" % plverb_ambiguous_pres_keys, word, re.IGNORECASE + ) + if mo: + return "{}{}".format( + plverb_ambiguous_pres[mo.group(1).lower()], mo.group(2) + ) + + # HANDLE AMBIGUOUS PRETERITE AND PERFECT TENSES + + mo = re.search( + r"^(%s)((\s.*)?)$" % plverb_ambiguous_non_pres, word, re.IGNORECASE + ) + if mo: + return word + + # OTHERWISE, 1st OR 2ND PERSON IS UNINFLECTED + + return word + + def _pl_special_adjective(self, word, count=None): + count = self.get_count(count) + + if count == 1: + return word + + # HANDLE USER-DEFINED ADJECTIVES + + value = self.ud_match(word, self.pl_adj_user_defined) + if value is not None: + return value + + # HANDLE KNOWN CASES + + mo = re.search(r"^(%s)$" % 
pl_adj_special_keys, word, re.IGNORECASE) + if mo: + return "%s" % (pl_adj_special[mo.group(1).lower()]) + + # HANDLE POSSESSIVES + + mo = re.search(r"^(%s)$" % pl_adj_poss_keys, word, re.IGNORECASE) + if mo: + return "%s" % (pl_adj_poss[mo.group(1).lower()]) + + mo = re.search(r"^(.*)'s?$", word) + if mo: + pl = self.plural_noun(mo.group(1)) + trailing_s = "" if pl[-1] == "s" else "s" + return "{}'{}".format(pl, trailing_s) + + # OTHERWISE, NO IDEA + + return False + + # @profile + def _sinoun(self, word, count=None, gender=None): + count = self.get_count(count) + + # DEFAULT TO PLURAL + + if count == 2: + return word + + # SET THE GENDER + + try: + if gender is None: + gender = self.thegender + elif gender not in singular_pronoun_genders: + raise BadGenderError + except (TypeError, IndexError): + raise BadGenderError + + # HANDLE USER-DEFINED NOUNS + + value = self.ud_match(word, self.si_sb_user_defined) + if value is not None: + return value + + # HANDLE EMPTY WORD, SINGULAR COUNT AND UNINFLECTED PLURALS + + if word == "": + return word + + lowerword = word.lower() + + if word in si_sb_ois_oi_case: + return word[:-1] + + if lowerword in pl_sb_uninflected_complete: + return word + + if word in pl_sb_uninflected_caps: + return word + + for k, v in pl_sb_uninflected_bysize.items(): + if lowerword[-k:] in v: + return word + + if self.classical_dict["herd"] and lowerword in pl_sb_uninflected_herd: + return word + + if lowerword in pl_sb_C_us_us: + return word + + # HANDLE COMPOUNDS ("Governor General", "mother-in-law", "aide-de-camp", ETC.) 
+ + mo = re.search(r"^(?:%s)$" % pl_sb_postfix_adj_stems, word, re.IGNORECASE) + if mo and mo.group(2) != "": + return "{}{}".format( + self._sinoun(mo.group(1), 1, gender=gender), mo.group(2) + ) + + lowersplit = lowerword.split(" ") + if len(lowersplit) >= 3: + for numword in range(1, len(lowersplit) - 1): + if lowersplit[numword] in pl_prep_list_da: + return " ".join( + lowersplit[: numword - 1] + + [ + self._sinoun(lowersplit[numword - 1], 1, gender=gender) + or lowersplit[numword - 1] + ] + + lowersplit[numword:] + ) + + lowersplit = lowerword.split("-") + if len(lowersplit) >= 3: + for numword in range(1, len(lowersplit) - 1): + if lowersplit[numword] in pl_prep_list_da: + return " ".join( + lowersplit[: numword - 1] + + [ + ( + self._sinoun(lowersplit[numword - 1], 1, gender=gender) + or lowersplit[numword - 1] + ) + + "-" + + lowersplit[numword] + + "-" + ] + ) + " ".join(lowersplit[(numword + 1) :]) + + # HANDLE PRONOUNS + + for k, v in si_pron_acc_keys_bysize.items(): + if lowerword[-k:] in v: # ends with accusivate pronoun + for pk, pv in pl_prep_bysize.items(): + if lowerword[:pk] in pv: # starts with a prep + if lowerword.split() == [lowerword[:pk], lowerword[-k:]]: + # only whitespace in between + return lowerword[:-k] + get_si_pron( + "acc", lowerword[-k:], gender + ) + + try: + return get_si_pron("nom", word.lower(), gender) + except KeyError: + pass + + try: + return get_si_pron("acc", word.lower(), gender) + except KeyError: + pass + + # HANDLE ISOLATED IRREGULAR PLURALS + + wordsplit = word.split() + wordlast = wordsplit[-1] + lowerwordlast = wordlast.lower() + + if wordlast in list(si_sb_irregular_caps.keys()): + llen = len(wordlast) + return "{}{}".format(word[:-llen], si_sb_irregular_caps[wordlast]) + + if lowerwordlast in list(si_sb_irregular.keys()): + llen = len(lowerwordlast) + return "{}{}".format(word[:-llen], si_sb_irregular[lowerwordlast]) + + if (" ".join(wordsplit[-2:])).lower() in list(si_sb_irregular_compound.keys()): + llen = len( 
+ " ".join(wordsplit[-2:]) + ) # TODO: what if 2 spaces between these words? + return "{}{}".format( + word[:-llen], + si_sb_irregular_compound[(" ".join(wordsplit[-2:])).lower()], + ) + + if lowerword[-5:] == "quies": + return word[:-3] + "y" + + if lowerword[-7:] == "persons": + return word[:-1] + if lowerword[-6:] == "people": + return word[:-4] + "rson" + + # HANDLE FAMILIES OF IRREGULAR PLURALS + + if lowerword[-4:] == "mans": + for k, v in si_sb_U_man_mans_bysize.items(): + if lowerword[-k:] in v: + return word[:-1] + for k, v in si_sb_U_man_mans_caps_bysize.items(): + if word[-k:] in v: + return word[:-1] + if lowerword[-3:] == "men": + return word[:-3] + "man" + if lowerword[-4:] == "mice": + return word[:-4] + "mouse" + if lowerword[-4:] == "lice": + return word[:-4] + "louse" + if lowerword[-5:] == "geese": + return word[:-5] + "goose" + if lowerword[-5:] == "teeth": + return word[:-5] + "tooth" + if lowerword[-4:] == "feet": + return word[:-4] + "foot" + + if lowerword == "dice": + return "die" + + # HANDLE UNASSIMILATED IMPORTS + + if lowerword[-4:] == "ceps": + return word + if lowerword[-3:] == "zoa": + return word[:-1] + "on" + + for lastlet, d, numend, post in ( + ("s", si_sb_U_ch_chs_bysize, -1, ""), + ("s", si_sb_U_ex_ices_bysize, -4, "ex"), + ("s", si_sb_U_ix_ices_bysize, -4, "ix"), + ("a", si_sb_U_um_a_bysize, -1, "um"), + ("i", si_sb_U_us_i_bysize, -1, "us"), + ("a", si_sb_U_on_a_bysize, -1, "on"), + ("e", si_sb_U_a_ae_bysize, -1, ""), + ): + if lowerword[-1] == lastlet: # this test to add speed + for k, v in d.items(): + if lowerword[-k:] in v: + return word[:numend] + post + + # HANDLE INCOMPLETELY ASSIMILATED IMPORTS + + if self.classical_dict["ancient"]: + + if lowerword[-6:] == "trices": + return word[:-3] + "x" + if lowerword[-4:] in ("eaux", "ieux"): + return word[:-1] + if lowerword[-5:] in ("ynges", "inges", "anges") and len(word) > 6: + return word[:-3] + "x" + + for lastlet, d, numend, post in ( + ("a", si_sb_C_en_ina_bysize, -3, 
"en"), + ("s", si_sb_C_ex_ices_bysize, -4, "ex"), + ("s", si_sb_C_ix_ices_bysize, -4, "ix"), + ("a", si_sb_C_um_a_bysize, -1, "um"), + ("i", si_sb_C_us_i_bysize, -1, "us"), + ("s", pl_sb_C_us_us_bysize, None, ""), + ("e", si_sb_C_a_ae_bysize, -1, ""), + ("a", si_sb_C_a_ata_bysize, -2, ""), + ("s", si_sb_C_is_ides_bysize, -3, "s"), + ("i", si_sb_C_o_i_bysize, -1, "o"), + ("a", si_sb_C_on_a_bysize, -1, "on"), + ("m", si_sb_C_im_bysize, -2, ""), + ("i", si_sb_C_i_bysize, -1, ""), + ): + if lowerword[-1] == lastlet: # this test to add speed + for k, v in d.items(): + if lowerword[-k:] in v: + return word[:numend] + post + + # HANDLE PLURLS ENDING IN uses -> use + + if ( + lowerword[-6:] == "houses" + or word in si_sb_uses_use_case + or lowerword in si_sb_uses_use + ): + return word[:-1] + + # HANDLE PLURLS ENDING IN ies -> ie + + if word in si_sb_ies_ie_case or lowerword in si_sb_ies_ie: + return word[:-1] + + # HANDLE PLURLS ENDING IN oes -> oe + + if ( + lowerword[-5:] == "shoes" + or word in si_sb_oes_oe_case + or lowerword in si_sb_oes_oe + ): + return word[:-1] + + # HANDLE SINGULAR NOUNS ENDING IN ...s OR OTHER SILIBANTS + + if word in si_sb_sses_sse_case or lowerword in si_sb_sses_sse: + return word[:-1] + + if lowerword in si_sb_singular_s_complete: + return word[:-2] + + for k, v in si_sb_singular_s_bysize.items(): + if lowerword[-k:] in v: + return word[:-2] + + if lowerword[-4:] == "eses" and word[0] == word[0].upper(): + return word[:-2] + + if lowerword in si_sb_z_zes: + return word[:-2] + + if lowerword in si_sb_zzes_zz: + return word[:-2] + + if lowerword[-4:] == "zzes": + return word[:-3] + + if word in si_sb_ches_che_case or lowerword in si_sb_ches_che: + return word[:-1] + + if lowerword[-4:] in ("ches", "shes"): + return word[:-2] + + if lowerword in si_sb_xes_xe: + return word[:-1] + + if lowerword[-3:] == "xes": + return word[:-2] + + # HANDLE ...f -> ...ves + + if word in si_sb_ves_ve_case or lowerword in si_sb_ves_ve: + return word[:-1] + + if 
lowerword[-3:] == "ves": + if lowerword[-5:-3] in ("el", "al", "ol"): + return word[:-3] + "f" + if lowerword[-5:-3] == "ea" and word[-6:-5] != "d": + return word[:-3] + "f" + if lowerword[-5:-3] in ("ni", "li", "wi"): + return word[:-3] + "fe" + if lowerword[-5:-3] == "ar": + return word[:-3] + "f" + + # HANDLE ...y + + if lowerword[-2:] == "ys": + if len(lowerword) > 2 and lowerword[-3] in "aeiou": + return word[:-1] + + if self.classical_dict["names"]: + if lowerword[-2:] == "ys" and word[0] == word[0].upper(): + return word[:-1] + + if lowerword[-3:] == "ies": + return word[:-3] + "y" + + # HANDLE ...o + + if lowerword[-2:] == "os": + + if lowerword in si_sb_U_o_os_complete: + return word[:-1] + + for k, v in si_sb_U_o_os_bysize.items(): + if lowerword[-k:] in v: + return word[:-1] + + if lowerword[-3:] in ("aos", "eos", "ios", "oos", "uos"): + return word[:-1] + + if lowerword[-3:] == "oes": + return word[:-2] + + # UNASSIMILATED IMPORTS FINAL RULE + + if word in si_sb_es_is: + return word[:-2] + "is" + + # OTHERWISE JUST REMOVE ...s + + if lowerword[-1] == "s": + return word[:-1] + + # COULD NOT FIND SINGULAR + + return False + + # ADJECTIVES + + def a(self, text, count=1): + """ + Return the appropriate indefinite article followed by text. + + The indefinite article is either 'a' or 'an'. + + If count is not one, then return count followed by text + instead of 'a' or 'an'. + + Whitespace at the start and end is preserved. 
+ + """ + mo = re.search(r"\A(\s*)(?:an?\s+)?(.+?)(\s*)\Z", text, re.IGNORECASE) + if mo: + word = mo.group(2) + if not word: + return text + pre = mo.group(1) + post = mo.group(3) + result = self._indef_article(word, count) + return "{}{}{}".format(pre, result, post) + return "" + + an = a + + def _indef_article(self, word, count): + mycount = self.get_count(count) + + if mycount != 1: + return "{} {}".format(count, word) + + # HANDLE USER-DEFINED VARIANTS + + value = self.ud_match(word, self.A_a_user_defined) + if value is not None: + return "{} {}".format(value, word) + + # HANDLE ORDINAL FORMS + + for a in ((r"^(%s)" % A_ordinal_a, "a"), (r"^(%s)" % A_ordinal_an, "an")): + mo = re.search(a[0], word, re.IGNORECASE) + if mo: + return "{} {}".format(a[1], word) + + # HANDLE SPECIAL CASES + + for a in ( + (r"^(%s)" % A_explicit_an, "an"), + (r"^[aefhilmnorsx]$", "an"), + (r"^[bcdgjkpqtuvwyz]$", "a"), + ): + mo = re.search(a[0], word, re.IGNORECASE) + if mo: + return "{} {}".format(a[1], word) + + # HANDLE ABBREVIATIONS + + for a in ( + (r"(%s)" % A_abbrev, "an", re.VERBOSE), + (r"^[aefhilmnorsx][.-]", "an", re.IGNORECASE), + (r"^[a-z][.-]", "a", re.IGNORECASE), + ): + mo = re.search(a[0], word, a[2]) + if mo: + return "{} {}".format(a[1], word) + + # HANDLE CONSONANTS + + mo = re.search(r"^[^aeiouy]", word, re.IGNORECASE) + if mo: + return "a %s" % word + + # HANDLE SPECIAL VOWEL-FORMS + + for a in ( + (r"^e[uw]", "a"), + (r"^onc?e\b", "a"), + (r"^onetime\b", "a"), + (r"^uni([^nmd]|mo)", "a"), + (r"^u[bcfghjkqrst][aeiou]", "a"), + (r"^ukr", "a"), + (r"^(%s)" % A_explicit_a, "a"), + ): + mo = re.search(a[0], word, re.IGNORECASE) + if mo: + return "{} {}".format(a[1], word) + + # HANDLE SPECIAL CAPITALS + + mo = re.search(r"^U[NK][AIEO]?", word) + if mo: + return "a %s" % word + + # HANDLE VOWELS + + mo = re.search(r"^[aeiou]", word, re.IGNORECASE) + if mo: + return "an %s" % word + + # HANDLE y... (BEFORE CERTAIN CONSONANTS IMPLIES (UNNATURALIZED) "i.." 
SOUND) + + mo = re.search(r"^(%s)" % A_y_cons, word, re.IGNORECASE) + if mo: + return "an %s" % word + + # OTHERWISE, GUESS "a" + return "a %s" % word + + # 2. TRANSLATE ZERO-QUANTIFIED $word TO "no plural($word)" + + def no(self, text, count=None): + """ + If count is 0, no, zero or nil, return 'no' followed by the plural + of text. + + If count is one of: + 1, a, an, one, each, every, this, that + return count followed by text. + + Otherwise return count follow by the plural of text. + + In the return value count is always followed by a space. + + Whitespace at the start and end is preserved. + + """ + if count is None and self.persistent_count is not None: + count = self.persistent_count + + if count is None: + count = 0 + mo = re.search(r"\A(\s*)(.+?)(\s*)\Z", text) + pre = mo.group(1) + word = mo.group(2) + post = mo.group(3) + + if str(count).lower() in pl_count_zero: + return "{}no {}{}".format(pre, self.plural(word, 0), post) + else: + return "{}{} {}{}".format(pre, count, self.plural(word, count), post) + + # PARTICIPLES + + def present_participle(self, word): + """ + Return the present participle for word. + + word is the 3rd person singular verb. + + """ + plv = self.plural_verb(word, 2) + + for pat, repl in ( + (r"ie$", r"y"), + (r"ue$", r"u"), # TODO: isn't ue$ -> u encompassed in the following rule? + (r"([auy])e$", r"\g<1>"), + (r"ski$", r"ski"), + (r"[^b]i$", r""), + (r"^(are|were)$", r"be"), + (r"^(had)$", r"hav"), + (r"^(hoe)$", r"\g<1>"), + (r"([^e])e$", r"\g<1>"), + (r"er$", r"er"), + (r"([^aeiou][aeiouy]([bdgmnprst]))$", r"\g<1>\g<2>"), + ): + (ans, num) = re.subn(pat, repl, plv) + if num: + return "%sing" % ans + return "%sing" % ans + + # NUMERICAL INFLECTIONS + + def ordinal(self, num): + """ + Return the ordinal of num. + + num can be an integer or text + + e.g. ordinal(1) returns '1st' + ordinal('one') returns 'first' + + """ + if re.match(r"\d", str(num)): + try: + num % 2 + n = num + except TypeError: + if "." 
in str(num): + try: + # numbers after decimal, + # so only need last one for ordinal + n = int(num[-1]) + + except ValueError: # ends with '.', so need to use whole string + n = int(num[:-1]) + else: + n = int(num) + try: + post = nth[n % 100] + except KeyError: + post = nth[n % 10] + return "{}{}".format(num, post) + else: + mo = re.search(r"(%s)\Z" % ordinal_suff, num) + try: + post = ordinal[mo.group(1)] + return re.sub(r"(%s)\Z" % ordinal_suff, post, num) + except AttributeError: + return "%sth" % num + + def millfn(self, ind=0): + if ind > len(mill) - 1: + print3("number out of range") + raise NumOutOfRangeError + return mill[ind] + + def unitfn(self, units, mindex=0): + return "{}{}".format(unit[units], self.millfn(mindex)) + + def tenfn(self, tens, units, mindex=0): + if tens != 1: + return "{}{}{}{}".format( + ten[tens], + "-" if tens and units else "", + unit[units], + self.millfn(mindex), + ) + return "{}{}".format(teen[units], mill[mindex]) + + def hundfn(self, hundreds, tens, units, mindex): + if hundreds: + andword = " %s " % self.number_args["andword"] if tens or units else "" + return "{} hundred{}{}{}, ".format( + unit[hundreds], # use unit not unitfn as simpler + andword, + self.tenfn(tens, units), + self.millfn(mindex), + ) + if tens or units: + return "{}{}, ".format(self.tenfn(tens, units), self.millfn(mindex)) + return "" + + def group1sub(self, mo): + units = int(mo.group(1)) + if units == 1: + return " %s, " % self.number_args["one"] + elif units: + return "%s, " % unit[units] + else: + return " %s, " % self.number_args["zero"] + + def group1bsub(self, mo): + units = int(mo.group(1)) + if units: + return "%s, " % unit[units] + else: + return " %s, " % self.number_args["zero"] + + def group2sub(self, mo): + tens = int(mo.group(1)) + units = int(mo.group(2)) + if tens: + return "%s, " % self.tenfn(tens, units) + if units: + return " {} {}, ".format(self.number_args["zero"], unit[units]) + return " {} {}, ".format(self.number_args["zero"], 
self.number_args["zero"]) + + def group3sub(self, mo): + hundreds = int(mo.group(1)) + tens = int(mo.group(2)) + units = int(mo.group(3)) + if hundreds == 1: + hunword = " %s" % self.number_args["one"] + elif hundreds: + hunword = "%s" % unit[hundreds] + else: + hunword = " %s" % self.number_args["zero"] + if tens: + tenword = self.tenfn(tens, units) + elif units: + tenword = " {} {}".format(self.number_args["zero"], unit[units]) + else: + tenword = " {} {}".format( + self.number_args["zero"], self.number_args["zero"] + ) + return "{} {}, ".format(hunword, tenword) + + def hundsub(self, mo): + ret = self.hundfn( + int(mo.group(1)), int(mo.group(2)), int(mo.group(3)), self.mill_count + ) + self.mill_count += 1 + return ret + + def tensub(self, mo): + return "%s, " % self.tenfn(int(mo.group(1)), int(mo.group(2)), self.mill_count) + + def unitsub(self, mo): + return "%s, " % self.unitfn(int(mo.group(1)), self.mill_count) + + def enword(self, num, group): + # import pdb + # pdb.set_trace() + + if group == 1: + num = re.sub(r"(\d)", self.group1sub, num) + elif group == 2: + num = re.sub(r"(\d)(\d)", self.group2sub, num) + num = re.sub(r"(\d)", self.group1bsub, num, 1) + elif group == 3: + num = re.sub(r"(\d)(\d)(\d)", self.group3sub, num) + num = re.sub(r"(\d)(\d)", self.group2sub, num, 1) + num = re.sub(r"(\d)", self.group1sub, num, 1) + elif int(num) == 0: + num = self.number_args["zero"] + elif int(num) == 1: + num = self.number_args["one"] + else: + num = num.lstrip().lstrip("0") + self.mill_count = 0 + # surely there's a better way to do the next bit + mo = re.search(r"(\d)(\d)(\d)(?=\D*\Z)", num) + while mo: + num = re.sub(r"(\d)(\d)(\d)(?=\D*\Z)", self.hundsub, num, 1) + mo = re.search(r"(\d)(\d)(\d)(?=\D*\Z)", num) + num = re.sub(r"(\d)(\d)(?=\D*\Z)", self.tensub, num, 1) + num = re.sub(r"(\d)(?=\D*\Z)", self.unitsub, num, 1) + return num + + def blankfn(self, mo): + """ do a global blank replace + TODO: surely this can be done with an option to re.sub + rather 
than this fn + """ + return "" + + def commafn(self, mo): + """ do a global ',' replace + TODO: surely this can be done with an option to re.sub + rather than this fn + """ + return "," + + def spacefn(self, mo): + """ do a global ' ' replace + TODO: surely this can be done with an option to re.sub + rather than this fn + """ + return " " + + def number_to_words( + self, + num, + wantlist=False, + group=0, + comma=",", + andword="and", + zero="zero", + one="one", + decimal="point", + threshold=None, + ): + """ + Return a number in words. + + group = 1, 2 or 3 to group numbers before turning into words + comma: define comma + andword: word for 'and'. Can be set to ''. + e.g. "one hundred and one" vs "one hundred one" + zero: word for '0' + one: word for '1' + decimal: word for decimal point + threshold: numbers above threshold not turned into words + + parameters not remembered from last call. Departure from Perl version. + """ + self.number_args = dict(andword=andword, zero=zero, one=one) + num = "%s" % num + + # Handle "stylistic" conversions (up to a given threshold)... 
+ if threshold is not None and float(num) > threshold: + spnum = num.split(".", 1) + while comma: + (spnum[0], n) = re.subn(r"(\d)(\d{3}(?:,|\Z))", r"\1,\2", spnum[0]) + if n == 0: + break + try: + return "{}.{}".format(spnum[0], spnum[1]) + except IndexError: + return "%s" % spnum[0] + + if group < 0 or group > 3: + raise BadChunkingOptionError + nowhite = num.lstrip() + if nowhite[0] == "+": + sign = "plus" + elif nowhite[0] == "-": + sign = "minus" + else: + sign = "" + + myord = num[-2:] in ("st", "nd", "rd", "th") + if myord: + num = num[:-2] + finalpoint = False + if decimal: + if group != 0: + chunks = num.split(".") + else: + chunks = num.split(".", 1) + if chunks[-1] == "": # remove blank string if nothing after decimal + chunks = chunks[:-1] + finalpoint = True # add 'point' to end of output + else: + chunks = [num] + + first = 1 + loopstart = 0 + + if chunks[0] == "": + first = 0 + if len(chunks) > 1: + loopstart = 1 + + for i in range(loopstart, len(chunks)): + chunk = chunks[i] + # remove all non numeric \D + chunk = re.sub(r"\D", self.blankfn, chunk) + if chunk == "": + chunk = "0" + + if group == 0 and (first == 0 or first == ""): + chunk = self.enword(chunk, 1) + else: + chunk = self.enword(chunk, group) + + if chunk[-2:] == ", ": + chunk = chunk[:-2] + chunk = re.sub(r"\s+,", self.commafn, chunk) + + if group == 0 and first: + chunk = re.sub(r", (\S+)\s+\Z", " %s \\1" % andword, chunk) + chunk = re.sub(r"\s+", self.spacefn, chunk) + # chunk = re.sub(r"(\A\s|\s\Z)", self.blankfn, chunk) + chunk = chunk.strip() + if first: + first = "" + chunks[i] = chunk + + numchunks = [] + if first != 0: + numchunks = chunks[0].split("%s " % comma) + + if myord and numchunks: + # TODO: can this be just one re as it is in perl? 
+ mo = re.search(r"(%s)\Z" % ordinal_suff, numchunks[-1]) + if mo: + numchunks[-1] = re.sub( + r"(%s)\Z" % ordinal_suff, ordinal[mo.group(1)], numchunks[-1] + ) + else: + numchunks[-1] += "th" + + for chunk in chunks[1:]: + numchunks.append(decimal) + numchunks.extend(chunk.split("%s " % comma)) + + if finalpoint: + numchunks.append(decimal) + + # wantlist: Perl list context. can explictly specify in Python + if wantlist: + if sign: + numchunks = [sign] + numchunks + return numchunks + elif group: + signout = "%s " % sign if sign else "" + return "{}{}".format(signout, ", ".join(numchunks)) + else: + signout = "%s " % sign if sign else "" + num = "{}{}".format(signout, numchunks.pop(0)) + if decimal is None: + first = True + else: + first = not num.endswith(decimal) + for nc in numchunks: + if nc == decimal: + num += " %s" % nc + first = 0 + elif first: + num += "{} {}".format(comma, nc) + else: + num += " %s" % nc + return num + + # Join words with commas and a trailing 'and' (when appropriate)... + + def join( + self, + words, + sep=None, + sep_spaced=True, + final_sep=None, + conj="and", + conj_spaced=True, + ): + """ + Join words into a list. + + e.g. join(['ant', 'bee', 'fly']) returns 'ant, bee, and fly' + + options: + conj: replacement for 'and' + sep: separator. default ',', unless ',' is in the list then ';' + final_sep: final separator. default ',', unless ',' is in the list then ';' + conj_spaced: boolean. 
Should conj have spaces around it + + """ + if not words: + return "" + if len(words) == 1: + return words[0] + + if conj_spaced: + if conj == "": + conj = " " + else: + conj = " %s " % conj + + if len(words) == 2: + return "{}{}{}".format(words[0], conj, words[1]) + + if sep is None: + if "," in "".join(words): + sep = ";" + else: + sep = "," + if final_sep is None: + final_sep = sep + + final_sep = "{}{}".format(final_sep, conj) + + if sep_spaced: + sep += " " + + return "{}{}{}".format(sep.join(words[0:-1]), final_sep, words[-1]) diff --git a/libs/pysubs2/ssastyle.py b/libs/pysubs2/ssastyle.py index e43e1ff07..522f8ce0d 100644 --- a/libs/pysubs2/ssastyle.py +++ b/libs/pysubs2/ssastyle.py @@ -56,7 +56,7 @@ class SSAStyle(object): self.encoding = 1 #: Charset for k, v in fields.items(): - if k in self.FIELDS: + if k in self.FIELDS and v is not None: setattr(self, k, v) else: raise ValueError("SSAStyle has no field named %r" % k) diff --git a/libs/pysubs2/substation.py b/libs/pysubs2/substation.py index 0e5a1b707..fc4172a49 100644 --- a/libs/pysubs2/substation.py +++ b/libs/pysubs2/substation.py @@ -150,7 +150,14 @@ class SubstationFormat(FormatBase): if format_ == "ass": return ass_rgba_to_color(v) else: - return ssa_rgb_to_color(v) + try: + return ssa_rgb_to_color(v) + except ValueError: + try: + return ass_rgba_to_color(v) + except: + return Color(255, 255, 255, 0) + elif f in {"bold", "underline", "italic", "strikeout"}: return v == "-1" elif f in {"borderstyle", "encoding", "marginl", "marginr", "marginv", "layer", "alphalevel"}: diff --git a/libs/subliminal_patch/core.py b/libs/subliminal_patch/core.py index 3787b9fb3..9fe8587d9 100644 --- a/libs/subliminal_patch/core.py +++ b/libs/subliminal_patch/core.py @@ -493,7 +493,7 @@ def scan_video(path, dont_use_actual_file=False, hints=None, providers=None, ski raise ValueError('%r is not a valid video extension' % os.path.splitext(path)[1]) dirpath, filename = os.path.split(path) - logger.info('Scanning video %r 
in %r', filename, dirpath) + logger.info('Determining basic video properties for %r in %r', filename, dirpath) # hint guessit the filename itself and its 2 parent directories if we're an episode (most likely # Series name/Season/filename), else only one diff --git a/libs/subliminal_patch/providers/addic7ed.py b/libs/subliminal_patch/providers/addic7ed.py index 269dd6aa9..51913d887 100644 --- a/libs/subliminal_patch/providers/addic7ed.py +++ b/libs/subliminal_patch/providers/addic7ed.py @@ -84,32 +84,35 @@ class Addic7edProvider(_Addic7edProvider): # login if self.username and self.password: ccks = region.get("addic7ed_cookies", expiration_time=86400) - do_login = False if ccks != NO_VALUE: - self.session.cookies.update(ccks) - r = self.session.get(self.server_url + 'panel.php', allow_redirects=False, timeout=10) - if r.status_code == 302: - logger.info('Addic7ed: Login expired') - do_login = True - else: - logger.info('Addic7ed: Reusing old login') - self.logged_in = True - - if do_login: - logger.info('Addic7ed: Logging in') - data = {'username': self.username, 'password': self.password, 'Submit': 'Log in'} - r = self.session.post(self.server_url + 'dologin.php', data, allow_redirects=False, timeout=10) - - if "relax, slow down" in r.content: - raise TooManyRequests(self.username) - - if r.status_code != 302: - raise AuthenticationError(self.username) - - region.set("addic7ed_cookies", r.cookies) - - logger.debug('Addic7ed: Logged in') - self.logged_in = True + try: + self.session.cookies._cookies.update(ccks) + r = self.session.get(self.server_url + 'panel.php', allow_redirects=False, timeout=10) + if r.status_code == 302: + logger.info('Addic7ed: Login expired') + region.delete("addic7ed_cookies") + else: + logger.info('Addic7ed: Reusing old login') + self.logged_in = True + return + except: + pass + + logger.info('Addic7ed: Logging in') + data = {'username': self.username, 'password': self.password, 'Submit': 'Log in'} + r = self.session.post(self.server_url + 
'dologin.php', data, allow_redirects=False, timeout=10, + headers={"Referer": self.server_url + "login.php"}) + + if "relax, slow down" in r.content: + raise TooManyRequests(self.username) + + if r.status_code != 302: + raise AuthenticationError(self.username) + + region.set("addic7ed_cookies", self.session.cookies._cookies) + + logger.debug('Addic7ed: Logged in') + self.logged_in = True @region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME) diff --git a/libs/subliminal_patch/providers/podnapisi.py b/libs/subliminal_patch/providers/podnapisi.py index 06a9d33ce..16b7c2d7e 100644 --- a/libs/subliminal_patch/providers/podnapisi.py +++ b/libs/subliminal_patch/providers/podnapisi.py @@ -18,7 +18,6 @@ except ImportError: import xml.etree.cElementTree as etree except ImportError: import xml.etree.ElementTree as etree - from babelfish import language_converters from subliminal import Episode from subliminal import Movie diff --git a/libs/subliminal_patch/providers/subscene.py b/libs/subliminal_patch/providers/subscene.py index 9f00975d6..38a97c579 100644 --- a/libs/subliminal_patch/providers/subscene.py +++ b/libs/subliminal_patch/providers/subscene.py @@ -4,6 +4,7 @@ import io import logging import os import time +import inflect from random import randint from zipfile import ZipFile @@ -20,6 +21,8 @@ from subliminal_patch.converters.subscene import language_ids, supported_languag from subscene_api.subscene import search, Subtitle as APISubtitle from subzero.language import Language +p = inflect.engine() + language_converters.register('subscene = subliminal_patch.converters.subscene:SubsceneConverter') logger = logging.getLogger(__name__) @@ -192,21 +195,27 @@ class SubsceneProvider(Provider, ProviderSubtitleArchiveMixin): def query(self, video): vfn = get_video_filename(video) + subtitles = [] logger.debug(u"Searching for: %s", vfn) film = search(vfn, session=self.session) - - subtitles = [] if film and film.subtitles: + logger.debug('Release results found: %s', 
len(film.subtitles)) subtitles = self.parse_results(video, film) + else: + logger.debug('No release results found') # re-search for episodes without explicit release name if isinstance(video, Episode): - term = u"%s S%02iE%02i" % (video.series, video.season, video.episode) + #term = u"%s S%02iE%02i" % (video.series, video.season, video.episode) + term = u"%s - %s Season" % (video.series, p.number_to_words("%sth" % video.season).capitalize()) time.sleep(self.search_throttle) logger.debug('Searching for alternative results: %s', term) - film = search(term, session=self.session) + film = search(term, session=self.session, release=False) if film and film.subtitles: + logger.debug('Alternative results found: %s', len(film.subtitles)) subtitles += self.parse_results(video, film) + else: + logger.debug('No alternative results found') # packs if video.season_fully_aired: @@ -215,9 +224,17 @@ class SubsceneProvider(Provider, ProviderSubtitleArchiveMixin): time.sleep(self.search_throttle) film = search(term, session=self.session) if film and film.subtitles: + logger.debug('Pack results found: %s', len(film.subtitles)) subtitles += self.parse_results(video, film) + else: + logger.debug('No pack results found') else: logger.debug("Not searching for packs, because the season hasn't fully aired") + else: + logger.debug('Searching for movie results: %s', video.title) + film = search(video.title, year=video.year, session=self.session, limit_to=None, release=False) + if film and film.subtitles: + subtitles += self.parse_results(video, film) logger.info("%s subtitles found" % len(subtitles)) return subtitles diff --git a/libs/subliminal_patch/providers/titlovi.py b/libs/subliminal_patch/providers/titlovi.py index 4cf6d124a..8e2a31c65 100644 --- a/libs/subliminal_patch/providers/titlovi.py +++ b/libs/subliminal_patch/providers/titlovi.py @@ -134,8 +134,8 @@ class TitloviProvider(Provider, ProviderSubtitleArchiveMixin): def initialize(self): self.session = Session() - 
self.session.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ' \ - '(KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36' + self.session.headers['User-Agent'] = 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2.3)' \ + 'Gecko/20100401 Firefox/3.6.3 ( .NET CLR 3.5.30729)' logger.debug('User-Agent set to %s', self.session.headers['User-Agent']) self.session.headers['Referer'] = self.server_url logger.debug('Referer set to %s', self.session.headers['Referer']) @@ -202,7 +202,7 @@ class TitloviProvider(Provider, ProviderSubtitleArchiveMixin): current_page = int(params['pg']) try: - sublist = soup.select('section.titlovi > ul.titlovi > li') + sublist = soup.select('section.titlovi > ul.titlovi > li.subtitleContainer.canEdit') for sub in sublist: # subtitle id sid = sub.find(attrs={'data-id': True}).attrs['data-id'] diff --git a/libs/subscene_api/subscene.py b/libs/subscene_api/subscene.py index b59783a4b..e57db3c44 100644 --- a/libs/subscene_api/subscene.py +++ b/libs/subscene_api/subscene.py @@ -25,6 +25,7 @@ this script that does the job by parsing the website"s pages. 
# imports import re + import enum import sys @@ -36,7 +37,7 @@ else: from contextlib import suppress from urllib2.request import Request, urlopen -from bs4 import BeautifulSoup +from bs4 import BeautifulSoup, NavigableString # constants HEADERS = { @@ -207,7 +208,7 @@ def section_exists(soup, section): return False -def get_first_film(soup, section, session=None): +def get_first_film(soup, section, year=None, session=None): tag_part = SectionsParts[section] tag = None @@ -220,12 +221,26 @@ def get_first_film(soup, section, session=None): if not tag: return - url = SITE_DOMAIN + tag.findNext("ul").find("li").div.a.get("href") + url = None + + if not year: + url = SITE_DOMAIN + tag.findNext("ul").find("li").div.a.get("href") + else: + for t in tag.findNext("ul").findAll("li"): + if isinstance(t, NavigableString) or not t.div: + continue + + if str(year) in t.div.a.string: + url = SITE_DOMAIN + t.div.a.get("href") + break + if not url: + return + return Film.from_url(url, session=session) -def search(term, session=None, limit_to=SearchTypes.Exact): - soup = soup_for("%s/subtitles/title?q=%s" % (SITE_DOMAIN, term), session=session) +def search(term, release=True, session=None, year=None, limit_to=SearchTypes.Exact): + soup = soup_for("%s/subtitles/%s?q=%s" % (SITE_DOMAIN, "release" if release else "title", term), session=session) if "Subtitle search by" in str(soup): rows = soup.find("table").tbody.find_all("tr") @@ -234,7 +249,7 @@ def search(term, session=None, limit_to=SearchTypes.Exact): for junk, search_type in SearchTypes.__members__.items(): if section_exists(soup, search_type): - return get_first_film(soup, search_type) + return get_first_film(soup, search_type, year=year, session=session) if limit_to == search_type: return diff --git a/libs/subzero/language.py b/libs/subzero/language.py index f520f6dad..8d096e19d 100644 --- a/libs/subzero/language.py +++ b/libs/subzero/language.py @@ -4,7 +4,6 @@ import types from babelfish.exceptions import LanguageError from 
babelfish import Language as Language_, basestr - repl_map = { "dk": "da", "nld": "nl", diff --git a/libs/subzero/modification/mods/common.py b/libs/subzero/modification/mods/common.py index b1d83c703..eba386b1d 100644 --- a/libs/subzero/modification/mods/common.py +++ b/libs/subzero/modification/mods/common.py @@ -28,13 +28,16 @@ class CommonFixes(SubtitleTextModification): NReProcessor(re.compile(r'(?u)(\w|\b|\s|^)(-\s?-{1,2})'), ur"\1—", name="CM_multidash"), # line = _/-/\s - NReProcessor(re.compile(r'(?u)(^\W*[-_.:]+\W*$)'), "", name="CM_non_word_only"), + NReProcessor(re.compile(r'(?u)(^\W*[-_.:>~]+\W*$)'), "", name="CM_non_word_only"), + + # remove >> + NReProcessor(re.compile(r'(?u)^\s?>>\s*'), "", name="CM_leading_crocodiles"), # line = : text NReProcessor(re.compile(r'(?u)(^\W*:\s*(?=\w+))'), "", name="CM_empty_colon_start"), # fix music symbols - NReProcessor(re.compile(ur'(?u)(^[-\s]*[*#¶]+\s*)|(\s*[*#¶]+\s*$)'), + NReProcessor(re.compile(ur'(?u)(^[-\s>~]*[*#¶]+\s*)|(\s*[*#¶]+\s*$)'), lambda x: u"♪ " if x.group(1) else u" ♪", name="CM_music_symbols"), @@ -85,9 +88,6 @@ class CommonFixes(SubtitleTextModification): # space before ending doublequote? 
- # remove >> - NReProcessor(re.compile(r'(?u)^\s?>>\s*'), "", name="CM_leading_crocodiles"), - # replace uppercase I with lowercase L in words NReProcessor(re.compile(ur'(?u)([a-zà-ž]+)(I+)'), lambda match: ur'%s%s' % (match.group(1), "l" * len(match.group(2))), diff --git a/libs/subzero/modification/mods/hearing_impaired.py b/libs/subzero/modification/mods/hearing_impaired.py index 10dade9ee..8912834d7 100644 --- a/libs/subzero/modification/mods/hearing_impaired.py +++ b/libs/subzero/modification/mods/hearing_impaired.py @@ -29,6 +29,22 @@ class HearingImpaired(SubtitleTextModification): FullBracketEntryProcessor(re.compile(ur'(?sux)^-?%(t)s[([].+(?=[^)\]]{3,}).+[)\]]%(t)s$' % {"t": TAG}), "", name="HI_brackets_full"), + # uppercase text before colon (at least 3 uppercase chars); at start or after a sentence, + # possibly with a dash in front; ignore anything ending with a quote + NReProcessor(re.compile(ur'(?u)(?:(?<=^)|(?<=[.\-!?\"\']))([\s\->~]*(?=[A-ZÀ-Ž&+]\s*[A-ZÀ-Ž&+]\s*[A-ZÀ-Ž&+])' + ur'[A-zÀ-ž-_0-9\s\"\'&+()\[\],:]+:(?![\"\'’ʼ❜‘‛”“‟„])(?:\s+|$))(?![0-9])'), "", + name="HI_before_colon_caps"), + + # any text before colon (at least 3 chars); at start or after a sentence, + # possibly with a dash in front; try not breaking actual sentences with a colon at the end by not matching if + # a space is inside the text; ignore anything ending with a quote + NReProcessor(re.compile(ur'(?u)(?:(?<=^)|(?<=[.\-!?\"]))([\s\->~]*((?=[A-zÀ-ž&+]\s*[A-zÀ-ž&+]\s*[A-zÀ-ž&+])' + ur'[A-zÀ-ž-_0-9\s\"\'&+()\[\]]+:)(?![\"’ʼ❜‘‛”“‟„])\s*)(?![0-9])'), + lambda match: + match.group(1) if (match.group(2).count(" ") > 0 or match.group(1).count("-") > 0) + else "" if not match.group(1).startswith(" ") else " ", + name="HI_before_colon_noncaps"), + # brackets (only remove if at least 3 chars in brackets) NReProcessor(re.compile(ur'(?sux)-?%(t)s[([][^([)\]]+?(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+[)\]][\s:]*%(t)s' % {"t": TAG}), "", name="HI_brackets"), @@ -46,21 +62,6 @@ class 
HearingImpaired(SubtitleTextModification): #NReProcessor(re.compile(ur'(?u)(\b|^)([\s-]*(?=[A-zÀ-ž-_0-9"\']{3,})[A-zÀ-ž-_0-9"\']+:\s*)'), "", # name="HI_before_colon"), - # uppercase text before colon (at least 3 uppercase chars); at start or after a sentence, - # possibly with a dash in front; ignore anything ending with a quote - NReProcessor(re.compile(ur'(?u)(?:(?<=^)|(?<=[.\-!?\"\']))([\s-]*(?=[A-ZÀ-Ž&+]\s*[A-ZÀ-Ž&+]\s*[A-ZÀ-Ž&+])' - ur'[A-ZÀ-Ž-_0-9\s\"\'&+]+:(?![\"\'’ʼ❜‘‛”“‟„])(?:\s+|$))(?![0-9])'), "", - name="HI_before_colon_caps"), - - # any text before colon (at least 3 chars); at start or after a sentence, - # possibly with a dash in front; try not breaking actual sentences with a colon at the end by not matching if - # a space is inside the text; ignore anything ending with a quote - NReProcessor(re.compile(ur'(?u)(?:(?<=^)|(?<=[.\-!?\"]))([\s-]*((?=[A-zÀ-ž&+]\s*[A-zÀ-ž&+]\s*[A-zÀ-ž&+])' - ur'[A-zÀ-ž-_0-9\s\"\'&+]+:)(?![\"’ʼ❜‘‛”“‟„])\s*)(?![0-9])'), - lambda match: - match.group(1) if (match.group(2).count(" ") > 0 or match.group(1).count("-") > 0) - else "" if not match.group(1).startswith(" ") else " ", - name="HI_before_colon_noncaps"), # text in brackets at start, after optional dash, before colon or at end of line # fixme: may be too aggressive