You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
170 lines
6.5 KiB
170 lines
6.5 KiB
import typing
|
|
|
|
from babelfish import (
|
|
COUNTRIES,
|
|
Country,
|
|
CountryReverseError,
|
|
LANGUAGE_MATRIX,
|
|
Language,
|
|
LanguageReverseError,
|
|
SCRIPTS,
|
|
Script,
|
|
country_converters,
|
|
language_converters
|
|
)
|
|
from babelfish.converters import CaseInsensitiveDict
|
|
|
|
from rebulk import Rebulk
|
|
from rebulk.match import Match
|
|
|
|
from trakit.config import Config
|
|
from trakit.context import Context
|
|
from trakit.converters.country import GuessCountryConverter
|
|
from trakit.converters.language import GuessLanguageConverter
|
|
from trakit.words import blank_match, blank_release_names, to_combinations, to_match, to_sentence, to_words
|
|
|
|
|
|
class LanguageFinder:
    """Find a language, optionally qualified by a country, region or script, in a string.

    Lookups go through the babelfish ``guess`` converters and the alias tables
    taken from the :class:`Config` passed to the constructor.
    """

    def __init__(self, config: Config):
        """Compute the combination sizes and install the lookup tables.

        Besides setting instance attributes, this mutates babelfish module-level
        objects: it adds a ``'419'`` entry to ``SCRIPTS`` and assigns the
        ``'guess'`` key of ``country_converters`` and ``language_converters``.

        :param config: read for its ``scripts``, ``regions``, ``countries``,
            ``languages``, ``ignored`` and ``implicit_languages`` attributes.
        """
        # NOTE(review): these limits are the highest number of *spaces* in a
        # name (floored at 1), which is one less than its number of words.
        # Confirm against ``to_combinations`` whether ``count + 1`` is intended.
        self.country_max_words = self._max_spaces(COUNTRIES.values())
        self.language_max_words = self._max_spaces(entry.name for entry in LANGUAGE_MATRIX)
        self.script_max_words = self._max_spaces(config.scripts)
        self.region_max_words = self._max_spaces(config.regions)

        SCRIPTS['419'] = 'Latin America and the Caribbean'  # Until babelfish supports UN.M49
        country_converters['guess'] = GuessCountryConverter(config.countries)
        language_converters['guess'] = GuessLanguageConverter(config.languages)
        self.regions = CaseInsensitiveDict(config.regions)
        self.scripts = CaseInsensitiveDict(config.scripts)
        # Only membership is tested on this table, the 0 values are placeholders.
        self.common_words = CaseInsensitiveDict(dict.fromkeys(config.ignored, 0))
        self.implicit = CaseInsensitiveDict(config.implicit_languages)

    @staticmethod
    def _max_spaces(names: typing.Iterable[str]) -> int:
        """Return the highest count of ``' '`` among *names*, never less than 1."""
        return max(1, max((name.count(' ') for name in names), default=0))

    def _find_country(self, value: str) -> typing.Optional[Match]:
        """Return a match for the first word combination of *value* guessed as a country.

        :param value: the text to inspect.
        :return: the ``to_match`` result carrying a :class:`Country`, or ``None``
            when no combination can be converted.
        """
        combinations = to_combinations(to_words(value), self.country_max_words)
        for c in combinations:
            code = to_sentence(c)
            try:
                return to_match(c, Country.fromguess(code))
            except CountryReverseError:
                continue

        return None

    def _find_script(self, value: str) -> typing.Optional[Match]:
        """Return a match for the first word combination of *value* that is a script.

        Each candidate is first translated through the configured script
        aliases; when no alias exists the candidate itself is used as the code.

        :param value: the text to inspect.
        :return: the ``to_match`` result carrying a :class:`Script`, or ``None``
            when every candidate makes ``Script`` raise ``ValueError``.
        """
        combinations = to_combinations(to_words(value), self.script_max_words)
        for c in combinations:
            code = to_sentence(c)
            try:
                return to_match(c, Script(self.scripts.get(code, code)))
            except ValueError:
                continue

        return None

    def _find_region(self, value: str) -> typing.Optional[Match]:
        """Return a match for the first word combination of *value* that is a region.

        Regions are represented as :class:`Script` objects (see the ``'419'``
        entry registered in ``__init__``). Each candidate is first translated
        through the configured region aliases, falling back to the candidate
        itself.

        :param value: the text to inspect.
        :return: the ``to_match`` result carrying a :class:`Script`, or ``None``
            when every candidate makes ``Script`` raise ``ValueError``.
        """
        combinations = to_combinations(to_words(value), self.region_max_words)
        for c in combinations:
            code = to_sentence(c)
            try:
                return to_match(c, Script(self.regions.get(code, code)))
            except ValueError:
                continue

        return None

    def _find_implicit_language(self, combinations: typing.List[typing.List[Match]]) -> typing.Optional[Match]:
        """Return the first language implied by a configured alias, region or country.

        For every combination, in order, the following are tried:

        1. the sentence itself is a key of the implicit-language table;
        2. the sentence contains a region whose code is a key of that table;
        3. the sentence is guessed as a country whose alpha2 code is a key of
           that table.

        :param combinations: candidate word groups to test.
        :return: a match whose value is a :class:`Language`, or ``None``.
        """
        for c in combinations:
            sentence = to_sentence(c)
            if sentence in self.implicit:
                return to_match(c, Language.fromietf(self.implicit[sentence]))

            region = self._find_region(sentence)
            if region and region.value.code in self.implicit:
                lang = Language.fromietf(self.implicit[region.value.code])
                # Reuse the span of the region match, swapping in the language as value.
                return Match(region.start, region.end, value=lang, input_string=region.input_string)

            try:
                country = Country.fromguess(sentence)
                if country.alpha2 in self.implicit:
                    lang = Language.fromietf(self.implicit[country.alpha2])
                    # The text is the language name itself: resolve it by name
                    # instead of using the configured IETF code.
                    if lang.name.lower() == sentence.lower():
                        lang = Language.fromname(sentence)

                    return to_match(c, lang)
            except CountryReverseError:
                pass

        return None

    def accept_word(self, string: str) -> bool:
        """Tell whether *string* is neither an ignored word nor a numeric string."""
        return string.lower() not in self.common_words and not string.isnumeric()

    def find_language(self, value: str, context: Context) -> typing.Optional[Match]:
        """Find the language mentioned in *value*.

        :param value: the text to inspect.
        :param context: consulted through ``context.accept`` to validate every
            candidate language.
        :return: a match whose value is a :class:`Language`, or ``None``.
        """
        value = blank_release_names(value)
        all_words = to_words(value, predicate=self.accept_word)
        combinations = to_combinations(all_words, self.language_max_words)
        implicit_lang = self._find_implicit_language(combinations)
        implicit_accepted = implicit_lang and context.accept(implicit_lang.value)

        # An accepted implicit language whose script code is numeric (as with
        # the '419' region code) is returned without any further search.
        if implicit_accepted and implicit_lang.value.script and implicit_lang.value.script.code.isnumeric():
            return implicit_lang

        if implicit_lang and not implicit_accepted:
            # The implicit language was rejected: blank it out and start over
            # with the remaining words.
            value = blank_match(implicit_lang)
            all_words = to_words(value, predicate=self.accept_word)
            combinations = to_combinations(all_words, self.language_max_words)

        for c in combinations:
            language_sentence = to_sentence(c)
            try:
                lang = Language.fromguess(language_sentence)
            except LanguageReverseError:
                continue

            # Look for one qualifier (country, region or script) in the text
            # left once the language itself has been blanked out.
            match_lang = to_match(c, lang)
            remaining_sentence = blank_match(match_lang)
            for combination in to_combinations(to_words(remaining_sentence), self.country_max_words):
                sentence = to_sentence(combination)
                country = self._find_country(sentence)
                if country:
                    try:
                        # discard country if value is actually the language name
                        Language.fromguess(country.raw)
                    except LanguageReverseError:
                        lang = Language(lang.alpha3, country=country.value, script=lang.script)
                    break

                region = self._find_region(sentence)
                if region:
                    lang = Language(lang.alpha3, country=lang.country, script=region.value)
                    break

                script = self._find_script(sentence)
                if script:
                    lang = Language(lang.alpha3, country=lang.country, script=script.value)
                    break

            # The guess adds nothing to the accepted implicit language (same
            # alpha3, no country, no script): keep the implicit one.
            if implicit_accepted and implicit_lang.value.alpha3 == lang.alpha3 and not lang.country and not lang.script:
                return implicit_lang

            if context.accept(lang):
                return to_match(c, lang)

        if implicit_accepted:
            return implicit_lang

        return None

    def find(self, value: str, context: Context):
        """Run :meth:`find_language` and reshape its result.

        :param value: the text to inspect.
        :param context: forwarded to :meth:`find_language`.
        :return: ``(start, end, {'value': language})`` for a truthy match,
            otherwise ``None``.
        """
        match = self.find_language(value, context)
        if match:
            return match.start, match.end, {'value': match.value}

        return None
|
|
|
|
|
|
def language(config: Config):
    """Build the Rebulk object holding the language detection rule.

    A single functional pattern named ``language`` is registered; it calls the
    ``find`` method of a :class:`LanguageFinder` created from *config*.

    :param config: configuration handed to :class:`LanguageFinder`.
    :return: the configured :class:`Rebulk` instance.
    """
    builder = Rebulk()
    finder = LanguageFinder(config)
    builder.functional(finder.find, name='language')
    return builder
|