Trying to fix Segmentation fault caused by mediainfo in docker container. #2098

pull/2110/head v1.2.1-beta.9
morpheus65535 2 years ago
parent 7136383098
commit 7455496c4c

@ -1,10 +1,9 @@
"""Know your media files better."""
__title__ = 'knowit'
__version__ = '0.4.0'
__short_version__ = '.'.join(__version__.split('.')[:2])
__version__ = '0.5.2'
__short_version__ = '0.5'
__author__ = 'Rato AQ2'
__license__ = 'MIT'
__copyright__ = 'Copyright 2016-2021, Rato AQ2'
__url__ = 'https://github.com/ratoaq2/knowit'
#: Video extensions

@ -169,7 +169,7 @@ def dumps(
return convert(info, context)
def main(args: typing.List[str] = None) -> None:
def main(args: typing.Optional[typing.List[str]] = None) -> None:
"""Execute main function for entry point."""
argument_parser = build_argument_parser()
args = args or sys.argv[1:]

@ -65,7 +65,7 @@ def know(
raise KnowitException(debug_info(context=context, exc_info=True))
def dependencies(context: typing.Mapping = None) -> typing.Mapping:
def dependencies(context: typing.Optional[typing.Mapping] = None) -> typing.Mapping:
"""Return all dependencies detected by knowit."""
deps = {}
try:

@ -63,6 +63,17 @@ class Property(Reportable[T]):
# Used to detect duplicated values. e.g.: en / en or High@L4.0 / High@L4.0 or Progressive / Progressive
self.delimiter = delimiter
@classmethod
def _extract_value(cls,
track: typing.Mapping,
name: str,
names: typing.List[str]):
if len(names) == 2:
parent_value = track.get(names[0], track.get(names[0].upper(), {}))
return parent_value.get(names[1], parent_value.get(names[1].upper()))
return track.get(name, track.get(name.upper()))
def extract_value(
self,
track: typing.Mapping,
@ -71,7 +82,7 @@ class Property(Reportable[T]):
"""Extract the property value from a given track."""
for name in self.names:
names = name.split('.')
value = track.get(names[0], {}).get(names[1]) if len(names) == 2 else track.get(name)
value = self._extract_value(track, name, names)
if value is None:
if self.default is None:
continue
@ -216,9 +227,10 @@ class MultiValue(Property):
class Rule(Reportable[T]):
"""Rule abstract class."""
def __init__(self, name: str, override=False, **kwargs):
def __init__(self, name: str, private=False, override=False, **kwargs):
"""Initialize the object."""
super().__init__(name, **kwargs)
self.private = private
self.override = override
def execute(self, props, pv_props, context: typing.Mapping):

@ -455,46 +455,46 @@ profiles:
VideoProfileLevel:
L1:
default: "1"
default: '1'
technical: Level 1
L11:
default: "1.1"
default: '1.1'
technical: Level 1.1
L13:
default: "1.3"
default: '1.3'
technical: Level 1.3
L2:
default: "2"
default: '2'
technical: Level 2
L21:
default: "2.1"
default: '2.1'
technical: Level 2.1
L22:
default: "2.2"
default: '2.2'
technical: Level 2.2
L3:
default: "3"
default: '3'
technical: Level 3
L31:
default: "3.1"
default: '3.1'
technical: Level 3.1
L32:
default: "3.2"
default: '3.2'
technical: Level 3.2
L4:
default: "4"
default: '4'
technical: Level 4
L41:
default: "4.1"
default: '4.1'
technical: Level 4.1
L42:
default: "4.2"
default: '4.2'
technical: Level 4.2
L5:
default: "5"
default: '5'
technical: Level 5
L51:
default: "5.1"
default: '5.1'
technical: Level 5.1
LOW:
default: Low

@ -106,11 +106,12 @@ class Ratio(Property[Decimal]):
if (width, height) == ('0', '1'): # identity
return Decimal('1.0')
result = round_decimal(Decimal(width) / Decimal(height), min_digits=1, max_digits=3)
if self.unit:
result *= self.unit
if height:
result = round_decimal(Decimal(width) / Decimal(height), min_digits=1, max_digits=3)
if self.unit:
result *= self.unit
return result
return result
self.report(value, context)
return None

@ -103,10 +103,7 @@ class Provider:
value = prop.extract_value(track, context)
if value is not None:
if not prop.private:
which = props
else:
which = pv_props
which = props if not prop.private else pv_props
which[name] = value
for name, rule in self.rules.get(track_type, {}).items():
@ -116,8 +113,9 @@ class Provider:
value = rule.execute(props, pv_props, context)
if value is not None:
props[name] = value
elif name in props and not rule.override:
which = props if not rule.private else pv_props
which[name] = value
elif name in props and (not rule.override or props[name] is None):
del props[name]
return props

@ -26,6 +26,7 @@ from knowit.rules import (
LanguageRule,
ResolutionRule,
)
from knowit.rules.general import GuessTitleRule
from knowit.serializer import get_json_encoder
from knowit.units import units
from knowit.utils import to_dict
@ -83,17 +84,20 @@ class EnzymeProvider(Provider):
},
}, {
'video': {
'language': LanguageRule('video language'),
'guessed': GuessTitleRule('guessed properties', private=True),
'language': LanguageRule('video language', override=True),
'resolution': ResolutionRule('video resolution'),
},
'audio': {
'language': LanguageRule('audio language'),
'guessed': GuessTitleRule('guessed properties', private=True),
'language': LanguageRule('audio language', override=True),
'channels': AudioChannelsRule('audio channels'),
},
'subtitle': {
'language': LanguageRule('subtitle language'),
'hearing_impaired': HearingImpairedRule('subtitle hearing impaired'),
'closed_caption': ClosedCaptionRule('closed caption'),
'guessed': GuessTitleRule('guessed properties', private=True),
'language': LanguageRule('subtitle language', override=True),
'hearing_impaired': HearingImpairedRule('subtitle hearing impaired', override=True),
'closed_caption': ClosedCaptionRule('closed caption', override=True),
}
})
@ -130,7 +134,8 @@ class EnzymeProvider(Provider):
if logger.level == logging.DEBUG:
logger.debug('Video {video_path} scanned using Enzyme {version} has raw data:\n{data}',
video_path=video_path, version=enzyme.__version__, data=json.dumps(data))
video_path=video_path, version=enzyme.__version__,
data=json.dumps(data, cls=get_json_encoder(context), indent=4, ensure_ascii=False))
result = self._describe_tracks(video_path, data.get('info', {}), data.get('video_tracks'),
data.get('audio_tracks'), data.get('subtitle_tracks'), context)

@ -34,6 +34,7 @@ from knowit.rules import (
LanguageRule,
ResolutionRule,
)
from knowit.rules.general import GuessTitleRule
from knowit.serializer import get_json_encoder
from knowit.units import units
from knowit.utils import (
@ -77,7 +78,7 @@ class FFmpegExecutor:
def extract_info(self, filename):
"""Extract media info."""
json_dump = self._execute(filename)
return json.loads(json_dump)
return json.loads(json_dump) if json_dump else {}
def _execute(self, filename):
raise NotImplementedError
@ -144,7 +145,7 @@ class FFmpegProvider(Provider):
'id': Basic('index', data_type=int, allow_fallback=True, description='video track number'),
'name': Property('tags.title', description='video track name'),
'language': Language('tags.language', description='video language'),
'duration': Duration('duration', description='video duration'),
'duration': Duration('duration', 'tags.duration', description='video duration'),
'width': Quantity('width', unit=units.pixel),
'height': Quantity('height', unit=units.pixel),
'scan_type': ScanType(config, 'field_order', default='Progressive', description='video scan type'),
@ -153,7 +154,7 @@ class FFmpegProvider(Provider):
'resolution': None, # populated with ResolutionRule
'frame_rate': Ratio('r_frame_rate', unit=units.FPS, description='video frame rate'),
# frame_rate_mode
'bit_rate': Quantity('bit_rate', unit=units.bps, description='video bit rate'),
'bit_rate': Quantity('bit_rate', 'tags.bps', unit=units.bps, description='video bit rate'),
'bit_depth': Quantity('bits_per_raw_sample', unit=units.bit, description='video bit depth'),
'codec': VideoCodec(config, 'codec_name', description='video codec'),
'profile': VideoProfile(config, 'profile', description='video codec profile'),
@ -166,13 +167,13 @@ class FFmpegProvider(Provider):
'id': Basic('index', data_type=int, allow_fallback=True, description='audio track number'),
'name': Property('tags.title', description='audio track name'),
'language': Language('tags.language', description='audio language'),
'duration': Duration('duration', description='audio duration'),
'duration': Duration('duration', 'tags.duration', description='audio duration'),
'codec': AudioCodec(config, 'profile', 'codec_name', description='audio codec'),
'profile': AudioProfile(config, 'profile', description='audio codec profile'),
'channels_count': AudioChannels('channels', description='audio channels count'),
'channels': None, # populated with AudioChannelsRule
'bit_depth': Quantity('bits_per_raw_sample', unit=units.bit, description='audio bit depth'),
'bit_rate': Quantity('bit_rate', unit=units.bps, description='audio bit rate'),
'bit_rate': Quantity('bit_rate', 'tags.bps', unit=units.bps, description='audio bit rate'),
'sampling_rate': Quantity('sample_rate', unit=units.Hz, description='audio sampling rate'),
'forced': YesNo('disposition.forced', hide_value=False, description='audio track forced'),
'default': YesNo('disposition.default', hide_value=False, description='audio track default'),
@ -190,17 +191,20 @@ class FFmpegProvider(Provider):
},
}, {
'video': {
'language': LanguageRule('video language'),
'guessed': GuessTitleRule('guessed properties', private=True),
'language': LanguageRule('video language', override=True),
'resolution': ResolutionRule('video resolution'),
},
'audio': {
'language': LanguageRule('audio language'),
'guessed': GuessTitleRule('guessed properties', private=True),
'language': LanguageRule('audio language', override=True),
'channels': AudioChannelsRule('audio channels'),
},
'subtitle': {
'language': LanguageRule('subtitle language'),
'hearing_impaired': HearingImpairedRule('subtitle hearing impaired'),
'closed_caption': ClosedCaptionRule('closed caption'),
'guessed': GuessTitleRule('guessed properties', private=True),
'language': LanguageRule('subtitle language', override=True),
'hearing_impaired': HearingImpairedRule('subtitle hearing impaired', override=True),
'closed_caption': ClosedCaptionRule('closed caption', override=True),
},
})
self.executor = FFmpegExecutor.get_executor_instance(suggested_path)

@ -1,5 +1,6 @@
import ctypes
import json
import os
import re
from ctypes import c_void_p, c_wchar_p
from decimal import Decimal
@ -43,6 +44,7 @@ from knowit.rules import (
LanguageRule,
ResolutionRule,
)
from knowit.rules.general import GuessTitleRule
from knowit.units import units
from knowit.utils import (
define_candidate,
@ -77,7 +79,7 @@ class MediaInfoExecutor:
locations = {
'unix': ('/usr/local/mediainfo/lib', '/usr/local/mediainfo/bin', '__PATH__'),
'windows': ('__PATH__', ),
'windows': ('C:\\Program Files\\MediaInfo', 'C:\\Program Files (x86)\\MediaInfo', '__PATH__'),
'macos': ('__PATH__', ),
}
@ -121,12 +123,28 @@ class MediaInfoCliExecutor(MediaInfoExecutor):
}
def _execute(self, filename):
return json.loads(check_output([self.location, '--Output=JSON', '--Full', filename]).decode())
data = check_output([self.location, '--Output=JSON', '--Full', filename]).decode()
return json.loads(data) if data else {}
@classmethod
def _is_gui_exe(cls, candidate: str):
if not candidate.endswith('MediaInfo.exe') or not os.path.isfile(candidate):
return False
try:
shell32 = ctypes.WinDLL('shell32', use_last_error=True) # type: ignore
return bool(shell32.ExtractIconExW(candidate, 0, None, None, 1))
except Exception:
return False
@classmethod
def create(cls, os_family=None, suggested_path=None):
"""Create the executor instance."""
for candidate in define_candidate(cls.locations, cls.names, os_family, suggested_path):
if cls._is_gui_exe(candidate):
continue
try:
output = check_output([candidate, '--version']).decode()
version = cls._get_version(output)
@ -154,7 +172,9 @@ class MediaInfoCTypesExecutor(MediaInfoExecutor):
def _execute(self, filename):
# Create a MediaInfo handle
return json.loads(MediaInfo.parse(filename, library_file=self.location, output='JSON'))
data = MediaInfo.parse(filename, library_file=self.location, output='JSON')
return json.loads(data) if data else {}
@classmethod
def create(cls, os_family=None, suggested_path=None):
@ -254,19 +274,22 @@ class MediaInfoProvider(Provider):
},
}, {
'video': {
'language': LanguageRule('video language'),
'guessed': GuessTitleRule('guessed properties', private=True),
'language': LanguageRule('video language', override=True),
'resolution': ResolutionRule('video resolution'),
},
'audio': {
'language': LanguageRule('audio language'),
'guessed': GuessTitleRule('guessed properties', private=True),
'language': LanguageRule('audio language', override=True),
'channels': AudioChannelsRule('audio channels'),
'_atmosrule': AtmosRule(config, 'atmos rule'),
'_dtshdrule': DtsHdRule(config, 'dts-hd rule'),
'atmos': AtmosRule(config, 'atmos rule', private=True),
'dtshd': DtsHdRule(config, 'dts-hd rule', private=True),
},
'subtitle': {
'language': LanguageRule('subtitle language'),
'hearing_impaired': HearingImpairedRule('subtitle hearing impaired'),
'closed_caption': ClosedCaptionRule('closed caption'),
'guessed': GuessTitleRule('guessed properties', private=True),
'language': LanguageRule('subtitle language', override=True),
'hearing_impaired': HearingImpairedRule('subtitle hearing impaired', override=True),
'closed_caption': ClosedCaptionRule('closed caption', override=True),
}
})
self.executor = MediaInfoExecutor.get_executor_instance(suggested_path)

@ -28,6 +28,7 @@ from knowit.rules import (
LanguageRule,
ResolutionRule,
)
from knowit.rules.general import GuessTitleRule
from knowit.serializer import get_json_encoder
from knowit.units import units
from knowit.utils import define_candidate, detect_os
@ -67,7 +68,7 @@ class MkvMergeExecutor:
def extract_info(self, filename):
"""Extract media info."""
json_dump = self._execute(filename)
return json.loads(json_dump)
return json.loads(json_dump) if json_dump else {}
def _execute(self, filename):
raise NotImplementedError
@ -166,17 +167,20 @@ class MkvMergeProvider(Provider):
},
}, {
'video': {
'guessed': GuessTitleRule('guessed properties', private=True),
'language': LanguageRule('video language', override=True),
'resolution': ResolutionRule('video resolution'),
},
'audio': {
'guessed': GuessTitleRule('guessed properties', private=True),
'language': LanguageRule('audio language', override=True),
'channels': AudioChannelsRule('audio channels'),
},
'subtitle': {
'guessed': GuessTitleRule('guessed properties', private=True),
'language': LanguageRule('subtitle language', override=True),
'hearing_impaired': HearingImpairedRule('subtitle hearing impaired'),
'closed_caption': ClosedCaptionRule('closed caption'),
'hearing_impaired': HearingImpairedRule('subtitle hearing impaired', override=True),
'closed_caption': ClosedCaptionRule('closed caption', override=True),
}
})
self.executor = MkvMergeExecutor.get_executor_instance(suggested_path)

@ -1,8 +1,6 @@
import re
from logging import NullHandler, getLogger
import babelfish
from trakit.api import trakit
from knowit.core import Rule
@ -10,22 +8,27 @@ logger = getLogger(__name__)
logger.addHandler(NullHandler())
class GuessTitleRule(Rule):
    """Rule that hands the track name to trakit and returns its guess."""

    def execute(self, props, pv_props, context):
        """Guess properties from the track ``name``.

        When ``props`` already holds a truthy ``language`` it is passed to
        trakit as ``expected_language``. Returns the trakit result, or ``None``
        when ``props`` has no ``name`` or the result is empty.
        """
        if 'name' not in props:
            return None

        options = {}
        known_language = props.get('language')
        if known_language:
            options['expected_language'] = known_language

        # An empty guess is reported as None, exactly like a missing name.
        return trakit(props['name'], options) or None
class LanguageRule(Rule):
"""Language rules."""
name_re = re.compile(r'(?P<name>\w+)\b', re.IGNORECASE)
def execute(self, props, pv_props, context):
"""Language detection using name."""
if 'language' in props:
if 'guessed' not in pv_props:
return
if 'name' in props:
name = props.get('name', '')
match = self.name_re.match(name)
if match:
try:
return babelfish.Language.fromname(match.group('name'))
except babelfish.Error:
pass
logger.info('Invalid %s: %r', self.description, name)
guess = pv_props['guessed']
if 'language' in guess:
return guess['language']

@ -10,18 +10,19 @@ class ClosedCaptionRule(Rule):
def execute(self, props, pv_props, context):
"""Execute closed caption rule."""
for name in (pv_props.get('_closed_caption'), props.get('name')):
if name and self.cc_re.search(name):
return True
if '_closed_caption' in pv_props and self.cc_re.search(pv_props['_closed_caption']):
return True
if 'guessed' in pv_props:
guessed = pv_props['guessed']
return guessed.get('closed_caption')
class HearingImpairedRule(Rule):
"""Hearing Impaired rule."""
hi_re = re.compile(r'(\bsdh\b)', re.IGNORECASE)
def execute(self, props, pv_props, context):
"""Hearing Impaired."""
name = props.get('name')
if name and self.hi_re.search(name):
return True
if 'guessed' in pv_props:
guessed = pv_props['guessed']
return guessed.get('hearing_impaired')

@ -1,10 +1,5 @@
import typing
try:
import pint
except ImportError:
pint = False
class NullRegistry:
"""A NullRegistry that masquerades as a pint.UnitRegistry."""
@ -25,9 +20,18 @@ class NullRegistry:
def _build_unit_registry():
registry = pint.UnitRegistry() if pint else NullRegistry()
registry.define('FPS = 1 * hertz')
return registry
try:
import pint
registry = pint.UnitRegistry()
registry.define('FPS = 1 * hertz')
pint.set_application_registry(registry)
return registry
except ModuleNotFoundError:
pass
return NullRegistry()
units = _build_unit_registry()

@ -386,7 +386,7 @@ class MediaInfo:
A higher value will yield more precise results in some cases
but will also increase parsing time.
:param bool full: display additional tags, including computer-readable values
for sizes and durations.
for sizes and durations, corresponds to the CLI's ``--Full``/``-f`` parameter.
:param bool legacy_stream_display: display additional information about streams.
:param dict mediainfo_options: additional options that will be passed to the
`MediaInfo_Option` function, for example: ``{"Language": "raw"}``.

@ -0,0 +1,8 @@
# Package metadata for trakit.
__title__ = 'trakit'
# Full release version; __short_version__ repeats its first two components.
__version__ = '0.2.1'
__short_version__ = '0.2'
__author__ = 'RatoAQ'
__license__ = 'MIT'
__url__ = 'https://github.com/ratoaq2/trakit'
# Re-export the API class and the module-level helper from the api submodule.
from .api import TrakItApi, trakit

@ -0,0 +1,108 @@
import argparse
import json
import logging
import sys
import typing
import babelfish
from trakit import TrakItApi, __version__
# Root logging goes to stdout with the bare message only, so console output
# is not prefixed with level or logger names.
logging.basicConfig(stream=sys.stdout, format='%(message)s')

# 'CONSOLE' carries the user-facing output of the command line.
console = logging.getLogger('CONSOLE')
console.setLevel(logging.INFO)

# 'trakit' is the library logger; it stays at WARNING unless raised later.
logger = logging.getLogger('trakit')
logger.setLevel(logging.WARNING)
def build_argument_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the trakit console entry point.

    The parser takes one positional ``value`` (the track title to guess) and
    options arranged in three groups: Configuration, Output and Information.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(dest='value', help='track title to guess', type=str)

    configuration = parser.add_argument_group('Configuration')
    configuration.add_argument(
        '-l', '--expected-language',
        dest='expected_language',
        help='The expected language to be guessed',
        type=str,
    )

    output = parser.add_argument_group('Output')
    output.add_argument(
        '--debug',
        action='store_true',
        dest='debug',
        help='Print information for debugging trakit and for reporting bugs.',
    )
    output.add_argument(
        '-y', '--yaml',
        action='store_true',
        dest='yaml',
        help='Display output in yaml format',
    )

    information = parser.add_argument_group('Information')
    information.add_argument('--version', action='version', version=__version__)

    return parser
def _as_yaml(value: str, info: typing.Mapping[str, typing.Any]) -> str:
    """Render ``info`` as a YAML document keyed by ``value``.

    ``babelfish.Language`` instances are emitted as plain strings through a
    representer registered on ``yaml.representer.SafeRepresenter``. The
    registration is performed on every call.
    """
    import yaml

    def language_as_text(representer: yaml.representer.SafeRepresenter, language: typing.Any):
        return representer.represent_scalar('tag:yaml.org,2002:str', str(language))

    yaml.representer.SafeRepresenter.add_representer(babelfish.Language, language_as_text)

    document = {value: dict(info)}
    return yaml.safe_dump(document, allow_unicode=True, sort_keys=False)
def _as_json(info: typing.Mapping[str, typing.Any]) -> str:
"""Convert info to string using JSON format."""
return json.dumps(info, ensure_ascii=False, indent=2, default=str)
def dump(value: str, info: typing.Mapping[str, typing.Any], opts: argparse.Namespace) -> str:
    """Serialise ``info`` as YAML when ``opts.yaml`` is set, otherwise as JSON.

    ``value`` is only used by the YAML output, where it becomes the top-level key.
    """
    return _as_yaml(value, info) if opts.yaml else _as_json(info)
def trakit(value: str, opts: argparse.Namespace) -> typing.Mapping:
    """Guess the properties of ``value`` and log the result on the console logger.

    Every attribute of ``opts`` that is not ``None`` is forwarded as an option
    to a new ``TrakItApi`` instance. The "Parsing" line is skipped when YAML
    output is requested. Returns the guessed mapping.
    """
    if not opts.yaml:
        console.info('Parsing: %s', value)

    options = {}
    for key, setting in vars(opts).items():
        if setting is not None:
            options[key] = setting

    info = TrakItApi().trakit(value, options)
    console.info('TrakIt %s found: ', __version__)
    console.info(dump(value, info, opts))
    return info
def main(args: typing.Optional[typing.List[str]] = None):
    """Run the command line: parse the arguments, then guess and print.

    :param args: argument list; when it is ``None`` or empty,
        ``sys.argv[1:]`` is parsed instead.
    :return: the mapping returned by :func:`trakit`.
    """
    parser = build_argument_parser()
    argv = args if args else sys.argv[1:]
    opts = parser.parse_args(argv)

    if opts.debug:
        # --debug raises both the trakit logger and the 'rebulk' logger to DEBUG.
        for verbose_logger in (logger, logging.getLogger('rebulk')):
            verbose_logger.setLevel(logging.DEBUG)

    return trakit(opts.value, opts)


# Script entry point: forward the arguments (without the program name) to main().
if __name__ == '__main__':
    main(sys.argv[1:])

@ -0,0 +1,24 @@
import typing
from trakit.config import Config
from trakit.context import Context
from trakit.patterns import configure
class TrakItApi:
    """Entry point that guesses track properties from a title string."""

    def __init__(self, config: typing.Optional[typing.Mapping[str, typing.Any]] = None):
        """Build the matcher from ``config`` wrapped in a :class:`Config`."""
        settings = Config(config)
        self.rebulk = configure(settings)

    def trakit(self, string: str, options: typing.Optional[typing.Mapping[str, typing.Any]] = None):
        """Return a mapping of the information extracted from ``string``.

        ``options`` is wrapped in a :class:`Context` and passed to the matcher;
        the matches are converted with ``to_dict()``.
        """
        guess: typing.Mapping[str, typing.Any] = self.rebulk.matches(string, Context(options)).to_dict()
        return guess
# Shared instance, built once at import time and reused by trakit() below.
default_api = TrakItApi()


def trakit(string: str, options: typing.Optional[typing.Mapping[str, typing.Any]] = None):
    """Guess the properties of ``string`` using the shared ``default_api`` instance."""
    guess = default_api.trakit(string, options)
    return guess

@ -0,0 +1,19 @@
import json
import typing
from pkg_resources import resource_stream
class Config:
    """Settings loaded from the packaged ``data/config.json``, with optional overrides."""

    def __init__(self, config: typing.Optional[typing.Mapping[str, typing.Any]]):
        """Load the packaged defaults and apply ``config`` on top of them.

        ``config`` replaces whole top-level keys (a plain ``dict.update``);
        nested mappings are not merged.
        """
        with resource_stream('trakit', 'data/config.json') as stream:
            settings: typing.Dict[str, typing.Any] = json.load(stream)

        if config:
            settings.update(config)

        def section(key: str) -> typing.Mapping[str, str]:
            # Absent sections read as an empty mapping.
            return settings.get(key, {})

        self.ignored: typing.Set[str] = set(settings.get('ignored', []))
        self.countries: typing.Mapping[str, str] = section('countries')
        self.languages: typing.Mapping[str, str] = section('languages')
        self.scripts: typing.Mapping[str, str] = section('scripts')
        self.regions: typing.Mapping[str, str] = section('regions')
        self.implicit_languages: typing.Mapping[str, str] = section('implicit-languages')

@ -0,0 +1,22 @@
import typing
import babelfish
class Context(dict):
    """Options mapping that also exposes the parsed ``expected_language``."""

    def __init__(self, options: typing.Optional[typing.Mapping[str, typing.Any]] = None):
        """Copy ``options`` and resolve the ``expected_language`` entry.

        A ``babelfish.Language`` value is kept as is. Any other non-blank value
        is parsed as an IETF code. Blank values (missing, ``None``, ``''``)
        mean "no expectation" and are stored as ``None`` so that
        :meth:`accept` never has to deal with a non-language object.
        """
        super().__init__(options or {})
        language = self.get('expected_language')
        if not isinstance(language, babelfish.Language):
            language = babelfish.Language.fromietf(str(language)) if language else None
        self.expected_language: typing.Optional[babelfish.Language] = language

    def accept(self, lang: 'babelfish.Language'):
        """Tell whether ``lang`` is compatible with the expected language.

        Without an expected language everything is accepted. Otherwise the
        alpha3 codes must match, and the script and country are compared only
        when the expected language defines them.
        """
        expected = self.expected_language
        if expected is None:
            return True

        if expected.alpha3 != lang.alpha3:
            return False

        # Compare like with like: a Language object never equals a Script or a
        # Country, so comparing the whole expected language rejected every candidate.
        if expected.script and expected.script != lang.script:
            return False

        return not expected.country or expected.country == lang.country

@ -0,0 +1,32 @@
import typing
from babelfish import Country, CountryReverseConverter, CountryReverseError
from babelfish.converters import CaseInsensitiveDict
class GuessCountryConverter(CountryReverseConverter):
    """Country converter that resolves configured synonyms before country names."""

    def __init__(self, config: typing.Mapping[str, str]):
        """Store ``config`` (synonym -> alpha2 code) in a case-insensitive mapping."""
        self.synonyms = CaseInsensitiveDict(config)

    def convert(self, alpha2):
        """Return the string form of the country identified by ``alpha2``."""
        country = Country(alpha2)
        return str(country)

    def reverse(self, name: str):
        """Return the alpha2 code for ``name``.

        Lookup order: configured synonyms, then ``name`` itself when it is a
        two-letter upper-case code known to babelfish, then the country name.

        :raises CountryReverseError: when none of the lookups succeeds.
        """
        try:
            return self.synonyms[name]
        except KeyError:
            pass

        is_alpha2_shaped = name.isupper() and len(name) == 2
        if is_alpha2_shaped:
            try:
                return Country(name).alpha2
            except ValueError:
                pass

        try:
            return Country.fromname(name).alpha2
        except CountryReverseError:
            pass

        raise CountryReverseError(name)

@ -0,0 +1,30 @@
import typing
from babelfish import Language, LanguageReverseConverter, LanguageReverseError
from babelfish.converters import CaseInsensitiveDict
class GuessLanguageConverter(LanguageReverseConverter):
    """Language converter that resolves configured synonyms before language names."""

    def __init__(self, config: typing.Mapping[str, str]):
        """Index ``config`` (synonym -> language code) in a case-insensitive mapping.

        Codes containing ``-`` are parsed as IETF tags, the others are passed
        straight to ``Language``. Each synonym is stored as a tuple of
        ``(alpha3, country alpha2 or None, script)``.
        """
        self.synonyms = CaseInsensitiveDict()
        for synonym, code in config.items():
            if '-' in code:
                language = Language.fromietf(code)
            else:
                language = Language(code)
            country_code = language.country.alpha2 if language.country else None
            self.synonyms[synonym] = (language.alpha3, country_code, language.script)

    def convert(self, alpha3: str, country=None, script=None):
        """Return the string form of the language built from the given parts."""
        language = Language(alpha3, country, script)
        return str(language)

    def reverse(self, name: str):
        """Return an ``(alpha3, country, script)`` tuple for ``name``.

        Configured synonyms are tried first, then the language name.

        :raises LanguageReverseError: when ``name`` cannot be resolved.
        """
        try:
            return self.synonyms[name]
        except KeyError:
            pass

        try:
            found = Language.fromname(name)
            return found.alpha3, found.country, found.script
        except (ValueError, LanguageReverseError):
            pass

        raise LanguageReverseError(name)

@ -0,0 +1,860 @@
{
"countries": {
"Afghan": "AF",
"Aforika Borwa": "ZA",
"Afrika Borwa": "ZA",
"Afrika Dzonga": "ZA",
"Afurika Tshipembe": "ZA",
"Aland": "AX",
"Alandish": "AX",
"Albanian": "AL",
"Algerian": "DZ",
"American": "US",
"American Islander": "UM",
"American Samoan": "AS",
"American Virgin Islander": "VI",
"Andorran": "AD",
"Angolan": "AO",
"Anguillian": "AI",
"Antarctican": "AQ",
"Antiguan Barbudan": "AG",
"Ao Men": "MO",
"Aotearoa": "NZ",
"Argentine": "AR",
"Armenian": "AM",
"Aruban": "AW",
"Australian": "AU",
"Austrian": "AT",
"Ayiti": "HT",
"Azerbaidzhan": "AZ",
"Azerbaijani": "AZ",
"Azərbaycan": "AZ",
"Bahamian": "BS",
"Bahraini": "BH",
"Bangladeshi": "BD",
"Barbadian": "BB",
"Beafrika": "CF",
"Belarusian": "BY",
"Belau": "PW",
"Belgian": "BE",
"Belgie": "BE",
"Belgien": "BE",
"Belgique": "BE",
"België": "BE",
"Belice": "BZ",
"Belizean": "BZ",
"Beninese": "BJ",
"Bermudian": "BM",
"Bhutanese": "BT",
"Blgariia": "BG",
"Bolivia": "BO",
"Bolivian": "BO",
"Boneiru Sint Eustatius y Saba": "BQ",
"Bosna i Hercegovina": "BA",
"Bosna i Khertsegovina": "BA",
"Bosnian Herzegovinian": "BA",
"Bouvetoya": "BV",
"Bouvetøya": "BV",
"Brasil": "BR",
"Brazilian": "BR",
"British": "GB",
"British Virgin Islander": "VG",
"British Virgin Islands": "VG",
"Bruneian": "BN",
"Bulgarian": "BG",
"Buliwya": "BO",
"Burkinabe": "BF",
"Burmese": "MM",
"Burundian": "BI",
"Bénin": "BJ",
"Bêafrîka": "CF",
"Cabo Verde": "CV",
"Cambodian": "KH",
"Cameroonian": "CM",
"Cameroun": "CM",
"Canadian": "CA",
"Cape Verdian": "CV",
"Caribisch Nederland": "BQ",
"Caymanian": "KY",
"Central African": "CF",
"Cesko": "CZ",
"Chadian": "TD",
"Channel Islander": "JE",
"Chilean": "CL",
"Chinese": "CN",
"Christmas Islander": "CX",
"Cocos Islander": "CC",
"Cocos Keeling Islands": "CC",
"Colombian": "CO",
"Comoran": "KM",
"Comores": "KM",
"Congolese": "CD",
"Cook Islander": "CK",
"Costa Rican": "CR",
"Cote dIvoire": "CI",
"Croatian": "HR",
"Cuban": "CU",
"Curacao": "CW",
"Curacaoan": "CW",
"Curaçaoan": "CW",
"Cypriot": "CY",
"Czech": "CZ",
"Côte dIvoire": "CI",
"Danish": "DK",
"Danmark": "DK",
"Deutschland": "DE",
"Dgernesiais": "GG",
"Dgèrnésiais": "GG",
"Ditunga dia Kongu wa Mungalaata": "CD",
"Dominican": "DO",
"Dutch": "NL",
"East Timorese": "TL",
"Ecuadorean": "EC",
"Eesti": "EE",
"Egyptian": "EG",
"Eire": "IE",
"Ellada": "GR",
"Emirati": "AE",
"Equatorial Guinean": "GQ",
"Eritrean": "ER",
"Espana": "ES",
"España": "ES",
"Estados Unidos": "US",
"Estonian": "EE",
"Eswatini": "SZ",
"Ethiopian": "ET",
"Faereyjar": "FO",
"Faeroerne": "FO",
"Falkland Islander": "FK",
"Falkland Islands": "FK",
"Faroese": "FO",
"Fijian": "FJ",
"Filipino": "PH",
"Finnish": "FI",
"Foroyar": "FO",
"French": "FR",
"French Polynesian": "PF",
"Færeyjar": "FO",
"Færøerne": "FO",
"Føroyar": "FO",
"Gabonese": "GA",
"Gambian": "GM",
"Georgian": "GE",
"German": "DE",
"Ghanaian": "GH",
"Greek": "GR",
"Greenlandic": "GL",
"Grenadian": "GD",
"Guadeloupian": "GP",
"Guahan": "GU",
"Guamanian": "GU",
"Guatemalan": "GT",
"Guernesey": "GG",
"Guianan": "GF",
"Guine Bissau": "GW",
"Guine Equatorial": "GQ",
"Guinea Bissauan": "GW",
"Guinea Ecuatorial": "GQ",
"Guinean": "GN",
"Guinee": "GN",
"Guinee equatoriale": "GQ",
"Guiné Bissau": "GW",
"Guiné Equatorial": "GQ",
"Guinée": "GN",
"Guinée équatoriale": "GQ",
"Guyane francaise": "GF",
"Guyane française": "GF",
"Guyanese": "GY",
"Guåhån": "GU",
"Haitian": "HT",
"Hayastan": "AM",
"Haïti": "HT",
"Heard and McDonald Islander": "HM",
"Honduran": "HN",
"Hong Konger": "HK",
"Hrvatska": "HR",
"Hungarian": "HU",
"I Kiribati": "KI",
"Icelander": "IS",
"Indian": "IN",
"Indonesian": "ID",
"Iranian": "IR",
"Iraqi": "IQ",
"Irish": "IE",
"Island": "IS",
"Israeli": "IL",
"Italia": "IT",
"Italian": "IT",
"Ivorian": "CI",
"Jamaican": "JM",
"Jamhuri ya Kidemokrasia ya Kongo": "CD",
"Japanese": "JP",
"Jerri": "JE",
"Jordanian": "JO",
"Jèrri": "JE",
"Kalaallit Nunaat": "GL",
"Kampuchea": "KH",
"Kazakhstani": "KZ",
"Kazakstan": "KZ",
"Kenyan": "KE",
"Kibris": "CY",
"Kirghiz": "KG",
"Kirgiziia": "KG",
"Kittitian or Nevisian": "KN",
"Komori": "KM",
"Kuki Airani": "CK",
"Kupros": "CY",
"Kuwaiti": "KW",
"Kâmpŭchéa": "KH",
"Kıbrıs": "CY",
"Kūki Āirani": "CK",
"La Reunion": "RE",
"La Réunion": "RE",
"Laotian": "LA",
"Latvian": "LV",
"Latvija": "LV",
"Lebanese": "LB",
"Letzebuerg": "LU",
"Liban": "LB",
"Liberian": "LR",
"Libyan": "LY",
"Liechtensteiner": "LI",
"Lietuva": "LT",
"Lithuanian": "LT",
"Luxembourger": "LU",
"Luxemburg": "LU",
"Lëtzebuerg": "LU",
"Macanese": "MO",
"Macau": "MO",
"Macedonian": "MK",
"Madagasikara": "MG",
"Magyarorszag": "HU",
"Magyarország": "HU",
"Mahoran": "YT",
"Majel": "MH",
"Makedonija": "MK",
"Makedonski": "MK",
"Malagasy": "MG",
"Malawian": "MW",
"Malaysian": "MY",
"Malaŵi": "MW",
"Maldivan": "MV",
"Malian": "ML",
"Maltese": "MT",
"Mannin": "IM",
"Manx": "IM",
"Marshallese": "MH",
"Martinican": "MQ",
"Maurice": "MU",
"Mauritanian": "MR",
"Mauritian": "MU",
"Mexican": "MX",
"Micronesia": "FM",
"Micronesian": "FM",
"Mocambique": "MZ",
"Moldova": "MD",
"Moldovan": "MD",
"Monegasque": "MC",
"Mongol uls": "MN",
"Mongolian": "MN",
"Montenegrin": "ME",
"Montserratian": "MS",
"Moris": "MU",
"Moroccan": "MA",
"Mosotho": "LS",
"Motswana": "BW",
"Mozambican": "MZ",
"Moçambique": "MZ",
"Mzantsi Afrika": "ZA",
"México": "MX",
"M̧ajeļ": "MH",
"Na Islas Marianas": "MP",
"Na Islas Mariånas": "MP",
"Namibian": "NA",
"Namibie": "NA",
"Namibië": "NA",
"Nauruan": "NR",
"Nederland": "NL",
"Negara Brunei Darussalam": "BN",
"Nepalese": "NP",
"New Caledonian": "NC",
"New Zealander": "NZ",
"Ni Vanuatu": "VU",
"Nicaraguan": "NI",
"Nigerian": "NG",
"Nigerien": "NE",
"Ningizimu Afrika": "ZA",
"Niuean": "NU",
"Niuē": "NU",
"Noreg": "NO",
"Norfk Ailen": "NF",
"Norfolk Islander": "NF",
"Norge": "NO",
"Norgga": "NO",
"North Korean": "KP",
"Norwegian": "NO",
"Nouvelle Caledonie": "NC",
"Nouvelle Calédonie": "NC",
"Omani": "OM",
"Osterreich": "AT",
"Owganystan": "AF",
"Ozbekiston": "UZ",
"Ozbekiston": "UZ",
"Pais Korsou": "CW",
"Pais Kòrsou": "CW",
"Pakistani": "PK",
"Palauan": "PW",
"Palestinian": "PS",
"Panamanian": "PA",
"Panamá": "PA",
"Papua New Guinean": "PG",
"Papua Niu Gini": "PG",
"Papua Niugini": "PG",
"Paraguai": "PY",
"Paraguayan": "PY",
"Paraguái": "PY",
"Peruvian": "PE",
"Perú": "PE",
"Pilipinas": "PH",
"Piruw": "PE",
"Pitcairn Islander": "PN",
"Pitcairn Islands": "PN",
"Polish": "PL",
"Polska": "PL",
"Polynesie francaise": "PF",
"Polynésie française": "PF",
"Portuguese": "PT",
"Puerto Rican": "PR",
"Qatari": "QA",
"RD Congo": "CD",
"Repubilika ya Kongo": "CG",
"Repubilika ya Kongo Demokratiki": "CD",
"Republica Dominicana": "DO",
"Republiki ya Kongo": "CG",
"Republiki ya Kongo Demokratiki": "CD",
"Republiki ya Kongó Demokratiki": "CD",
"Republique centrafricaine": "CF",
"Republique du Congo": "CG",
"Republíki ya Kongó": "CG",
"República Dominicana": "DO",
"Reunionese": "RE",
"Ri Ben": "JP",
"Romanian": "RO",
"România": "RO",
"Rossiia": "RU",
"Russian": "RU",
"Rwandan": "RW",
"République centrafricaine": "CF",
"République du Congo": "CG",
"Réunionese": "RE",
"Sahara Occidental": "EH",
"Sahrawi": "EH",
"Saint Barthelemy": "BL",
"Saint Barthelemy Islander": "BL",
"Saint Barthélemy Islander": "BL",
"Saint Helena Ascension and Tristan da Cunha": "SH",
"Saint Helenian": "SH",
"Saint Lucian": "LC",
"Saint Martin": "MF",
"Saint Martin Islander": "MF",
"Saint Pierrais Miquelonnais": "PM",
"Saint Pierre et Miquelon": "PM",
"Saint Vincentian": "VC",
"Salvadoran": "SV",
"Sammarinese": "SM",
"Samoa Amelika": "AS",
"Samoan": "WS",
"Sao Tome e Principe": "ST",
"Sao Tomean": "ST",
"Saudi Arabian": "SA",
"Schweiz": "CH",
"Senegalese": "SN",
"Serbian": "RS",
"Sesel": "SC",
"Sewula Afrika": "ZA",
"Seychellois": "SC",
"Shqiperia": "AL",
"Shqipëria": "AL",
"Sierra Leonean": "SL",
"Singaporean": "SG",
"Singapura": "SG",
"Sint Maarten": "SX",
"Slovak": "SK",
"Slovene": "SI",
"Slovenija": "SI",
"Slovensko": "SK",
"Solomon Islander": "SB",
"Somali": "SO",
"Soomaaliya": "SO",
"South African": "ZA",
"South Georgia": "GS",
"South Georgian South Sandwich Islander": "GS",
"South Korean": "KR",
"South Sudanese": "SS",
"Spanish": "ES",
"Srbija": "RS",
"Sri Lankan": "LK",
"St Maartener": "SX",
"Sudanese": "SD",
"Suisse": "CH",
"Suomi": "FI",
"Surinamer": "SR",
"Svalbard og Jan Mayen": "SJ",
"Sverige": "SE",
"Svizra": "CH",
"Svizzera": "CH",
"Swazi": "SZ",
"Swedish": "SE",
"Swiss": "CH",
"Syrian": "SY",
"São Tomé e Príncipe": "ST",
"Sénégal": "SN",
"Sāmoa": "WS",
"Sāmoa Amelika": "AS",
"Tadzhik": "TJ",
"Tadzhikistan": "TJ",
"Tai Wan": "TW",
"Taiwanese": "TW",
"Tanzania": "TZ",
"Tanzanian": "TZ",
"Tchad": "TD",
"Terres australes et antarctiques francaises": "TF",
"Terres australes et antarctiques françaises": "TF",
"Thai": "TH",
"Timor Leste": "TL",
"Timór Leste": "TL",
"Tochikiston": "TJ",
"Togolese": "TG",
"Tokelauan": "TK",
"Tongan": "TO",
"Trinidadian": "TT",
"Tsrna Gora": "ME",
"Tunisian": "TN",
"Turkish": "TR",
"Turkiye": "TR",
"Turkmen": "TM",
"Turkmeniia": "TM",
"Turks and Caicos Islander": "TC",
"Tuvaluan": "TV",
"Türkiye": "TR",
"Türkmenistan": "TM",
"UK": "GB",
"US": "US",
"Uburundi": "BI",
"Ugandan": "UG",
"Ukrainian": "UA",
"Ukrayina": "UA",
"United States Virgin Islands": "VI",
"Uruguayan": "UY",
"Uzbekistani": "UZ",
"Vatican": "VA",
"Vaticanae": "VA",
"Vaticano": "VA",
"Vaticanæ": "VA",
"Venezuela": "VE",
"Venezuelan": "VE",
"Vietnam": "VN",
"Vietnamese": "VN",
"Viti": "FJ",
"Việt Nam": "VN",
"Volivia": "BO",
"Volívia": "BO",
"Wallis and Futuna Islander": "WF",
"Wallis et Futuna": "WF",
"Wuliwya": "BO",
"Xiang Gang": "HK",
"Xin Jia Po": "SG",
"Yemeni": "YE",
"Zambian": "ZM",
"Zhong Guo": "CN",
"Zhong Guo Da Lu": "CN",
"Zimbabwean": "ZW",
"`mn": "OM",
"baaNlaadesh": "BD",
"bbaart nuuN": "IN",
"bhaart": "IN",
"brug-yul-": "BT",
"canadien": "CA",
"cingkppuur": "SG",
"dhivehiraajeyge": "MV",
"eSwatini": "SZ",
"eereteraa": "ER",
"fGnstn": "AF",
"flsTyn": "PS",
"hangug": "KR",
"ilngkai": "LK",
"intiyaa": "IN",
"joseon": "KP",
"jybwty": "DJ",
"khoemry": "IQ",
"lSwml": "SO",
"l`rq": "IQ",
"lbHryn": "BH",
"lbnn": "LB",
"ljzyr": "DZ",
"lkwyt": "KW",
"lmGrb": "MA",
"lqmr": "KM",
"lrdn": "JO",
"lswdn": "SD",
"lyaman": "YE",
"lyby": "LY",
"mSr": "EG",
"mlysy": "MY",
"mnmaa": "MM",
"mwrytny": "MR",
"nepaal": "NP",
"phijii": "FJ",
"pkstn": "PK",
"praethsaithy": "TH",
"qTr": "QA",
"qwutnA": "IQ",
"rtry": "ER",
"sak`art`velo": "GE",
"shrii lNkaav": "LK",
"spplaaw": "LA",
"sryyl": "IL",
"swry": "SY",
"teyopheyaa": "ET",
"tshd": "TD",
"twns": "TN",
"ySHrAl": "IL",
"yrn": "IR",
"Åland": "AX",
"Ålandish": "AX",
"Éire": "IE",
"Ísland": "IS",
"Österreich": "AT",
"Česko": "CZ",
"Ελλάδα": "GR",
"Κύπρος": "CY",
"Азербайджан": "AZ",
"Белару́сь": "BY",
"Беларусь": "BY",
"Боснa и Херцеговина": "BA",
"България": "BG",
"Казахстан": "KZ",
"Киргизия": "KG",
"Кыргызстан": "KG",
"Македонија": "MK",
"Македонски": "MK",
"Монгол улс": "MN",
"Россия": "RU",
"Србија": "RS",
"Таджикистан": "TJ",
"Тоҷикистон": "TJ",
"Туркмения": "TM",
"Узбекистан": "UZ",
"Україна": "UA",
"Црна Гора": "ME",
"Қазақстан": "KZ",
"Հայաստան": "AM",
"ישראל": "IL",
"إرتريا‎": "ER",
"إسرائيل": "IL",
"افغانستان": "AF",
"الأردن": "JO",
"البحرين": "BH",
"الجزائر": "DZ",
"السعودية": "SA",
"السودان": "SD",
"الصحراء الغربية": "EH",
"الصومال‎‎": "SO",
"العراق": "IQ",
"العربية السعودية": "SA",
"القمر‎": "KM",
"الكويت": "KW",
"المغرب": "MA",
"اليَمَن": "YE",
"ایران": "IR",
"تشاد‎": "TD",
"تونس": "TN",
"جيبوتي‎": "DJ",
"دولة الإمارات العربية المتحدة": "AE",
"سوريا": "SY",
"عمان": "OM",
"فلسطين": "PS",
"قطر": "QA",
"لبنان": "LB",
"ليبيا": "LY",
"مصر": "EG",
"مليسيا": "MY",
"موريتانيا": "MR",
"پاكستان": "PK",
"کۆماری": "IQ",
"ܩܘܼܛܢܵܐ": "IQ",
"ދިވެހިރާއްޖޭގެ": "MV",
"नेपाल": "NP",
"फिजी": "FJ",
"भारत": "IN",
"বাংলাদেশ": "BD",
"ভারত": "IN",
"ਭਾਰਤ ਨੂੰ": "IN",
"இந்தியா": "IN",
"இலங்கை": "LK",
"சிங்கப்பூர்": "SG",
"ශ්‍රී ලංකාව": "LK",
"ประเทศไทย": "TH",
"ສປປລາວ": "LA",
"འབྲུག་ཡུལ་": "BT",
"မြန်မာ": "MM",
"საქართველო": "GE",
"ኢትዮጵያ": "ET",
"ኤርትራ": "ER",
"ⵍⵎⴰⵖⵔⵉⴱ": "MA",
"中国": "CN",
"中国大陆": "CN",
"台灣": "TW",
"新加坡": "SG",
"日本": "JP",
"澳门": "MO",
"香港": "HK",
"조선": "KP",
"한국": "KR"
},
"ignored": [
"bit",
"cc",
"ch",
"dan",
"day",
"gun",
"hr",
"jordan",
"la",
"ma",
"na",
"the",
"to"
],
"implicit-languages": {
"419": "es-419",
"BR": "pt-BR",
"CA": "fr-CA",
"Cantonese": "zh",
"Castilian": "es",
"FR": "fr-FR",
"GR": "ell",
"HK": "zh-HK",
"ID": "id-ID",
"Mandarin": "zh",
"Parisian": "fr-FR",
"Simplified": "zh-Hans",
"Traditional": "zh-Hant",
"UA": "uk-UA",
"UK": "en-GB",
"US": "en-US",
"VFF": "fr-FR",
"VFQ": "fr-CA",
"VN": "vie",
"cant": "zh",
"eng": "en",
"ita": "it",
"简体双语": "zh-Hans",
"繁体双语": "zh-Hant"
},
"languages": {
"Adygebze": "ady",
"Avanee": "grn",
"Avañeẽ": "grn",
"Aymar aru": "aym",
"Azərbaycan dili": "aze",
"Bahasa Indonesia": "ind",
"Bahasa Melayu": "msa",
"Basa Jawa": "jav",
"Basa Sunda": "sun",
"Belaruskaia": "bel",
"Blgarski": "bul",
"Bosanski": "bos",
"Brezhoneg": "bre",
"Catala": "cat",
"Català": "cat",
"Cestina": "ces",
"Cymraeg": "cym",
"Dansk": "dan",
"Davvisamegiella": "sme",
"Davvisámegiella": "sme",
"Deutsch": "deu",
"Dolnoserbscina": "dsb",
"Dolnoserbšćina": "dsb",
"Eesti": "est",
"Ellenika": "ell",
"Espanol": "spa",
"Espanol Latinoamerica": "es-419",
"Español": "spa",
"Español Latinoamérica": "es-419",
"Euskara": "eus",
"Foroyskt": "fao",
"Francais": "fra",
"Français": "fra",
"Frysk": "fry",
"Føroyskt": "fao",
"Gaeilge": "gle",
"Gaelg": "glv",
"Gaidhlig": "gla",
"Galego": "glg",
"Greek": "ell",
"Guang Dong Hua ": "zho",
"Gàidhlig": "gla",
"Hayeren": "hye",
"Hornjoserbscina": "hsb",
"Hornjoserbšćina": "hsb",
"Hrvatski": "hrv",
"Islenska": "isl",
"Italiano": "ita",
"Kazaksha": "kaz",
"Kernewek": "cor",
"Kiswahili": "swa",
"Kreyol": "hat",
"Kreyòl": "hat",
"Kurdi": "kur",
"Kurdî": "kur",
"Latviesu": "lav",
"Latviešu": "lav",
"Lemborgs": "lim",
"Letzebuergesch": "ltz",
"Lietuviu": "lit",
"Lietuvių": "lit",
"Lwo": "ach",
"Lèmbörgs": "lim",
"Lëtzebuergesch": "ltz",
"Magyar": "hun",
"Makedonski": "mkd",
"Malay": "msa",
"Malti": "mlt",
"Maya Kaqchikel": "cak",
"Melayu": "msa",
"Mongol": "mon",
"Nederlands": "nld",
"Norsk": "nor",
"Norsk bokmal": "nob",
"Norsk bokmål": "nob",
"Norsk nynorsk": "nno",
"Occitan": "oci",
"Ozbek": "uzb",
"Polski": "pol",
"Portugues": "por",
"Português": "por",
"Qhichwa": "que",
"Ri Ben Yu": "jpn",
"Romana": "ron",
"Română": "ron",
"Rumantsch": "roh",
"Russkii": "rus",
"Shqip": "sqi",
"Slovencina": "slk",
"Slovenscina": "slv",
"Slovenčina": "slk",
"Slovenščina": "slv",
"Soomaaliga": "som",
"Srpski": "srp",
"Suomi": "fin",
"Svenska": "swe",
"Taqbaylit": "kab",
"TcYi": "aka",
"Tieng Viet": "vie",
"Tiếng Việt": "vie",
"Turkce": "tur",
"Türkçe": "tur",
"Tɕɥi": "aka",
"Ukrayinska": "ukr",
"Zhong Wen": "zho",
"Zhong Wen Fan Ti": "zh-Hant",
"Zhong Wen Jian Ti": "zh-Hans",
"`bryt": "heb",
"aithy": "tha",
"baaNlaa": "ben",
"bhaasaakhmaer": "khm",
"bmaackaa": "mya",
"eesti keel": "est",
"frsy": "fas",
"gujraatii": "guj",
"hangugeo": "kor",
"hindii": "hin",
"isiXhosa": "xho",
"isiZulu": "zul",
"k`art`uli": "kat",
"knndd": "kan",
"maithilii maithilii": "mai",
"mlyaallN": "mal",
"mraatthii": "mar",
"nepaalii": "nep",
"oddiaa": "ori",
"pNjaabii": "pan",
"pStw": "pus",
"phaasaaaithy": "tha",
"rdw": "urd",
"sNskRtm": "san",
"siNhl": "sin",
"srpskokhrvatski": "hbs",
"tatarcha": "tat",
"telugu": "tel",
"tlhIngan Hol": "tlh",
"tmilll": "tam",
"tochiki": "tgk",
"yyidySH": "yid",
"zaboni tochiki": "tgk",
"Íslenska": "isl",
"Čeština": "ces",
"Ελληνικά": "ell",
"Адыгэбзэ": "ady",
"Беларуская": "bel",
"Български": "bul",
"Македонски": "mkd",
"Монгол": "mon",
"Русский": "rus",
"Српски": "srp",
"Українська": "ukr",
"забо́ни тоҷикӣ́": "tgk",
"српскохрватски": "hbs",
"татарча": "tat",
"тоҷикӣ": "tgk",
"Қазақша": "kaz",
"Հայերեն": "hye",
"ייִדיש": "yid",
"עברית": "heb",
"اردو": "urd",
"العربية": "ara",
"فارسی": "fas",
"پښتو": "pus",
"नेपाली": "nep",
"मराठी": "mar",
"मैथिली মৈথিলী": "mai",
"संस्कृतम्": "san",
"हिन्दी": "hin",
"বাংলা": "ben",
"ਪੰਜਾਬੀ": "pan",
"ગુજરાતી": "guj",
"ଓଡ଼ିଆ": "ori",
"தமிழ்": "tam",
"తెలుగు": "tel",
"ಕನ್ನಡ": "kan",
"മലയാളം": "mal",
"සිංහල": "sin",
"ภาษาไทย": "tha",
"ไทย": "tha",
"ဗမာစကာ": "mya",
"ქართული": "kat",
"ភាសាខ្មែរ": "khm",
"中文": "zho",
"中文简体": "zh-Hans",
"中文繁體": "zh-Hant",
"廣東話": "zho",
"日本語": "jpn",
"한국어": "kor"
},
"regions": {
"Latin": "419",
"Latinoamerica": "419",
"Latinoamericano": "419",
"Latinoamérica": "419"
},
"scripts": {
"Fan Ti ": "Hant",
"Jian Ti ": "Hans",
"Simplified": "Hans",
"Traditional": "Hant",
"简体": "Hans",
"繁體": "Hant"
}
}

@ -0,0 +1,169 @@
import typing
from babelfish import (
COUNTRIES,
Country,
CountryReverseError,
LANGUAGE_MATRIX,
Language,
LanguageReverseError,
SCRIPTS,
Script,
country_converters,
language_converters
)
from babelfish.converters import CaseInsensitiveDict
from rebulk import Rebulk
from rebulk.match import Match
from trakit.config import Config
from trakit.context import Context
from trakit.converters.country import GuessCountryConverter
from trakit.converters.language import GuessLanguageConverter
from trakit.words import blank_match, blank_release_names, to_combinations, to_match, to_sentence, to_words
class LanguageFinder:
    """Find a language, optionally qualified by country, region or script, in text.

    Lookups go through the babelfish ``guess`` converters that the constructor
    registers, plus the alias tables provided by the configuration.
    """

    def __init__(self, config: Config):
        """Compute the word-combination limits and register the ``guess`` converters.

        Besides setting instance attributes, this writes into the ``SCRIPTS``,
        ``country_converters`` and ``language_converters`` objects imported
        from babelfish.
        """
        # Each limit is the highest number of spaces found in any name, never below 1.
        self.country_max_words = max([1, *(name.count(' ') for name in COUNTRIES.values())])
        self.language_max_words = max([1, *(entry.name.count(' ') for entry in LANGUAGE_MATRIX)])
        self.script_max_words = max([1, *(key.count(' ') for key in config.scripts.keys())])
        self.region_max_words = max([1, *(key.count(' ') for key in config.regions.keys())])

        SCRIPTS['419'] = 'Latin America and the Caribbean'  # Until babelfish supports UN.M49
        country_converters['guess'] = GuessCountryConverter(config.countries)
        language_converters['guess'] = GuessLanguageConverter(config.languages)

        self.regions = CaseInsensitiveDict(config.regions)
        self.scripts = CaseInsensitiveDict(config.scripts)
        self.common_words = CaseInsensitiveDict({word: 0 for word in config.ignored})
        self.implicit = CaseInsensitiveDict(config.implicit_languages)

    def _find_country(self, value: str):
        """Return a match for the first word combination that resolves to a country, else None."""
        for words in to_combinations(to_words(value), self.country_max_words):
            candidate = to_sentence(words)
            try:
                return to_match(words, Country.fromguess(candidate))
            except CountryReverseError:
                pass
        return None

    def _find_script(self, value: str):
        """Return a match for the first word combination that resolves to a script, else None."""
        for words in to_combinations(to_words(value), self.script_max_words):
            candidate = to_sentence(words)
            # Configured aliases are translated first; unknown text is tried as a raw code.
            script_code = self.scripts.get(candidate, candidate)
            try:
                return to_match(words, Script(script_code))
            except ValueError:
                pass
        return None

    def _find_region(self, value: str):
        """Return a match for the first word combination that resolves to a region, else None.

        Regions are represented with ``Script`` objects (see the ``SCRIPTS['419']``
        entry added by the constructor).
        """
        for words in to_combinations(to_words(value), self.region_max_words):
            candidate = to_sentence(words)
            region_code = self.regions.get(candidate, candidate)
            try:
                return to_match(words, Script(region_code))
            except ValueError:
                pass
        return None

    def _find_implicit_language(self, combinations: typing.List[typing.List[Match]]):
        """Return a match for the first combination that implies a language, else None.

        A combination implies a language when its sentence, the code of the
        region it names, or the alpha2 code of the country it names is a key
        of the implicit-languages table.
        """
        for words in combinations:
            sentence = to_sentence(words)
            if sentence in self.implicit:
                return to_match(words, Language.fromietf(self.implicit[sentence]))

            region = self._find_region(sentence)
            if region and region.value.code in self.implicit:
                implied = Language.fromietf(self.implicit[region.value.code])
                return Match(region.start, region.end, value=implied, input_string=region.input_string)

            try:
                country = Country.fromguess(sentence)
                if country.alpha2 in self.implicit:
                    implied = Language.fromietf(self.implicit[country.alpha2])
                    # When the text is the language name itself, use the plain language.
                    if implied.name.lower() == sentence.lower():
                        implied = Language.fromname(sentence)
                    return to_match(words, implied)
            except CountryReverseError:
                pass
        return None

    def accept_word(self, string: str):
        """Return True unless the word is an ignored common word or is numeric."""
        return not (string.lower() in self.common_words or string.isnumeric())

    def _with_qualifiers(self, lang, remaining_sentence: str):
        """Return ``lang`` completed with the first country, region or script found.

        ``remaining_sentence`` is scanned combination by combination; the scan
        stops at the first combination that yields a country, a region or a
        script. ``lang`` is returned unchanged when nothing is found.
        """
        for words in to_combinations(to_words(remaining_sentence), self.country_max_words):
            sentence = to_sentence(words)

            country = self._find_country(sentence)
            if country:
                try:
                    # discard country if value is actually the language name
                    Language.fromguess(country.raw)
                except LanguageReverseError:
                    return Language(lang.alpha3, country=country.value, script=lang.script)
                return lang

            region = self._find_region(sentence)
            if region:
                return Language(lang.alpha3, country=lang.country, script=region.value)

            script = self._find_script(sentence)
            if script:
                return Language(lang.alpha3, country=lang.country, script=script.value)
        return lang

    def find_language(self, value: str, context: Context):
        """Return a match holding the language found in ``value``, or None.

        Release-name-like runs are blanked first. An implicit language is
        looked up, then explicit languages are guessed from the word
        combinations and completed with a country, region or script taken from
        the rest of the text. Only languages accepted by ``context`` are
        returned.
        """
        value = blank_release_names(value)
        combinations = to_combinations(to_words(value, predicate=self.accept_word), self.language_max_words)
        implicit_lang = self._find_implicit_language(combinations)
        implicit_accepted = implicit_lang and context.accept(implicit_lang.value)

        if implicit_accepted:
            implicit_script = implicit_lang.value.script
            # A numeric script code is a region (e.g. 419): nothing more to refine.
            if implicit_script and implicit_script.code.isnumeric():
                return implicit_lang
        elif implicit_lang:
            # The implicit language was rejected: blank it and search the rest.
            value = blank_match(implicit_lang)
            combinations = to_combinations(to_words(value, predicate=self.accept_word), self.language_max_words)

        for words in combinations:
            try:
                lang = Language.fromguess(to_sentence(words))
            except LanguageReverseError:
                continue

            remaining_sentence = blank_match(to_match(words, lang))
            lang = self._with_qualifiers(lang, remaining_sentence)

            same_as_implicit = implicit_accepted and implicit_lang.value.alpha3 == lang.alpha3
            if same_as_implicit and not (lang.country or lang.script):
                return implicit_lang

            if context.accept(lang):
                return to_match(words, lang)

        if implicit_accepted:
            return implicit_lang
        return None

    def find(self, value: str, context: Context):
        """Return ``(start, end, {'value': language})`` for the match found, or None."""
        match = self.find_language(value, context)
        if not match:
            return None
        return match.start, match.end, {'value': match.value}
def language(config: Config):
    """Build a Rebulk whose single functional rule, named 'language', uses a LanguageFinder."""
    rules = Rebulk()
    finder = LanguageFinder(config)
    rules.functional(finder.find, name='language')
    return rules

@ -0,0 +1,32 @@
import re
from functools import partial
from rebulk import Rebulk
from rebulk.validators import chars_surround
from trakit.config import Config
from trakit.language import language
from trakit.words import seps
def configure(config: Config):
    """Build the top-level Rebulk: the language rules followed by the flag rules.

    The flag rules are registered with case-insensitive matching and with
    ``chars_surround`` bound to the separator set as their validator.
    """
    seps_surround = partial(chars_surround, seps)
    # In regex patterns a '-' is replaced by a character class of every separator.
    separator_class = rf'[{re.escape("".join(seps))}]'

    others = Rebulk()
    others.defaults(ignore_case=True, validator=seps_surround)
    others.regex_defaults(flags=re.IGNORECASE,
                          abbreviations=[(r'-', separator_class)],
                          validator=seps_surround)

    # (pattern, rule name, value) for each literal rule, in registration order.
    string_rules = (
        ('forced', 'forced', True),
        ('commentary', 'commentary', True),
        ('external', 'external', True),
        ('sdh', 'hearing_impaired', True),
        ('alternate', 'version', 'alternate'),
        ('descriptive', 'descriptive', True),
    )
    for pattern, rule_name, rule_value in string_rules:
        others.string(pattern, name=rule_name, value=rule_value)
    others.regex('cc', 'closed-captions?', name='closed_caption', value=True)

    rebulk = Rebulk()
    rebulk.rebulk(language(config))
    rebulk.rebulk(others)
    return rebulk

@ -0,0 +1,99 @@
import re
import typing
from rebulk.match import Match
# Characters that separate words: space, brackets, common punctuation and the
# fullwidth parentheses U+FF08 / U+FF09.
seps = frozenset(r' [](){}+*|=-_~#/\\.,;:' + '\uff08\uff09')
# Characters dropped from words instead of splitting them (the apostrophe).
suppress_chars = frozenset("'")
# A run of at least three dot-separated segments, none containing a dot or
# whitespace (e.g. ``Some.Release.Name``), captured in the ``release`` group.
release_name_re = re.compile(r'(?P<release>[^\.\s]+(?:\.[^\.\s]+){2,})')
def to_words(value: str,
             separators: typing.FrozenSet[str] = seps,
             ignore_chars: typing.FrozenSet[str] = suppress_chars,
             predicate: typing.Callable[[str], bool] = lambda x: True):
    """Split ``value`` into word matches.

    Characters in ``ignore_chars`` are left out of the words; characters in
    ``separators`` end the current word. ``predicate`` receives the raw slice
    of ``value`` covering a word; rejected words are not returned and are
    overwritten with spaces in the ``input_string`` attached to every
    returned match.
    """
    blanked = value
    found: typing.List[Match] = []
    letters: typing.List[str] = []
    word_start = 0
    position = 0  # 1-based index of the current character; 0 for empty input
    for position, char in enumerate(value, start=1):
        if char in ignore_chars:
            continue
        if char not in separators:
            letters.append(char)
            continue
        if not letters:
            # Separator with no pending word: the next word starts after it.
            word_start = position
            continue

        word_end = position - 1  # index of the separator, i.e. exclusive end
        if predicate(value[word_start:word_end]):
            found.append(Match(word_start, word_end, value=''.join(letters)))
        else:
            blanked = blank(blanked, word_start, word_end)
        letters = []
        word_start = position

    # A word still pending when the input ends runs to the end of the string.
    if letters:
        if predicate(value[word_start:]):
            found.append(Match(word_start, position, value=''.join(letters)))
        else:
            blanked = blank(blanked, word_start, len(blanked))

    for item in found:
        item.input_string = blanked
    return found
def to_combinations(words: typing.List[Match], max_items: int):
    """Return every run of consecutive words, longest runs first.

    Run sizes go from ``min(max_items, len(words))`` down to 1; within one
    size the runs are ordered by their starting position.
    """
    total = len(words)
    largest = min(max_items, total)
    results: typing.List[typing.List[Match]] = []
    for size in range(largest, 0, -1):
        results.extend(words[offset:offset + size] for offset in range(total - size + 1))
    return results
def to_sentence(combination: typing.List[Match]):
    """Join the values of the given matches with single spaces."""
    return ' '.join(item.value for item in combination)
def to_match(combination: typing.List[Match], value: typing.Any):
    """Build one match spanning the whole combination and carrying ``value``.

    The span runs from the start of the first word to the end of the last one;
    the input string is taken from the first word.
    """
    first, last = combination[0], combination[-1]
    return Match(first.start, last.end, value=value, input_string=first.input_string)
def blank(string: str, start: int, end: int):
    """Return ``string`` with the characters in ``[start, end)`` replaced by spaces."""
    padding = ' ' * (end - start)
    return string[:start] + padding + string[end:]
def blank_match(match: Match):
    """Return the match's input string with the matched span replaced by spaces."""
    text = match.input_string
    return blank(text, match.start, match.end)
def blank_release_names(value: str):
    """Return ``value`` with every release-name-like run replaced by spaces.

    Blanking keeps the string length, so positions found in ``value`` remain
    valid in the result.
    """
    result = value
    for found in release_name_re.finditer(value):
        result = blank(result, found.start('release'), found.end('release'))
    return result

@ -17,7 +17,7 @@ ga4mp==2.0.4
guess_language-spirit==0.5.3
guessit==3.5.0
jsonschema==4.17.0
knowit==0.4.0
knowit==0.5.2
peewee==3.15.3
py-pretty==1
pycountry==22.3.5
@ -80,8 +80,9 @@ zipp==3.10.0
markupsafe==2.1.1
# Required-by: knowit
pymediainfo==5.1.0
pymediainfo==6.0.1
pyyaml==6.0
trakit==0.2.1
# Required-by: python-socketio
bidict==0.22.0

Loading…
Cancel
Save