Trying to fix Segmentation fault caused by mediainfo in docker container. #2098

pull/2110/head v1.2.1-beta.9
morpheus65535 1 year ago
parent 7136383098
commit 7455496c4c

@ -1,10 +1,9 @@
"""Know your media files better.""" """Know your media files better."""
__title__ = 'knowit' __title__ = 'knowit'
__version__ = '0.4.0' __version__ = '0.5.2'
__short_version__ = '.'.join(__version__.split('.')[:2]) __short_version__ = '0.5'
__author__ = 'Rato AQ2' __author__ = 'Rato AQ2'
__license__ = 'MIT' __license__ = 'MIT'
__copyright__ = 'Copyright 2016-2021, Rato AQ2'
__url__ = 'https://github.com/ratoaq2/knowit' __url__ = 'https://github.com/ratoaq2/knowit'
#: Video extensions #: Video extensions

@ -169,7 +169,7 @@ def dumps(
return convert(info, context) return convert(info, context)
def main(args: typing.List[str] = None) -> None: def main(args: typing.Optional[typing.List[str]] = None) -> None:
"""Execute main function for entry point.""" """Execute main function for entry point."""
argument_parser = build_argument_parser() argument_parser = build_argument_parser()
args = args or sys.argv[1:] args = args or sys.argv[1:]

@ -65,7 +65,7 @@ def know(
raise KnowitException(debug_info(context=context, exc_info=True)) raise KnowitException(debug_info(context=context, exc_info=True))
def dependencies(context: typing.Mapping = None) -> typing.Mapping: def dependencies(context: typing.Optional[typing.Mapping] = None) -> typing.Mapping:
"""Return all dependencies detected by knowit.""" """Return all dependencies detected by knowit."""
deps = {} deps = {}
try: try:

@ -63,6 +63,17 @@ class Property(Reportable[T]):
# Used to detect duplicated values. e.g.: en / en or High@L4.0 / High@L4.0 or Progressive / Progressive # Used to detect duplicated values. e.g.: en / en or High@L4.0 / High@L4.0 or Progressive / Progressive
self.delimiter = delimiter self.delimiter = delimiter
@classmethod
def _extract_value(cls,
track: typing.Mapping,
name: str,
names: typing.List[str]):
if len(names) == 2:
parent_value = track.get(names[0], track.get(names[0].upper(), {}))
return parent_value.get(names[1], parent_value.get(names[1].upper()))
return track.get(name, track.get(name.upper()))
def extract_value( def extract_value(
self, self,
track: typing.Mapping, track: typing.Mapping,
@ -71,7 +82,7 @@ class Property(Reportable[T]):
"""Extract the property value from a given track.""" """Extract the property value from a given track."""
for name in self.names: for name in self.names:
names = name.split('.') names = name.split('.')
value = track.get(names[0], {}).get(names[1]) if len(names) == 2 else track.get(name) value = self._extract_value(track, name, names)
if value is None: if value is None:
if self.default is None: if self.default is None:
continue continue
@ -216,9 +227,10 @@ class MultiValue(Property):
class Rule(Reportable[T]): class Rule(Reportable[T]):
"""Rule abstract class.""" """Rule abstract class."""
def __init__(self, name: str, override=False, **kwargs): def __init__(self, name: str, private=False, override=False, **kwargs):
"""Initialize the object.""" """Initialize the object."""
super().__init__(name, **kwargs) super().__init__(name, **kwargs)
self.private = private
self.override = override self.override = override
def execute(self, props, pv_props, context: typing.Mapping): def execute(self, props, pv_props, context: typing.Mapping):

@ -455,46 +455,46 @@ profiles:
VideoProfileLevel: VideoProfileLevel:
L1: L1:
default: "1" default: '1'
technical: Level 1 technical: Level 1
L11: L11:
default: "1.1" default: '1.1'
technical: Level 1.1 technical: Level 1.1
L13: L13:
default: "1.3" default: '1.3'
technical: Level 1.3 technical: Level 1.3
L2: L2:
default: "2" default: '2'
technical: Level 2 technical: Level 2
L21: L21:
default: "2.1" default: '2.1'
technical: Level 2.1 technical: Level 2.1
L22: L22:
default: "2.2" default: '2.2'
technical: Level 2.2 technical: Level 2.2
L3: L3:
default: "3" default: '3'
technical: Level 3 technical: Level 3
L31: L31:
default: "3.1" default: '3.1'
technical: Level 3.1 technical: Level 3.1
L32: L32:
default: "3.2" default: '3.2'
technical: Level 3.2 technical: Level 3.2
L4: L4:
default: "4" default: '4'
technical: Level 4 technical: Level 4
L41: L41:
default: "4.1" default: '4.1'
technical: Level 4.1 technical: Level 4.1
L42: L42:
default: "4.2" default: '4.2'
technical: Level 4.2 technical: Level 4.2
L5: L5:
default: "5" default: '5'
technical: Level 5 technical: Level 5
L51: L51:
default: "5.1" default: '5.1'
technical: Level 5.1 technical: Level 5.1
LOW: LOW:
default: Low default: Low

@ -106,11 +106,12 @@ class Ratio(Property[Decimal]):
if (width, height) == ('0', '1'): # identity if (width, height) == ('0', '1'): # identity
return Decimal('1.0') return Decimal('1.0')
result = round_decimal(Decimal(width) / Decimal(height), min_digits=1, max_digits=3) if height:
if self.unit: result = round_decimal(Decimal(width) / Decimal(height), min_digits=1, max_digits=3)
result *= self.unit if self.unit:
result *= self.unit
return result return result
self.report(value, context) self.report(value, context)
return None return None

@ -103,10 +103,7 @@ class Provider:
value = prop.extract_value(track, context) value = prop.extract_value(track, context)
if value is not None: if value is not None:
if not prop.private: which = props if not prop.private else pv_props
which = props
else:
which = pv_props
which[name] = value which[name] = value
for name, rule in self.rules.get(track_type, {}).items(): for name, rule in self.rules.get(track_type, {}).items():
@ -116,8 +113,9 @@ class Provider:
value = rule.execute(props, pv_props, context) value = rule.execute(props, pv_props, context)
if value is not None: if value is not None:
props[name] = value which = props if not rule.private else pv_props
elif name in props and not rule.override: which[name] = value
elif name in props and (not rule.override or props[name] is None):
del props[name] del props[name]
return props return props

@ -26,6 +26,7 @@ from knowit.rules import (
LanguageRule, LanguageRule,
ResolutionRule, ResolutionRule,
) )
from knowit.rules.general import GuessTitleRule
from knowit.serializer import get_json_encoder from knowit.serializer import get_json_encoder
from knowit.units import units from knowit.units import units
from knowit.utils import to_dict from knowit.utils import to_dict
@ -83,17 +84,20 @@ class EnzymeProvider(Provider):
}, },
}, { }, {
'video': { 'video': {
'language': LanguageRule('video language'), 'guessed': GuessTitleRule('guessed properties', private=True),
'language': LanguageRule('video language', override=True),
'resolution': ResolutionRule('video resolution'), 'resolution': ResolutionRule('video resolution'),
}, },
'audio': { 'audio': {
'language': LanguageRule('audio language'), 'guessed': GuessTitleRule('guessed properties', private=True),
'language': LanguageRule('audio language', override=True),
'channels': AudioChannelsRule('audio channels'), 'channels': AudioChannelsRule('audio channels'),
}, },
'subtitle': { 'subtitle': {
'language': LanguageRule('subtitle language'), 'guessed': GuessTitleRule('guessed properties', private=True),
'hearing_impaired': HearingImpairedRule('subtitle hearing impaired'), 'language': LanguageRule('subtitle language', override=True),
'closed_caption': ClosedCaptionRule('closed caption'), 'hearing_impaired': HearingImpairedRule('subtitle hearing impaired', override=True),
'closed_caption': ClosedCaptionRule('closed caption', override=True),
} }
}) })
@ -130,7 +134,8 @@ class EnzymeProvider(Provider):
if logger.level == logging.DEBUG: if logger.level == logging.DEBUG:
logger.debug('Video {video_path} scanned using Enzyme {version} has raw data:\n{data}', logger.debug('Video {video_path} scanned using Enzyme {version} has raw data:\n{data}',
video_path=video_path, version=enzyme.__version__, data=json.dumps(data)) video_path=video_path, version=enzyme.__version__,
data=json.dumps(data, cls=get_json_encoder(context), indent=4, ensure_ascii=False))
result = self._describe_tracks(video_path, data.get('info', {}), data.get('video_tracks'), result = self._describe_tracks(video_path, data.get('info', {}), data.get('video_tracks'),
data.get('audio_tracks'), data.get('subtitle_tracks'), context) data.get('audio_tracks'), data.get('subtitle_tracks'), context)

@ -34,6 +34,7 @@ from knowit.rules import (
LanguageRule, LanguageRule,
ResolutionRule, ResolutionRule,
) )
from knowit.rules.general import GuessTitleRule
from knowit.serializer import get_json_encoder from knowit.serializer import get_json_encoder
from knowit.units import units from knowit.units import units
from knowit.utils import ( from knowit.utils import (
@ -77,7 +78,7 @@ class FFmpegExecutor:
def extract_info(self, filename): def extract_info(self, filename):
"""Extract media info.""" """Extract media info."""
json_dump = self._execute(filename) json_dump = self._execute(filename)
return json.loads(json_dump) return json.loads(json_dump) if json_dump else {}
def _execute(self, filename): def _execute(self, filename):
raise NotImplementedError raise NotImplementedError
@ -144,7 +145,7 @@ class FFmpegProvider(Provider):
'id': Basic('index', data_type=int, allow_fallback=True, description='video track number'), 'id': Basic('index', data_type=int, allow_fallback=True, description='video track number'),
'name': Property('tags.title', description='video track name'), 'name': Property('tags.title', description='video track name'),
'language': Language('tags.language', description='video language'), 'language': Language('tags.language', description='video language'),
'duration': Duration('duration', description='video duration'), 'duration': Duration('duration', 'tags.duration', description='video duration'),
'width': Quantity('width', unit=units.pixel), 'width': Quantity('width', unit=units.pixel),
'height': Quantity('height', unit=units.pixel), 'height': Quantity('height', unit=units.pixel),
'scan_type': ScanType(config, 'field_order', default='Progressive', description='video scan type'), 'scan_type': ScanType(config, 'field_order', default='Progressive', description='video scan type'),
@ -153,7 +154,7 @@ class FFmpegProvider(Provider):
'resolution': None, # populated with ResolutionRule 'resolution': None, # populated with ResolutionRule
'frame_rate': Ratio('r_frame_rate', unit=units.FPS, description='video frame rate'), 'frame_rate': Ratio('r_frame_rate', unit=units.FPS, description='video frame rate'),
# frame_rate_mode # frame_rate_mode
'bit_rate': Quantity('bit_rate', unit=units.bps, description='video bit rate'), 'bit_rate': Quantity('bit_rate', 'tags.bps', unit=units.bps, description='video bit rate'),
'bit_depth': Quantity('bits_per_raw_sample', unit=units.bit, description='video bit depth'), 'bit_depth': Quantity('bits_per_raw_sample', unit=units.bit, description='video bit depth'),
'codec': VideoCodec(config, 'codec_name', description='video codec'), 'codec': VideoCodec(config, 'codec_name', description='video codec'),
'profile': VideoProfile(config, 'profile', description='video codec profile'), 'profile': VideoProfile(config, 'profile', description='video codec profile'),
@ -166,13 +167,13 @@ class FFmpegProvider(Provider):
'id': Basic('index', data_type=int, allow_fallback=True, description='audio track number'), 'id': Basic('index', data_type=int, allow_fallback=True, description='audio track number'),
'name': Property('tags.title', description='audio track name'), 'name': Property('tags.title', description='audio track name'),
'language': Language('tags.language', description='audio language'), 'language': Language('tags.language', description='audio language'),
'duration': Duration('duration', description='audio duration'), 'duration': Duration('duration', 'tags.duration', description='audio duration'),
'codec': AudioCodec(config, 'profile', 'codec_name', description='audio codec'), 'codec': AudioCodec(config, 'profile', 'codec_name', description='audio codec'),
'profile': AudioProfile(config, 'profile', description='audio codec profile'), 'profile': AudioProfile(config, 'profile', description='audio codec profile'),
'channels_count': AudioChannels('channels', description='audio channels count'), 'channels_count': AudioChannels('channels', description='audio channels count'),
'channels': None, # populated with AudioChannelsRule 'channels': None, # populated with AudioChannelsRule
'bit_depth': Quantity('bits_per_raw_sample', unit=units.bit, description='audio bit depth'), 'bit_depth': Quantity('bits_per_raw_sample', unit=units.bit, description='audio bit depth'),
'bit_rate': Quantity('bit_rate', unit=units.bps, description='audio bit rate'), 'bit_rate': Quantity('bit_rate', 'tags.bps', unit=units.bps, description='audio bit rate'),
'sampling_rate': Quantity('sample_rate', unit=units.Hz, description='audio sampling rate'), 'sampling_rate': Quantity('sample_rate', unit=units.Hz, description='audio sampling rate'),
'forced': YesNo('disposition.forced', hide_value=False, description='audio track forced'), 'forced': YesNo('disposition.forced', hide_value=False, description='audio track forced'),
'default': YesNo('disposition.default', hide_value=False, description='audio track default'), 'default': YesNo('disposition.default', hide_value=False, description='audio track default'),
@ -190,17 +191,20 @@ class FFmpegProvider(Provider):
}, },
}, { }, {
'video': { 'video': {
'language': LanguageRule('video language'), 'guessed': GuessTitleRule('guessed properties', private=True),
'language': LanguageRule('video language', override=True),
'resolution': ResolutionRule('video resolution'), 'resolution': ResolutionRule('video resolution'),
}, },
'audio': { 'audio': {
'language': LanguageRule('audio language'), 'guessed': GuessTitleRule('guessed properties', private=True),
'language': LanguageRule('audio language', override=True),
'channels': AudioChannelsRule('audio channels'), 'channels': AudioChannelsRule('audio channels'),
}, },
'subtitle': { 'subtitle': {
'language': LanguageRule('subtitle language'), 'guessed': GuessTitleRule('guessed properties', private=True),
'hearing_impaired': HearingImpairedRule('subtitle hearing impaired'), 'language': LanguageRule('subtitle language', override=True),
'closed_caption': ClosedCaptionRule('closed caption'), 'hearing_impaired': HearingImpairedRule('subtitle hearing impaired', override=True),
'closed_caption': ClosedCaptionRule('closed caption', override=True),
}, },
}) })
self.executor = FFmpegExecutor.get_executor_instance(suggested_path) self.executor = FFmpegExecutor.get_executor_instance(suggested_path)

@ -1,5 +1,6 @@
import ctypes
import json import json
import os
import re import re
from ctypes import c_void_p, c_wchar_p from ctypes import c_void_p, c_wchar_p
from decimal import Decimal from decimal import Decimal
@ -43,6 +44,7 @@ from knowit.rules import (
LanguageRule, LanguageRule,
ResolutionRule, ResolutionRule,
) )
from knowit.rules.general import GuessTitleRule
from knowit.units import units from knowit.units import units
from knowit.utils import ( from knowit.utils import (
define_candidate, define_candidate,
@ -77,7 +79,7 @@ class MediaInfoExecutor:
locations = { locations = {
'unix': ('/usr/local/mediainfo/lib', '/usr/local/mediainfo/bin', '__PATH__'), 'unix': ('/usr/local/mediainfo/lib', '/usr/local/mediainfo/bin', '__PATH__'),
'windows': ('__PATH__', ), 'windows': ('C:\\Program Files\\MediaInfo', 'C:\\Program Files (x86)\\MediaInfo', '__PATH__'),
'macos': ('__PATH__', ), 'macos': ('__PATH__', ),
} }
@ -121,12 +123,28 @@ class MediaInfoCliExecutor(MediaInfoExecutor):
} }
def _execute(self, filename): def _execute(self, filename):
return json.loads(check_output([self.location, '--Output=JSON', '--Full', filename]).decode()) data = check_output([self.location, '--Output=JSON', '--Full', filename]).decode()
return json.loads(data) if data else {}
@classmethod
def _is_gui_exe(cls, candidate: str):
if not candidate.endswith('MediaInfo.exe') or not os.path.isfile(candidate):
return False
try:
shell32 = ctypes.WinDLL('shell32', use_last_error=True) # type: ignore
return bool(shell32.ExtractIconExW(candidate, 0, None, None, 1))
except Exception:
return False
@classmethod @classmethod
def create(cls, os_family=None, suggested_path=None): def create(cls, os_family=None, suggested_path=None):
"""Create the executor instance.""" """Create the executor instance."""
for candidate in define_candidate(cls.locations, cls.names, os_family, suggested_path): for candidate in define_candidate(cls.locations, cls.names, os_family, suggested_path):
if cls._is_gui_exe(candidate):
continue
try: try:
output = check_output([candidate, '--version']).decode() output = check_output([candidate, '--version']).decode()
version = cls._get_version(output) version = cls._get_version(output)
@ -154,7 +172,9 @@ class MediaInfoCTypesExecutor(MediaInfoExecutor):
def _execute(self, filename): def _execute(self, filename):
# Create a MediaInfo handle # Create a MediaInfo handle
return json.loads(MediaInfo.parse(filename, library_file=self.location, output='JSON')) data = MediaInfo.parse(filename, library_file=self.location, output='JSON')
return json.loads(data) if data else {}
@classmethod @classmethod
def create(cls, os_family=None, suggested_path=None): def create(cls, os_family=None, suggested_path=None):
@ -254,19 +274,22 @@ class MediaInfoProvider(Provider):
}, },
}, { }, {
'video': { 'video': {
'language': LanguageRule('video language'), 'guessed': GuessTitleRule('guessed properties', private=True),
'language': LanguageRule('video language', override=True),
'resolution': ResolutionRule('video resolution'), 'resolution': ResolutionRule('video resolution'),
}, },
'audio': { 'audio': {
'language': LanguageRule('audio language'), 'guessed': GuessTitleRule('guessed properties', private=True),
'language': LanguageRule('audio language', override=True),
'channels': AudioChannelsRule('audio channels'), 'channels': AudioChannelsRule('audio channels'),
'_atmosrule': AtmosRule(config, 'atmos rule'), 'atmos': AtmosRule(config, 'atmos rule', private=True),
'_dtshdrule': DtsHdRule(config, 'dts-hd rule'), 'dtshd': DtsHdRule(config, 'dts-hd rule', private=True),
}, },
'subtitle': { 'subtitle': {
'language': LanguageRule('subtitle language'), 'guessed': GuessTitleRule('guessed properties', private=True),
'hearing_impaired': HearingImpairedRule('subtitle hearing impaired'), 'language': LanguageRule('subtitle language', override=True),
'closed_caption': ClosedCaptionRule('closed caption'), 'hearing_impaired': HearingImpairedRule('subtitle hearing impaired', override=True),
'closed_caption': ClosedCaptionRule('closed caption', override=True),
} }
}) })
self.executor = MediaInfoExecutor.get_executor_instance(suggested_path) self.executor = MediaInfoExecutor.get_executor_instance(suggested_path)

@ -28,6 +28,7 @@ from knowit.rules import (
LanguageRule, LanguageRule,
ResolutionRule, ResolutionRule,
) )
from knowit.rules.general import GuessTitleRule
from knowit.serializer import get_json_encoder from knowit.serializer import get_json_encoder
from knowit.units import units from knowit.units import units
from knowit.utils import define_candidate, detect_os from knowit.utils import define_candidate, detect_os
@ -67,7 +68,7 @@ class MkvMergeExecutor:
def extract_info(self, filename): def extract_info(self, filename):
"""Extract media info.""" """Extract media info."""
json_dump = self._execute(filename) json_dump = self._execute(filename)
return json.loads(json_dump) return json.loads(json_dump) if json_dump else {}
def _execute(self, filename): def _execute(self, filename):
raise NotImplementedError raise NotImplementedError
@ -166,17 +167,20 @@ class MkvMergeProvider(Provider):
}, },
}, { }, {
'video': { 'video': {
'guessed': GuessTitleRule('guessed properties', private=True),
'language': LanguageRule('video language', override=True), 'language': LanguageRule('video language', override=True),
'resolution': ResolutionRule('video resolution'), 'resolution': ResolutionRule('video resolution'),
}, },
'audio': { 'audio': {
'guessed': GuessTitleRule('guessed properties', private=True),
'language': LanguageRule('audio language', override=True), 'language': LanguageRule('audio language', override=True),
'channels': AudioChannelsRule('audio channels'), 'channels': AudioChannelsRule('audio channels'),
}, },
'subtitle': { 'subtitle': {
'guessed': GuessTitleRule('guessed properties', private=True),
'language': LanguageRule('subtitle language', override=True), 'language': LanguageRule('subtitle language', override=True),
'hearing_impaired': HearingImpairedRule('subtitle hearing impaired'), 'hearing_impaired': HearingImpairedRule('subtitle hearing impaired', override=True),
'closed_caption': ClosedCaptionRule('closed caption'), 'closed_caption': ClosedCaptionRule('closed caption', override=True),
} }
}) })
self.executor = MkvMergeExecutor.get_executor_instance(suggested_path) self.executor = MkvMergeExecutor.get_executor_instance(suggested_path)

@ -1,8 +1,6 @@
import re
from logging import NullHandler, getLogger from logging import NullHandler, getLogger
import babelfish from trakit.api import trakit
from knowit.core import Rule from knowit.core import Rule
@ -10,22 +8,27 @@ logger = getLogger(__name__)
logger.addHandler(NullHandler()) logger.addHandler(NullHandler())
class GuessTitleRule(Rule):
    """Rule that guesses track properties from the track name."""

    def execute(self, props, pv_props, context):
        """Run trakit on the track name and return what it guessed.

        When the track already has a language, it is handed to trakit as
        ``expected_language``. Returns ``None`` when the track has no name
        or when trakit guesses nothing.
        """
        if 'name' not in props:
            return None

        options = {}
        known_language = props.get('language')
        if known_language:
            options['expected_language'] = known_language

        guess = trakit(props['name'], options)
        return guess if guess else None
class LanguageRule(Rule): class LanguageRule(Rule):
"""Language rules.""" """Language rules."""
name_re = re.compile(r'(?P<name>\w+)\b', re.IGNORECASE)
def execute(self, props, pv_props, context): def execute(self, props, pv_props, context):
"""Language detection using name.""" """Language detection using name."""
if 'language' in props: if 'guessed' not in pv_props:
return return
if 'name' in props: guess = pv_props['guessed']
name = props.get('name', '') if 'language' in guess:
match = self.name_re.match(name) return guess['language']
if match:
try:
return babelfish.Language.fromname(match.group('name'))
except babelfish.Error:
pass
logger.info('Invalid %s: %r', self.description, name)

@ -10,18 +10,19 @@ class ClosedCaptionRule(Rule):
def execute(self, props, pv_props, context): def execute(self, props, pv_props, context):
"""Execute closed caption rule.""" """Execute closed caption rule."""
for name in (pv_props.get('_closed_caption'), props.get('name')): if '_closed_caption' in pv_props and self.cc_re.search(pv_props['_closed_caption']):
if name and self.cc_re.search(name): return True
return True
if 'guessed' in pv_props:
guessed = pv_props['guessed']
return guessed.get('closed_caption')
class HearingImpairedRule(Rule): class HearingImpairedRule(Rule):
"""Hearing Impaired rule.""" """Hearing Impaired rule."""
hi_re = re.compile(r'(\bsdh\b)', re.IGNORECASE)
def execute(self, props, pv_props, context): def execute(self, props, pv_props, context):
"""Hearing Impaired.""" """Hearing Impaired."""
name = props.get('name') if 'guessed' in pv_props:
if name and self.hi_re.search(name): guessed = pv_props['guessed']
return True return guessed.get('hearing_impaired')

@ -1,10 +1,5 @@
import typing import typing
try:
import pint
except ImportError:
pint = False
class NullRegistry: class NullRegistry:
"""A NullRegistry that masquerades as a pint.UnitRegistry.""" """A NullRegistry that masquerades as a pint.UnitRegistry."""
@ -25,9 +20,18 @@ class NullRegistry:
def _build_unit_registry(): def _build_unit_registry():
registry = pint.UnitRegistry() if pint else NullRegistry() try:
registry.define('FPS = 1 * hertz') import pint
return registry
registry = pint.UnitRegistry()
registry.define('FPS = 1 * hertz')
pint.set_application_registry(registry)
return registry
except ModuleNotFoundError:
pass
return NullRegistry()
units = _build_unit_registry() units = _build_unit_registry()

@ -386,7 +386,7 @@ class MediaInfo:
A higher value will yield more precise results in some cases A higher value will yield more precise results in some cases
but will also increase parsing time. but will also increase parsing time.
:param bool full: display additional tags, including computer-readable values :param bool full: display additional tags, including computer-readable values
for sizes and durations. for sizes and durations, corresponds to the CLI's ``--Full``/``-f`` parameter.
:param bool legacy_stream_display: display additional information about streams. :param bool legacy_stream_display: display additional information about streams.
:param dict mediainfo_options: additional options that will be passed to the :param dict mediainfo_options: additional options that will be passed to the
`MediaInfo_Option` function, for example: ``{"Language": "raw"}``. `MediaInfo_Option` function, for example: ``{"Language": "raw"}``.

@ -0,0 +1,8 @@
# Package metadata: name, full and short version, author, license and home page.
__title__ = 'trakit'
__version__ = '0.2.1'
__short_version__ = '0.2'
__author__ = 'RatoAQ'
__license__ = 'MIT'
__url__ = 'https://github.com/ratoaq2/trakit'
from .api import TrakItApi, trakit

@ -0,0 +1,108 @@
import argparse
import json
import logging
import sys
import typing
import babelfish
from trakit import TrakItApi, __version__
# Send log records to stdout, printing only the message text.
logging.basicConfig(stream=sys.stdout, format='%(message)s')
# 'CONSOLE' carries the command-line output (INFO and above); 'trakit' starts
# at WARNING and is lowered to DEBUG by main() when --debug is given.
logging.getLogger('CONSOLE').setLevel(logging.INFO)
logging.getLogger('trakit').setLevel(logging.WARNING)
console = logging.getLogger('CONSOLE')
logger = logging.getLogger('trakit')
def build_argument_parser() -> argparse.ArgumentParser:
    """Create the command-line parser.

    The parser takes one positional ``value`` (the track title to guess)
    and three option groups: *Configuration* (``-l/--expected-language``),
    *Output* (``--debug``, ``-y/--yaml``) and *Information* (``--version``).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(dest='value', type=str, help='track title to guess')

    configuration = parser.add_argument_group('Configuration')
    configuration.add_argument(
        '-l', '--expected-language',
        dest='expected_language',
        type=str,
        help='The expected language to be guessed',
    )

    output = parser.add_argument_group('Output')
    output.add_argument(
        '--debug',
        dest='debug',
        action='store_true',
        help='Print information for debugging trakit and for reporting bugs.',
    )
    output.add_argument(
        '-y', '--yaml',
        dest='yaml',
        action='store_true',
        help='Display output in yaml format',
    )

    information = parser.add_argument_group('Information')
    information.add_argument('--version', action='version', version=__version__)

    return parser
def _as_yaml(value: str, info: typing.Mapping[str, typing.Any]) -> str:
    """Convert info to string using YAML format.

    The document is a single mapping whose only key is ``value`` and whose
    content is ``info``; keys keep their order and non-ASCII characters are
    written as-is. ``babelfish.Language`` values are written as plain strings.

    :param value: the title that was guessed, used as the top-level key.
    :param info: the guessed properties.
    :return: the YAML text.
    """
    import yaml

    class _LanguageDumper(yaml.SafeDumper):
        """SafeDumper that can also write ``babelfish.Language`` values."""

    def represent_as_str(dumper: yaml.representer.SafeRepresenter, data: typing.Any):
        return dumper.represent_scalar('tag:yaml.org,2002:str', str(data))

    # Register on a private subclass so that PyYAML's shared SafeRepresenter
    # is not modified for every other user of yaml in the process.
    _LanguageDumper.add_representer(babelfish.Language, represent_as_str)

    return yaml.dump({value: dict(info)}, Dumper=_LanguageDumper, allow_unicode=True, sort_keys=False)
def _as_json(info: typing.Mapping[str, typing.Any]) -> str:
"""Convert info to string using JSON format."""
return json.dumps(info, ensure_ascii=False, indent=2, default=str)
def dump(value: str, info: typing.Mapping[str, typing.Any], opts: argparse.Namespace) -> str:
    """Render ``info`` as text: YAML when ``opts.yaml`` is set, JSON otherwise.

    ``value`` is only passed to the YAML renderer.
    """
    return _as_yaml(value, info) if opts.yaml else _as_json(info)
def trakit(value: str, opts: argparse.Namespace) -> typing.Mapping:
    """Guess the properties of a track title and log the result.

    The title being parsed is announced unless YAML output was requested.
    Every option in ``opts`` that is not ``None`` is forwarded to a fresh
    ``TrakItApi``. The rendered result is logged on the console logger and
    the guessed mapping is returned.
    """
    if not opts.yaml:
        console.info('Parsing: %s', value)

    options = {}
    for key, option in vars(opts).items():
        if option is not None:
            options[key] = option

    info = TrakItApi().trakit(value, options)

    console.info('TrakIt %s found: ', __version__)
    console.info(dump(value, info, opts))

    return info
def main(args: typing.Optional[typing.List[str]] = None):
    """Execute main function for entry point.

    :param args: command-line arguments without the program name; ``None``
        means "use ``sys.argv[1:]``".
    :return: the mapping returned by ``trakit`` for the parsed title.
    """
    argument_parser = build_argument_parser()
    # Fall back to the process arguments only when none were supplied at all:
    # an explicitly empty list must not be silently replaced by sys.argv.
    if args is None:
        args = sys.argv[1:]
    opts = argument_parser.parse_args(args)

    if opts.debug:
        logger.setLevel(logging.DEBUG)
        logging.getLogger('rebulk').setLevel(logging.DEBUG)

    return trakit(opts.value, opts)
if __name__ == '__main__':
main(sys.argv[1:])

@ -0,0 +1,24 @@
import typing
from trakit.config import Config
from trakit.context import Context
from trakit.patterns import configure
class TrakItApi:
    """Guess properties from a track title using a configured matcher."""

    def __init__(self, config: typing.Optional[typing.Mapping[str, typing.Any]] = None):
        """Wrap ``config`` in a ``Config`` and build the matcher with ``configure``."""
        settings = Config(config)
        self.rebulk = configure(settings)

    def trakit(self, string: str, options: typing.Optional[typing.Mapping[str, typing.Any]] = None):
        """Return a mapping of the information extracted from ``string``.

        ``options`` is wrapped in a ``Context`` and handed to the matcher.
        """
        context = Context(options)
        found = self.rebulk.matches(string, context)
        guess: typing.Mapping[str, typing.Any] = found.to_dict()
        return guess
default_api = TrakItApi()
def trakit(string: str, options: typing.Optional[typing.Mapping[str, typing.Any]] = None):
    """Guess the properties of ``string`` using the module-level ``default_api``."""
    guess = default_api.trakit(string, options)
    return guess

@ -0,0 +1,19 @@
import json
import typing
from pkg_resources import resource_stream
class Config:
    """Settings read from the packaged ``data/config.json``, with optional overrides."""

    def __init__(self, config: typing.Optional[typing.Mapping[str, typing.Any]]):
        """Load the packaged settings and overlay ``config`` on top of them.

        The overlay is a plain ``dict.update``: a top-level key present in
        ``config`` replaces the packaged value for that key entirely.
        Sections missing from the result default to empty.
        """
        with resource_stream('trakit', 'data/config.json') as stream:
            settings: typing.Dict[str, typing.Any] = json.load(stream)
        if config:
            settings.update(config)

        def section(key: str) -> typing.Mapping[str, str]:
            return settings.get(key, {})

        self.ignored: typing.Set[str] = set(settings.get('ignored', []))
        self.countries: typing.Mapping[str, str] = section('countries')
        self.languages: typing.Mapping[str, str] = section('languages')
        self.scripts: typing.Mapping[str, str] = section('scripts')
        self.regions: typing.Mapping[str, str] = section('regions')
        self.implicit_languages: typing.Mapping[str, str] = section('implicit-languages')

@ -0,0 +1,22 @@
import typing
import babelfish
class Context(dict):
    """Guessing options, with the expected language parsed for convenience.

    Behaves as a plain ``dict`` of the given options. In addition,
    ``expected_language`` exposes the ``'expected_language'`` option as a
    ``babelfish.Language``, or ``None`` when the option is missing or empty.
    """

    def __init__(self, options: typing.Optional[typing.Mapping[str, typing.Any]] = None):
        """Store ``options`` and parse the expected language, if any.

        A value that is not already a ``babelfish.Language`` is converted to
        ``str`` and parsed as an IETF code.
        """
        super().__init__(options or {})
        language = self.get('expected_language')
        if language is not None and not isinstance(language, babelfish.Language):
            # An empty value (e.g. '') means "no expectation"; keeping it
            # would make accept() fail when it reads ``alpha3`` from it.
            language = babelfish.Language.fromietf(str(language)) if language else None
        self.expected_language: typing.Optional[babelfish.Language] = language

    def accept(self, lang: 'babelfish.Language') -> bool:
        """Tell whether ``lang`` is compatible with the expected language.

        Without an expected language everything is accepted. Otherwise the
        language code must match, and the script and country must match too
        whenever the expected language specifies them.
        """
        expected = self.expected_language
        if expected is None:
            return True

        if expected.alpha3 != lang.alpha3:
            return False

        # Compare script with script and country with country. Comparing the
        # whole expected language against them can never match, which made an
        # expected language carrying a script or a country reject everything.
        if expected.script and expected.script != lang.script:
            return False

        return not expected.country or expected.country == lang.country

@ -0,0 +1,32 @@
import typing
from babelfish import Country, CountryReverseConverter, CountryReverseError
from babelfish.converters import CaseInsensitiveDict
class GuessCountryConverter(CountryReverseConverter):
    """Country converter that resolves configured synonyms and country names."""

    def __init__(self, config: typing.Mapping[str, str]):
        """Keep the configured synonyms in a ``CaseInsensitiveDict``."""
        self.synonyms = CaseInsensitiveDict(config)

    def convert(self, alpha2):
        """Return the string form of the country for ``alpha2``."""
        country = Country(alpha2)
        return str(country)

    def reverse(self, name: str):
        """Resolve ``name`` to a country code.

        Tried in order: the configured synonyms, ``name`` itself as a country
        code when it is two upper-case characters, then ``Country.fromname``.

        :raises CountryReverseError: when none of them resolves ``name``.
        """
        try:
            return self.synonyms[name]
        except KeyError:
            pass

        looks_like_code = len(name) == 2 and name.isupper()
        if looks_like_code:
            try:
                return Country(name).alpha2
            except ValueError:
                pass

        try:
            return Country.fromname(name).alpha2
        except CountryReverseError:
            pass

        raise CountryReverseError(name)

@ -0,0 +1,30 @@
import typing
from babelfish import Language, LanguageReverseConverter, LanguageReverseError
from babelfish.converters import CaseInsensitiveDict
class GuessLanguageConverter(LanguageReverseConverter):
    """Language converter that resolves configured synonyms and language names."""

    def __init__(self, config: typing.Mapping[str, str]):
        """Pre-parse every configured synonym.

        Codes containing ``'-'`` are parsed with ``Language.fromietf``, the
        others with ``Language``. Each synonym is stored as a
        ``(alpha3, country alpha2 or None, script)`` tuple.
        """
        self.synonyms = CaseInsensitiveDict()
        for synonym, code in config.items():
            if '-' in code:
                lang = Language.fromietf(code)
            else:
                lang = Language(code)
            country_code = lang.country.alpha2 if lang.country else None
            self.synonyms[synonym] = (lang.alpha3, country_code, lang.script)

    def convert(self, alpha3: str, country=None, script=None):
        """Return the string form of the language built from the given parts."""
        language = Language(alpha3, country, script)
        return str(language)

    def reverse(self, name: str):
        """Resolve ``name`` to an ``(alpha3, country, script)`` tuple.

        The configured synonyms are tried first, then ``Language.fromname``.

        :raises LanguageReverseError: when neither resolves ``name``.
        """
        try:
            return self.synonyms[name]
        except KeyError:
            pass

        try:
            found = Language.fromname(name)
            return found.alpha3, found.country, found.script
        except (ValueError, LanguageReverseError):
            pass

        raise LanguageReverseError(name)

@ -0,0 +1,860 @@
{
"countries": {
"Afghan": "AF",
"Aforika Borwa": "ZA",
"Afrika Borwa": "ZA",
"Afrika Dzonga": "ZA",
"Afurika Tshipembe": "ZA",
"Aland": "AX",
"Alandish": "AX",
"Albanian": "AL",
"Algerian": "DZ",
"American": "US",
"American Islander": "UM",
"American Samoan": "AS",
"American Virgin Islander": "VI",
"Andorran": "AD",
"Angolan": "AO",
"Anguillian": "AI",
"Antarctican": "AQ",
"Antiguan Barbudan": "AG",
"Ao Men": "MO",
"Aotearoa": "NZ",
"Argentine": "AR",
"Armenian": "AM",
"Aruban": "AW",
"Australian": "AU",
"Austrian": "AT",
"Ayiti": "HT",
"Azerbaidzhan": "AZ",
"Azerbaijani": "AZ",
"Azərbaycan": "AZ",
"Bahamian": "BS",
"Bahraini": "BH",
"Bangladeshi": "BD",
"Barbadian": "BB",
"Beafrika": "CF",
"Belarusian": "BY",
"Belau": "PW",
"Belgian": "BE",
"Belgie": "BE",
"Belgien": "BE",
"Belgique": "BE",
"België": "BE",
"Belice": "BZ",
"Belizean": "BZ",
"Beninese": "BJ",
"Bermudian": "BM",
"Bhutanese": "BT",
"Blgariia": "BG",
"Bolivia": "BO",
"Bolivian": "BO",
"Boneiru Sint Eustatius y Saba": "BQ",
"Bosna i Hercegovina": "BA",
"Bosna i Khertsegovina": "BA",
"Bosnian Herzegovinian": "BA",
"Bouvetoya": "BV",
"Bouvetøya": "BV",
"Brasil": "BR",
"Brazilian": "BR",
"British": "GB",
"British Virgin Islander": "VG",
"British Virgin Islands": "VG",
"Bruneian": "BN",
"Bulgarian": "BG",
"Buliwya": "BO",
"Burkinabe": "BF",
"Burmese": "MM",
"Burundian": "BI",
"Bénin": "BJ",
"Bêafrîka": "CF",
"Cabo Verde": "CV",
"Cambodian": "KH",
"Cameroonian": "CM",
"Cameroun": "CM",
"Canadian": "CA",
"Cape Verdian": "CV",
"Caribisch Nederland": "BQ",
"Caymanian": "KY",
"Central African": "CF",
"Cesko": "CZ",
"Chadian": "TD",
"Channel Islander": "JE",
"Chilean": "CL",
"Chinese": "CN",
"Christmas Islander": "CX",
"Cocos Islander": "CC",
"Cocos Keeling Islands": "CC",
"Colombian": "CO",
"Comoran": "KM",
"Comores": "KM",
"Congolese": "CD",
"Cook Islander": "CK",
"Costa Rican": "CR",
"Cote dIvoire": "CI",
"Croatian": "HR",
"Cuban": "CU",
"Curacao": "CW",
"Curacaoan": "CW",
"Curaçaoan": "CW",
"Cypriot": "CY",
"Czech": "CZ",
"Côte dIvoire": "CI",
"Danish": "DK",
"Danmark": "DK",
"Deutschland": "DE",
"Dgernesiais": "GG",
"Dgèrnésiais": "GG",
"Ditunga dia Kongu wa Mungalaata": "CD",
"Dominican": "DO",
"Dutch": "NL",
"East Timorese": "TL",
"Ecuadorean": "EC",
"Eesti": "EE",
"Egyptian": "EG",
"Eire": "IE",
"Ellada": "GR",
"Emirati": "AE",
"Equatorial Guinean": "GQ",
"Eritrean": "ER",
"Espana": "ES",
"España": "ES",
"Estados Unidos": "US",
"Estonian": "EE",
"Eswatini": "SZ",
"Ethiopian": "ET",
"Faereyjar": "FO",
"Faeroerne": "FO",
"Falkland Islander": "FK",
"Falkland Islands": "FK",
"Faroese": "FO",
"Fijian": "FJ",
"Filipino": "PH",
"Finnish": "FI",
"Foroyar": "FO",
"French": "FR",
"French Polynesian": "PF",
"Færeyjar": "FO",
"Færøerne": "FO",
"Føroyar": "FO",
"Gabonese": "GA",
"Gambian": "GM",
"Georgian": "GE",
"German": "DE",
"Ghanaian": "GH",
"Greek": "GR",
"Greenlandic": "GL",
"Grenadian": "GD",
"Guadeloupian": "GP",
"Guahan": "GU",
"Guamanian": "GU",
"Guatemalan": "GT",
"Guernesey": "GG",
"Guianan": "GF",
"Guine Bissau": "GW",
"Guine Equatorial": "GQ",
"Guinea Bissauan": "GW",
"Guinea Ecuatorial": "GQ",
"Guinean": "GN",
"Guinee": "GN",
"Guinee equatoriale": "GQ",
"Guiné Bissau": "GW",
"Guiné Equatorial": "GQ",
"Guinée": "GN",
"Guinée équatoriale": "GQ",
"Guyane francaise": "GF",
"Guyane française": "GF",
"Guyanese": "GY",
"Guåhån": "GU",
"Haitian": "HT",
"Hayastan": "AM",
"Haïti": "HT",
"Heard and McDonald Islander": "HM",
"Honduran": "HN",
"Hong Konger": "HK",
"Hrvatska": "HR",
"Hungarian": "HU",
"I Kiribati": "KI",
"Icelander": "IS",
"Indian": "IN",
"Indonesian": "ID",
"Iranian": "IR",
"Iraqi": "IQ",
"Irish": "IE",
"Island": "IS",
"Israeli": "IL",
"Italia": "IT",
"Italian": "IT",
"Ivorian": "CI",
"Jamaican": "JM",
"Jamhuri ya Kidemokrasia ya Kongo": "CD",
"Japanese": "JP",
"Jerri": "JE",
"Jordanian": "JO",
"Jèrri": "JE",
"Kalaallit Nunaat": "GL",
"Kampuchea": "KH",
"Kazakhstani": "KZ",
"Kazakstan": "KZ",
"Kenyan": "KE",
"Kibris": "CY",
"Kirghiz": "KG",
"Kirgiziia": "KG",
"Kittitian or Nevisian": "KN",
"Komori": "KM",
"Kuki Airani": "CK",
"Kupros": "CY",
"Kuwaiti": "KW",
"Kâmpŭchéa": "KH",
"Kıbrıs": "CY",
"Kūki Āirani": "CK",
"La Reunion": "RE",
"La Réunion": "RE",
"Laotian": "LA",
"Latvian": "LV",
"Latvija": "LV",
"Lebanese": "LB",
"Letzebuerg": "LU",
"Liban": "LB",
"Liberian": "LR",
"Libyan": "LY",
"Liechtensteiner": "LI",
"Lietuva": "LT",
"Lithuanian": "LT",
"Luxembourger": "LU",
"Luxemburg": "LU",
"Lëtzebuerg": "LU",
"Macanese": "MO",
"Macau": "MO",
"Macedonian": "MK",
"Madagasikara": "MG",
"Magyarorszag": "HU",
"Magyarország": "HU",
"Mahoran": "YT",
"Majel": "MH",
"Makedonija": "MK",
"Makedonski": "MK",
"Malagasy": "MG",
"Malawian": "MW",
"Malaysian": "MY",
"Malaŵi": "MW",
"Maldivan": "MV",
"Malian": "ML",
"Maltese": "MT",
"Mannin": "IM",
"Manx": "IM",
"Marshallese": "MH",
"Martinican": "MQ",
"Maurice": "MU",
"Mauritanian": "MR",
"Mauritian": "MU",
"Mexican": "MX",
"Micronesia": "FM",
"Micronesian": "FM",
"Mocambique": "MZ",
"Moldova": "MD",
"Moldovan": "MD",
"Monegasque": "MC",
"Mongol uls": "MN",
"Mongolian": "MN",
"Montenegrin": "ME",
"Montserratian": "MS",
"Moris": "MU",
"Moroccan": "MA",
"Mosotho": "LS",
"Motswana": "BW",
"Mozambican": "MZ",
"Moçambique": "MZ",
"Mzantsi Afrika": "ZA",
"México": "MX",
"M̧ajeļ": "MH",
"Na Islas Marianas": "MP",
"Na Islas Mariånas": "MP",
"Namibian": "NA",
"Namibie": "NA",
"Namibië": "NA",
"Nauruan": "NR",
"Nederland": "NL",
"Negara Brunei Darussalam": "BN",
"Nepalese": "NP",
"New Caledonian": "NC",
"New Zealander": "NZ",
"Ni Vanuatu": "VU",
"Nicaraguan": "NI",
"Nigerian": "NG",
"Nigerien": "NE",
"Ningizimu Afrika": "ZA",
"Niuean": "NU",
"Niuē": "NU",
"Noreg": "NO",
"Norfk Ailen": "NF",
"Norfolk Islander": "NF",
"Norge": "NO",
"Norgga": "NO",
"North Korean": "KP",
"Norwegian": "NO",
"Nouvelle Caledonie": "NC",
"Nouvelle Calédonie": "NC",
"Omani": "OM",
"Osterreich": "AT",
"Owganystan": "AF",
"Ozbekiston": "UZ",
"Ozbekiston": "UZ",
"Pais Korsou": "CW",
"Pais Kòrsou": "CW",
"Pakistani": "PK",
"Palauan": "PW",
"Palestinian": "PS",
"Panamanian": "PA",
"Panamá": "PA",
"Papua New Guinean": "PG",
"Papua Niu Gini": "PG",
"Papua Niugini": "PG",
"Paraguai": "PY",
"Paraguayan": "PY",
"Paraguái": "PY",
"Peruvian": "PE",
"Perú": "PE",
"Pilipinas": "PH",
"Piruw": "PE",
"Pitcairn Islander": "PN",
"Pitcairn Islands": "PN",
"Polish": "PL",
"Polska": "PL",
"Polynesie francaise": "PF",
"Polynésie française": "PF",
"Portuguese": "PT",
"Puerto Rican": "PR",
"Qatari": "QA",
"RD Congo": "CD",
"Repubilika ya Kongo": "CG",
"Repubilika ya Kongo Demokratiki": "CD",
"Republica Dominicana": "DO",
"Republiki ya Kongo": "CG",
"Republiki ya Kongo Demokratiki": "CD",
"Republiki ya Kongó Demokratiki": "CD",
"Republique centrafricaine": "CF",
"Republique du Congo": "CG",
"Republíki ya Kongó": "CG",
"República Dominicana": "DO",
"Reunionese": "RE",
"Ri Ben": "JP",
"Romanian": "RO",
"România": "RO",
"Rossiia": "RU",
"Russian": "RU",
"Rwandan": "RW",
"République centrafricaine": "CF",
"République du Congo": "CG",
"Réunionese": "RE",
"Sahara Occidental": "EH",
"Sahrawi": "EH",
"Saint Barthelemy": "BL",
"Saint Barthelemy Islander": "BL",
"Saint Barthélemy Islander": "BL",
"Saint Helena Ascension and Tristan da Cunha": "SH",
"Saint Helenian": "SH",
"Saint Lucian": "LC",
"Saint Martin": "MF",
"Saint Martin Islander": "MF",
"Saint Pierrais Miquelonnais": "PM",
"Saint Pierre et Miquelon": "PM",
"Saint Vincentian": "VC",
"Salvadoran": "SV",
"Sammarinese": "SM",
"Samoa Amelika": "AS",
"Samoan": "WS",
"Sao Tome e Principe": "ST",
"Sao Tomean": "ST",
"Saudi Arabian": "SA",
"Schweiz": "CH",
"Senegalese": "SN",
"Serbian": "RS",
"Sesel": "SC",
"Sewula Afrika": "ZA",
"Seychellois": "SC",
"Shqiperia": "AL",
"Shqipëria": "AL",
"Sierra Leonean": "SL",
"Singaporean": "SG",
"Singapura": "SG",
"Sint Maarten": "SX",
"Slovak": "SK",
"Slovene": "SI",
"Slovenija": "SI",
"Slovensko": "SK",
"Solomon Islander": "SB",
"Somali": "SO",
"Soomaaliya": "SO",
"South African": "ZA",
"South Georgia": "GS",
"South Georgian South Sandwich Islander": "GS",
"South Korean": "KR",
"South Sudanese": "SS",
"Spanish": "ES",
"Srbija": "RS",
"Sri Lankan": "LK",
"St Maartener": "SX",
"Sudanese": "SD",
"Suisse": "CH",
"Suomi": "FI",
"Surinamer": "SR",
"Svalbard og Jan Mayen": "SJ",
"Sverige": "SE",
"Svizra": "CH",
"Svizzera": "CH",
"Swazi": "SZ",
"Swedish": "SE",
"Swiss": "CH",
"Syrian": "SY",
"São Tomé e Príncipe": "ST",
"Sénégal": "SN",
"Sāmoa": "WS",
"Sāmoa Amelika": "AS",
"Tadzhik": "TJ",
"Tadzhikistan": "TJ",
"Tai Wan": "TW",
"Taiwanese": "TW",
"Tanzania": "TZ",
"Tanzanian": "TZ",
"Tchad": "TD",
"Terres australes et antarctiques francaises": "TF",
"Terres australes et antarctiques françaises": "TF",
"Thai": "TH",
"Timor Leste": "TL",
"Timór Leste": "TL",
"Tochikiston": "TJ",
"Togolese": "TG",
"Tokelauan": "TK",
"Tongan": "TO",
"Trinidadian": "TT",
"Tsrna Gora": "ME",
"Tunisian": "TN",
"Turkish": "TR",
"Turkiye": "TR",
"Turkmen": "TM",
"Turkmeniia": "TM",
"Turks and Caicos Islander": "TC",
"Tuvaluan": "TV",
"Türkiye": "TR",
"Türkmenistan": "TM",
"UK": "GB",
"US": "US",
"Uburundi": "BI",
"Ugandan": "UG",
"Ukrainian": "UA",
"Ukrayina": "UA",
"United States Virgin Islands": "VI",
"Uruguayan": "UY",
"Uzbekistani": "UZ",
"Vatican": "VA",
"Vaticanae": "VA",
"Vaticano": "VA",
"Vaticanæ": "VA",
"Venezuela": "VE",
"Venezuelan": "VE",
"Vietnam": "VN",
"Vietnamese": "VN",
"Viti": "FJ",
"Việt Nam": "VN",
"Volivia": "BO",
"Volívia": "BO",
"Wallis and Futuna Islander": "WF",
"Wallis et Futuna": "WF",
"Wuliwya": "BO",
"Xiang Gang": "HK",
"Xin Jia Po": "SG",
"Yemeni": "YE",
"Zambian": "ZM",
"Zhong Guo": "CN",
"Zhong Guo Da Lu": "CN",
"Zimbabwean": "ZW",
"`mn": "OM",
"baaNlaadesh": "BD",
"bbaart nuuN": "IN",
"bhaart": "IN",
"brug-yul-": "BT",
"canadien": "CA",
"cingkppuur": "SG",
"dhivehiraajeyge": "MV",
"eSwatini": "SZ",
"eereteraa": "ER",
"fGnstn": "AF",
"flsTyn": "PS",
"hangug": "KR",
"ilngkai": "LK",
"intiyaa": "IN",
"joseon": "KP",
"jybwty": "DJ",
"khoemry": "IQ",
"lSwml": "SO",
"l`rq": "IQ",
"lbHryn": "BH",
"lbnn": "LB",
"ljzyr": "DZ",
"lkwyt": "KW",
"lmGrb": "MA",
"lqmr": "KM",
"lrdn": "JO",
"lswdn": "SD",
"lyaman": "YE",
"lyby": "LY",
"mSr": "EG",
"mlysy": "MY",
"mnmaa": "MM",
"mwrytny": "MR",
"nepaal": "NP",
"phijii": "FJ",
"pkstn": "PK",
"praethsaithy": "TH",
"qTr": "QA",
"qwutnA": "IQ",
"rtry": "ER",
"sak`art`velo": "GE",
"shrii lNkaav": "LK",
"spplaaw": "LA",
"sryyl": "IL",
"swry": "SY",
"teyopheyaa": "ET",
"tshd": "TD",
"twns": "TN",
"ySHrAl": "IL",
"yrn": "IR",
"Åland": "AX",
"Ålandish": "AX",
"Éire": "IE",
"Ísland": "IS",
"Österreich": "AT",
"Česko": "CZ",
"Ελλάδα": "GR",
"Κύπρος": "CY",
"Азербайджан": "AZ",
"Белару́сь": "BY",
"Беларусь": "BY",
"Боснa и Херцеговина": "BA",
"България": "BG",
"Казахстан": "KZ",
"Киргизия": "KG",
"Кыргызстан": "KG",
"Македонија": "MK",
"Македонски": "MK",
"Монгол улс": "MN",
"Россия": "RU",
"Србија": "RS",
"Таджикистан": "TJ",
"Тоҷикистон": "TJ",
"Туркмения": "TM",
"Узбекистан": "UZ",
"Україна": "UA",
"Црна Гора": "ME",
"Қазақстан": "KZ",
"Հայաստան": "AM",
"ישראל": "IL",
"إرتريا‎": "ER",
"إسرائيل": "IL",
"افغانستان": "AF",
"الأردن": "JO",
"البحرين": "BH",
"الجزائر": "DZ",
"السعودية": "SA",
"السودان": "SD",
"الصحراء الغربية": "EH",
"الصومال‎‎": "SO",
"العراق": "IQ",
"العربية السعودية": "SA",
"القمر‎": "KM",
"الكويت": "KW",
"المغرب": "MA",
"اليَمَن": "YE",
"ایران": "IR",
"تشاد‎": "TD",
"تونس": "TN",
"جيبوتي‎": "DJ",
"دولة الإمارات العربية المتحدة": "AE",
"سوريا": "SY",
"عمان": "OM",
"فلسطين": "PS",
"قطر": "QA",
"لبنان": "LB",
"ليبيا": "LY",
"مصر": "EG",
"مليسيا": "MY",
"موريتانيا": "MR",
"پاكستان": "PK",
"کۆماری": "IQ",
"ܩܘܼܛܢܵܐ": "IQ",
"ދިވެހިރާއްޖޭގެ": "MV",
"नेपाल": "NP",
"फिजी": "FJ",
"भारत": "IN",
"বাংলাদেশ": "BD",
"ভারত": "IN",
"ਭਾਰਤ ਨੂੰ": "IN",
"இந்தியா": "IN",
"இலங்கை": "LK",
"சிங்கப்பூர்": "SG",
"ශ්‍රී ලංකාව": "LK",
"ประเทศไทย": "TH",
"ສປປລາວ": "LA",
"འབྲུག་ཡུལ་": "BT",
"မြန်မာ": "MM",
"საქართველო": "GE",
"ኢትዮጵያ": "ET",
"ኤርትራ": "ER",
"ⵍⵎⴰⵖⵔⵉⴱ": "MA",
"中国": "CN",
"中国大陆": "CN",
"台灣": "TW",
"新加坡": "SG",
"日本": "JP",
"澳门": "MO",
"香港": "HK",
"조선": "KP",
"한국": "KR"
},
"ignored": [
"bit",
"cc",
"ch",
"dan",
"day",
"gun",
"hr",
"jordan",
"la",
"ma",
"na",
"the",
"to"
],
"implicit-languages": {
"419": "es-419",
"BR": "pt-BR",
"CA": "fr-CA",
"Cantonese": "zh",
"Castilian": "es",
"FR": "fr-FR",
"GR": "ell",
"HK": "zh-HK",
"ID": "id-ID",
"Mandarin": "zh",
"Parisian": "fr-FR",
"Simplified": "zh-Hans",
"Traditional": "zh-Hant",
"UA": "uk-UA",
"UK": "en-GB",
"US": "en-US",
"VFF": "fr-FR",
"VFQ": "fr-CA",
"VN": "vie",
"cant": "zh",
"eng": "en",
"ita": "it",
"简体双语": "zh-Hans",
"繁体双语": "zh-Hant"
},
"languages": {
"Adygebze": "ady",
"Avanee": "grn",
"Avañeẽ": "grn",
"Aymar aru": "aym",
"Azərbaycan dili": "aze",
"Bahasa Indonesia": "ind",
"Bahasa Melayu": "msa",
"Basa Jawa": "jav",
"Basa Sunda": "sun",
"Belaruskaia": "bel",
"Blgarski": "bul",
"Bosanski": "bos",
"Brezhoneg": "bre",
"Catala": "cat",
"Català": "cat",
"Cestina": "ces",
"Cymraeg": "cym",
"Dansk": "dan",
"Davvisamegiella": "sme",
"Davvisámegiella": "sme",
"Deutsch": "deu",
"Dolnoserbscina": "dsb",
"Dolnoserbšćina": "dsb",
"Eesti": "est",
"Ellenika": "ell",
"Espanol": "spa",
"Espanol Latinoamerica": "es-419",
"Español": "spa",
"Español Latinoamérica": "es-419",
"Euskara": "eus",
"Foroyskt": "fao",
"Francais": "fra",
"Français": "fra",
"Frysk": "fry",
"Føroyskt": "fao",
"Gaeilge": "gle",
"Gaelg": "glv",
"Gaidhlig": "gla",
"Galego": "glg",
"Greek": "ell",
"Guang Dong Hua ": "zho",
"Gàidhlig": "gla",
"Hayeren": "hye",
"Hornjoserbscina": "hsb",
"Hornjoserbšćina": "hsb",
"Hrvatski": "hrv",
"Islenska": "isl",
"Italiano": "ita",
"Kazaksha": "kaz",
"Kernewek": "cor",
"Kiswahili": "swa",
"Kreyol": "hat",
"Kreyòl": "hat",
"Kurdi": "kur",
"Kurdî": "kur",
"Latviesu": "lav",
"Latviešu": "lav",
"Lemborgs": "lim",
"Letzebuergesch": "ltz",
"Lietuviu": "lit",
"Lietuvių": "lit",
"Lwo": "ach",
"Lèmbörgs": "lim",
"Lëtzebuergesch": "ltz",
"Magyar": "hun",
"Makedonski": "mkd",
"Malay": "msa",
"Malti": "mlt",
"Maya Kaqchikel": "cak",
"Melayu": "msa",
"Mongol": "mon",
"Nederlands": "nld",
"Norsk": "nor",
"Norsk bokmal": "nob",
"Norsk bokmål": "nob",
"Norsk nynorsk": "nno",
"Occitan": "oci",
"Ozbek": "uzb",
"Polski": "pol",
"Portugues": "por",
"Português": "por",
"Qhichwa": "que",
"Ri Ben Yu": "jpn",
"Romana": "ron",
"Română": "ron",
"Rumantsch": "roh",
"Russkii": "rus",
"Shqip": "sqi",
"Slovencina": "slk",
"Slovenscina": "slv",
"Slovenčina": "slk",
"Slovenščina": "slv",
"Soomaaliga": "som",
"Srpski": "srp",
"Suomi": "fin",
"Svenska": "swe",
"Taqbaylit": "kab",
"TcYi": "aka",
"Tieng Viet": "vie",
"Tiếng Việt": "vie",
"Turkce": "tur",
"Türkçe": "tur",
"Tɕɥi": "aka",
"Ukrayinska": "ukr",
"Zhong Wen": "zho",
"Zhong Wen Fan Ti": "zh-Hant",
"Zhong Wen Jian Ti": "zh-Hans",
"`bryt": "heb",
"aithy": "tha",
"baaNlaa": "ben",
"bhaasaakhmaer": "khm",
"bmaackaa": "mya",
"eesti keel": "est",
"frsy": "fas",
"gujraatii": "guj",
"hangugeo": "kor",
"hindii": "hin",
"isiXhosa": "xho",
"isiZulu": "zul",
"k`art`uli": "kat",
"knndd": "kan",
"maithilii maithilii": "mai",
"mlyaallN": "mal",
"mraatthii": "mar",
"nepaalii": "nep",
"oddiaa": "ori",
"pNjaabii": "pan",
"pStw": "pus",
"phaasaaaithy": "tha",
"rdw": "urd",
"sNskRtm": "san",
"siNhl": "sin",
"srpskokhrvatski": "hbs",
"tatarcha": "tat",
"telugu": "tel",
"tlhIngan Hol": "tlh",
"tmilll": "tam",
"tochiki": "tgk",
"yyidySH": "yid",
"zaboni tochiki": "tgk",
"Íslenska": "isl",
"Čeština": "ces",
"Ελληνικά": "ell",
"Адыгэбзэ": "ady",
"Беларуская": "bel",
"Български": "bul",
"Македонски": "mkd",
"Монгол": "mon",
"Русский": "rus",
"Српски": "srp",
"Українська": "ukr",
"забо́ни тоҷикӣ́": "tgk",
"српскохрватски": "hbs",
"татарча": "tat",
"тоҷикӣ": "tgk",
"Қазақша": "kaz",
"Հայերեն": "hye",
"ייִדיש": "yid",
"עברית": "heb",
"اردو": "urd",
"العربية": "ara",
"فارسی": "fas",
"پښتو": "pus",
"नेपाली": "nep",
"मराठी": "mar",
"मैथिली মৈথিলী": "mai",
"संस्कृतम्": "san",
"हिन्दी": "hin",
"বাংলা": "ben",
"ਪੰਜਾਬੀ": "pan",
"ગુજરાતી": "guj",
"ଓଡ଼ିଆ": "ori",
"தமிழ்": "tam",
"తెలుగు": "tel",
"ಕನ್ನಡ": "kan",
"മലയാളം": "mal",
"සිංහල": "sin",
"ภาษาไทย": "tha",
"ไทย": "tha",
"ဗမာစကာ": "mya",
"ქართული": "kat",
"ភាសាខ្មែរ": "khm",
"中文": "zho",
"中文简体": "zh-Hans",
"中文繁體": "zh-Hant",
"廣東話": "zho",
"日本語": "jpn",
"한국어": "kor"
},
"regions": {
"Latin": "419",
"Latinoamerica": "419",
"Latinoamericano": "419",
"Latinoamérica": "419"
},
"scripts": {
"Fan Ti ": "Hant",
"Jian Ti ": "Hans",
"Simplified": "Hans",
"Traditional": "Hant",
"简体": "Hans",
"繁體": "Hant"
}
}

@ -0,0 +1,169 @@
import typing
from babelfish import (
COUNTRIES,
Country,
CountryReverseError,
LANGUAGE_MATRIX,
Language,
LanguageReverseError,
SCRIPTS,
Script,
country_converters,
language_converters
)
from babelfish.converters import CaseInsensitiveDict
from rebulk import Rebulk
from rebulk.match import Match
from trakit.config import Config
from trakit.context import Context
from trakit.converters.country import GuessCountryConverter
from trakit.converters.language import GuessLanguageConverter
from trakit.words import blank_match, blank_release_names, to_combinations, to_match, to_sentence, to_words
class LanguageFinder:
    """Find a language, optionally refined by country, region or script, in a string."""

    def __init__(self, config: Config):
        """Prepare window sizes, converters and lookup tables from ``config``.

        Note that this writes to babelfish's ``SCRIPTS``, ``country_converters``
        and ``language_converters`` objects (adds the ``'419'`` script and the
        ``'guess'`` converters).
        """
        # Every *_max_words value is the largest number of spaces found in
        # the corresponding names, but never less than 1.
        self.country_max_words = max([1, *(name.count(' ') for name in COUNTRIES.values())])
        self.language_max_words = max([1, *(entry.name.count(' ') for entry in LANGUAGE_MATRIX)])
        self.script_max_words = max([1, *(key.count(' ') for key in config.scripts.keys())])
        self.region_max_words = max([1, *(key.count(' ') for key in config.regions.keys())])

        # Until babelfish supports UN.M49
        SCRIPTS['419'] = 'Latin America and the Caribbean'
        country_converters['guess'] = GuessCountryConverter(config.countries)
        language_converters['guess'] = GuessLanguageConverter(config.languages)

        self.regions = CaseInsensitiveDict(config.regions)
        self.scripts = CaseInsensitiveDict(config.scripts)
        self.common_words = CaseInsensitiveDict(dict.fromkeys(config.ignored, 0))
        self.implicit = CaseInsensitiveDict(config.implicit_languages)

    def _find_country(self, value: str):
        """Return a match for the first word window that ``Country.fromguess`` resolves, else ``None``."""
        for window in to_combinations(to_words(value), self.country_max_words):
            try:
                return to_match(window, Country.fromguess(to_sentence(window)))
            except CountryReverseError:
                continue
        return None

    def _find_script(self, value: str):
        """Return a match for the first word window that yields a valid ``Script``, else ``None``.

        The window text is translated through ``self.scripts`` when present there.
        """
        for window in to_combinations(to_words(value), self.script_max_words):
            text = to_sentence(window)
            try:
                return to_match(window, Script(self.scripts.get(text, text)))
            except ValueError:
                continue
        return None

    def _find_region(self, value: str):
        """Return a match for the first word window that yields a valid region, else ``None``.

        Regions are represented with ``Script`` objects; the window text is
        translated through ``self.regions`` when present there.
        """
        for window in to_combinations(to_words(value), self.region_max_words):
            text = to_sentence(window)
            try:
                return to_match(window, Script(self.regions.get(text, text)))
            except ValueError:
                continue
        return None

    def _find_implicit_language(self, combinations: typing.List[typing.List[Match]]):
        """Return a match for the first window that implies a language, else ``None``.

        A window implies a language when its text, the code of the region
        found in it, or the alpha2 of the country guessed from it is a key
        of ``self.implicit``.
        """
        for window in combinations:
            sentence = to_sentence(window)
            if sentence in self.implicit:
                return to_match(window, Language.fromietf(self.implicit[sentence]))

            region = self._find_region(sentence)
            if region and region.value.code in self.implicit:
                implied = Language.fromietf(self.implicit[region.value.code])
                return Match(region.start, region.end, value=implied, input_string=region.input_string)

            try:
                country = Country.fromguess(sentence)
                if country.alpha2 not in self.implicit:
                    continue
                implied = Language.fromietf(self.implicit[country.alpha2])
                if lang_name_matches(implied, sentence):
                    # The text is the language name itself: look it up by name.
                    implied = Language.fromname(sentence)
                return to_match(window, implied)
            except CountryReverseError:
                pass
        return None

    def accept_word(self, string: str):
        """Return ``True`` unless ``string`` is an ignored word or is numeric."""
        return not (string.lower() in self.common_words or string.isnumeric())

    def _candidates(self, value: str):
        """Return the word windows of ``value`` used for language lookups."""
        accepted_words = to_words(value, predicate=self.accept_word)
        return to_combinations(accepted_words, self.language_max_words)

    def _refine(self, lang: Language, remaining_sentence: str):
        """Return ``lang`` completed with the first country, region or script found.

        ``remaining_sentence`` is scanned window by window. The first window
        that provides a country (whose raw text is not itself a language
        name), a region or a script wins. ``lang`` is returned unchanged when
        nothing is found.
        """
        for window in to_combinations(to_words(remaining_sentence), self.country_max_words):
            sentence = to_sentence(window)

            country = self._find_country(sentence)
            if country:
                try:
                    # discard country if value is actually the language name
                    Language.fromguess(country.raw)
                except LanguageReverseError:
                    return Language(lang.alpha3, country=country.value, script=lang.script)

            region = self._find_region(sentence)
            if region:
                return Language(lang.alpha3, country=lang.country, script=region.value)

            script = self._find_script(sentence)
            if script:
                return Language(lang.alpha3, country=lang.country, script=script.value)

        return lang

    def find_language(self, value: str, context: Context):
        """Return a match holding the language found in ``value``, or ``None``.

        Release-name-like parts of ``value`` are blanked first. An implicit
        language accepted by ``context`` is returned immediately when its
        script code is numeric; an implicit language rejected by ``context``
        is blanked out before the explicit search. The explicit search
        returns the first guessed language accepted by ``context``; the
        accepted implicit language is the fallback.
        """
        value = blank_release_names(value)
        combinations = self._candidates(value)
        implicit_lang = self._find_implicit_language(combinations)
        implicit_accepted = implicit_lang and context.accept(implicit_lang.value)

        if implicit_accepted:
            implicit_script = implicit_lang.value.script
            if implicit_script and implicit_script.code.isnumeric():
                return implicit_lang
        elif implicit_lang:
            value = blank_match(implicit_lang)
            combinations = self._candidates(value)

        for window in combinations:
            language_sentence = to_sentence(window)
            try:
                lang = Language.fromguess(language_sentence)
            except LanguageReverseError:
                continue

            # Look for a qualifier in the text left once the language is blanked.
            lang = self._refine(lang, blank_match(to_match(window, lang)))

            unqualified = not (lang.country or lang.script)
            if implicit_accepted and implicit_lang.value.alpha3 == lang.alpha3 and unqualified:
                return implicit_lang

            if context.accept(lang):
                return to_match(window, lang)

        if implicit_accepted:
            return implicit_lang
        return None

    def find(self, value: str, context: Context):
        """Return ``(start, end, {'value': language})`` for the match found, or ``None``."""
        found = self.find_language(value, context)
        if not found:
            return None
        return found.start, found.end, {'value': found.value}


def lang_name_matches(language: Language, sentence: str):
    """Return ``True`` when ``sentence`` equals the language name, ignoring case."""
    return language.name.lower() == sentence.lower()
def language(config: Config):
    """Return a ``Rebulk`` with ``LanguageFinder.find`` registered under the name ``language``."""
    builder = Rebulk()
    finder = LanguageFinder(config)
    builder.functional(finder.find, name='language')
    return builder

@ -0,0 +1,32 @@
import re
from functools import partial
from rebulk import Rebulk
from rebulk.validators import chars_surround
from trakit.config import Config
from trakit.language import language
from trakit.words import seps
def configure(config: Config):
    """Build the ``Rebulk`` combining the language rules with the flag rules.

    The flag rules are plain strings (``forced``, ``commentary``, ``external``,
    ``sdh``, ``alternate``, ``descriptive``) and one regex for closed captions.
    All of them are case-insensitive and validated with ``chars_surround``
    bound to ``seps``.
    """
    surrounded_by_seps = partial(chars_surround, seps)
    # Character class made of every separator, paired with '-' as an abbreviation.
    separator_class = rf'[{re.escape("".join(seps))}]'

    others = Rebulk()
    others.defaults(ignore_case=True, validator=surrounded_by_seps)
    others.regex_defaults(
        flags=re.IGNORECASE,
        abbreviations=[(r'-', separator_class)],
        validator=surrounded_by_seps,
    )

    # (pattern, property name, property value), registered in this order.
    string_rules = (
        ('forced', 'forced', True),
        ('commentary', 'commentary', True),
        ('external', 'external', True),
        ('sdh', 'hearing_impaired', True),
        ('alternate', 'version', 'alternate'),
        ('descriptive', 'descriptive', True),
    )
    for pattern, prop, prop_value in string_rules:
        others.string(pattern, name=prop, value=prop_value)
    others.regex('cc', 'closed-captions?', name='closed_caption', value=True)

    root = Rebulk()
    root.rebulk(language(config))
    root.rebulk(others)
    return root

@ -0,0 +1,99 @@
import re
import typing
from rebulk.match import Match
# Characters that separate words: space, ASCII brackets and punctuation, plus
# the full-width parentheses U+FF08 and U+FF09. Default ``separators`` of to_words.
seps = frozenset(r' [](){}+*|=-_~#/\\.,;:' + '\uff08\uff09')
# Characters skipped while building a word (the apostrophe). Default
# ``ignore_chars`` of to_words.
suppress_chars = frozenset("'")
# Matches three or more dot-separated tokens that contain neither dots nor
# whitespace; the whole match is exposed as the ``release`` group.
release_name_re = re.compile(r'(?P<release>[^\.\s]+(?:\.[^\.\s]+){2,})')
def to_words(value: str,
             separators: typing.FrozenSet[str] = seps,
             ignore_chars: typing.FrozenSet[str] = suppress_chars,
             predicate: typing.Callable[[str], bool] = lambda x: True):
    """Split ``value`` into a list of word matches.

    Characters in ``separators`` end the current word. Characters in
    ``ignore_chars`` are left out of the word text without ending the word.
    ``predicate`` receives the raw slice of ``value`` covered by each word;
    rejected words are not returned and are replaced by spaces in the
    ``input_string`` shared by all returned matches.

    :param value: text to split.
    :param separators: characters that delimit words.
    :param ignore_chars: characters dropped from the word text.
    :param predicate: callable deciding whether a word is kept.
    :return: list of ``Match`` objects, one per kept word.
    """
    blanked = value
    found: typing.List[Match] = []
    pending = ''
    word_start = 0
    position = 0
    for position, char in enumerate(value, 1):
        if char in ignore_chars:
            continue
        if char not in separators:
            pending += char
            continue

        # A separator: close the pending word, if any.
        if pending:
            word_end = position - 1
            if predicate(value[word_start:word_end]):
                found.append(Match(word_start, word_end, value=pending))
            else:
                blanked = blank(blanked, word_start, word_end)
            pending = ''
        # The next word can only start after this separator.
        word_start = position

    # Word running up to the end of the string.
    if pending:
        if predicate(value[word_start:]):
            found.append(Match(word_start, position, value=pending))
        else:
            blanked = blank(blanked, word_start, len(blanked))

    for item in found:
        item.input_string = blanked
    return found
def to_combinations(words: typing.List[Match], max_items: int):
    """Return every contiguous window of ``words``, largest windows first.

    Window sizes go from ``min(max_items, len(words))`` down to 1; windows of
    the same size are listed from left to right.

    :param words: items to combine.
    :param max_items: largest window size to produce.
    :return: list of windows, each one a slice of ``words``.
    """
    total = len(words)
    windows: typing.List[typing.List[Match]] = []
    for size in range(min(max_items, total), 0, -1):
        windows.extend(words[begin:begin + size] for begin in range(total - size + 1))
    return windows
def to_sentence(combination: typing.List[Match]):
    """Return the ``value`` of every item of ``combination`` joined by single spaces."""
    values = (item.value for item in combination)
    return ' '.join(values)
def to_match(combination: typing.List[Match], value: typing.Any):
    """Return one ``Match`` spanning ``combination`` and holding ``value``.

    The span runs from the start of the first item to the end of the last
    one; ``input_string`` is taken from the first item.
    """
    first = combination[0]
    last = combination[-1]
    return Match(first.start, last.end, value=value, input_string=first.input_string)
def blank(string: str, start: int, end: int):
    """Return ``string`` with the characters in ``[start, end)`` replaced by spaces."""
    padding = ' ' * (end - start)
    return ''.join((string[:start], padding, string[end:]))
def blank_match(match: Match):
    """Return the input string of ``match`` with the span of ``match`` replaced by spaces."""
    source = match.input_string
    return blank(source, match.start, match.end)
def blank_release_names(value: str):
    """Return ``value`` with every ``release_name_re`` match replaced by spaces.

    The length of the string is preserved, so offsets computed on the result
    are also valid for ``value``.
    """
    cleaned = value
    # Matches are non-empty and non-overlapping, found from left to right.
    for hit in release_name_re.finditer(value):
        cleaned = blank(cleaned, hit.start('release'), hit.end('release'))
    return cleaned

@ -17,7 +17,7 @@ ga4mp==2.0.4
guess_language-spirit==0.5.3 guess_language-spirit==0.5.3
guessit==3.5.0 guessit==3.5.0
jsonschema==4.17.0 jsonschema==4.17.0
knowit==0.4.0 knowit==0.5.2
peewee==3.15.3 peewee==3.15.3
py-pretty==1 py-pretty==1
pycountry==22.3.5 pycountry==22.3.5
@ -80,8 +80,9 @@ zipp==3.10.0
markupsafe==2.1.1 markupsafe==2.1.1
# Required-by: knowit # Required-by: knowit
pymediainfo==5.1.0 pymediainfo==6.0.1
pyyaml==6.0 pyyaml==6.0
trakit==0.2.1
# Required-by: python-socketio # Required-by: python-socketio
bidict==0.22.0 bidict==0.22.0

Loading…
Cancel
Save