# bazarr/libs/ffsubsync/subtitle_parser.py

# -*- coding: utf-8 -*-
from datetime import timedelta
import logging
from typing import Any, cast, List, Optional

try:
    import cchardet
except:  # noqa: E722
    cchardet = None
try:
    import chardet
except:  # noqa: E722
    chardet = None
try:
    import charset_normalizer
except:  # noqa: E722
    charset_normalizer = None
import pysubs2
from ffsubsync.sklearn_shim import TransformerMixin
import srt

from ffsubsync.constants import (
    DEFAULT_ENCODING,
    DEFAULT_MAX_SUBTITLE_SECONDS,
    DEFAULT_START_SECONDS,
)
from ffsubsync.file_utils import open_file
from ffsubsync.generic_subtitles import GenericSubtitle, GenericSubtitlesFile, SubsMixin

# Configure the root logger at INFO level when this module is imported
# (logging.basicConfig does nothing if the root logger already has handlers),
# then create the module-level logger used by the functions below.
logging.basicConfig(level=logging.INFO)
logger: logging.Logger = logging.getLogger(__name__)


def _preprocess_subs(
    subs,
    max_subtitle_seconds: Optional[int] = None,
    start_seconds: int = 0,
    tolerant: bool = True,
) -> List[GenericSubtitle]:
    """Wrap, filter and clamp raw subtitles into a list of ``GenericSubtitle``.

    Args:
        subs: Iterable of raw subtitle objects; each one is passed to
            ``GenericSubtitle.wrap_inner_subtitle``.
        max_subtitle_seconds: Longest duration a subtitle may keep. When
            ``None``, a cap of one day is used.
        start_seconds: Subtitles starting before this offset are dropped.
        tolerant: When true, a ``ValueError`` raised while fetching or
            wrapping a subtitle is logged as a warning and the loop moves on;
            when false, it is re-raised.

    Returns:
        The retained subtitles, in iteration order, with ``end`` limited to
        ``start`` plus the duration cap.
    """
    kept = []
    earliest_start = timedelta(seconds=start_seconds)
    duration_cap = (
        timedelta(days=1)
        if max_subtitle_seconds is None
        else timedelta(seconds=max_subtitle_seconds)
    )
    remaining = iter(subs)
    # ``next`` is called inside the ``try`` on purpose: the underlying
    # iterator may itself raise ValueError, and that must be handled the same
    # way as a failure while wrapping.
    while True:
        try:
            wrapped = GenericSubtitle.wrap_inner_subtitle(next(remaining))
            if not wrapped.start < earliest_start:
                wrapped.end = min(wrapped.end, wrapped.start + duration_cap)
                kept.append(wrapped)
        except StopIteration:
            return kept
        # We don't catch SRTParseError here b/c that is typically raised when we
        # are trying to parse with the wrong encoding, in which case we might
        # be able to try another one on the *entire* set of subtitles elsewhere.
        except ValueError as err:
            if not tolerant:
                raise
            logger.warning(err)


class GenericSubtitleParser(SubsMixin, TransformerMixin):
    """Read a subtitle file and expose it as a ``GenericSubtitlesFile``.

    The class follows a fit/transform interface: ``fit`` reads, decodes and
    parses the file and stores the result in ``subs_``; ``transform`` returns
    that stored result.
    """

    def __init__(
        self,
        fmt: str = "srt",
        encoding: str = "infer",
        caching: bool = False,
        max_subtitle_seconds: Optional[int] = None,
        start_seconds: int = 0,
        skip_ssa_info: bool = False,
        strict: bool = False,
    ) -> None:
        """Store the parsing configuration.

        Args:
            fmt: Subtitle format. ``"srt"`` is parsed with ``srt``;
                ``"ass"``, ``"ssa"`` and ``"sub"`` are parsed with ``pysubs2``.
            encoding: Codec name used to decode the file, or ``"infer"`` to
                detect it with whichever charset-detection library is
                installed.
            caching: When true, calling ``fit`` again with the same file name
                returns immediately without re-reading the file.
            max_subtitle_seconds: Forwarded to ``_preprocess_subs`` as the
                per-subtitle duration cap.
            start_seconds: Forwarded to ``_preprocess_subs``; earlier
                subtitles are dropped.
            skip_ssa_info: When true, the SSA ``info`` section is passed on
                as ``None`` instead of the parsed value.
            strict: When true, ``srt.parse`` is called with
                ``ignore_errors=False``.
        """
        # Zero-argument super() is bound to this class. The former
        # super(self.__class__, self) form recurses without end as soon as
        # the class is subclassed, because self.__class__ is then the subclass.
        super().__init__()
        self.sub_format: str = fmt
        self.encoding: str = encoding
        self.caching: bool = caching
        self.fit_fname: Optional[str] = None
        self.detected_encoding_: Optional[str] = None
        self.max_subtitle_seconds: Optional[int] = max_subtitle_seconds
        self.start_seconds: int = start_seconds
        # FIXME: hack to get tests to pass; remove
        self._skip_ssa_info: bool = skip_ssa_info
        self._strict: bool = strict

    def fit(self, fname: Optional[str], *_) -> "GenericSubtitleParser":
        """Read and parse ``fname``, storing the result in ``self.subs_``.

        Args:
            fname: Path handed to ``open_file``. ``None`` is recorded as
                ``"<stdin>"`` for caching purposes.
            *_: Ignored extra positional arguments.

        Returns:
            ``self``.

        Raises:
            AssertionError: If ``encoding`` is ``"infer"`` and no detection
                library produced an encoding.
            Exception: The last error raised while decoding or parsing
                (including ``NotImplementedError`` for an unsupported format).
        """
        fit_key = "<stdin>" if fname is None else fname
        if self.caching and self.fit_fname == fit_key:
            return self
        encodings_to_try = (self.encoding,)
        with open_file(fname, "rb") as f:
            subs = f.read()
        if self.encoding == "infer":
            # Forget any encoding detected by an earlier fit; otherwise a
            # failed detection here would slip past the check below and the
            # literal string "infer" would be used as a codec name.
            self.detected_encoding_ = None
            for chardet_lib in (cchardet, charset_normalizer, chardet):
                if chardet_lib is None:
                    continue
                try:
                    detected_encoding = cast(
                        Optional[str], chardet_lib.detect(subs)["encoding"]
                    )
                except Exception:
                    # Best effort: a failing detector just means we try the
                    # next one. KeyboardInterrupt/SystemExit still propagate.
                    continue
                if detected_encoding is not None:
                    self.detected_encoding_ = detected_encoding
                    encodings_to_try = (detected_encoding,)
                    break
            # NOTE: this check is stripped when Python runs with -O.
            assert self.detected_encoding_ is not None
            logger.info("detected encoding: %s", self.detected_encoding_)
        exc = None
        for encoding in encodings_to_try:
            try:
                # Undecodable bytes are replaced rather than raising.
                decoded_subs = subs.decode(encoding, errors="replace").strip()
                if self.sub_format == "srt":
                    parsed_subs = srt.parse(
                        decoded_subs, ignore_errors=not self._strict
                    )
                elif self.sub_format in ("ass", "ssa", "sub"):
                    parsed_subs = pysubs2.SSAFile.from_string(decoded_subs)
                else:
                    raise NotImplementedError(
                        "unsupported format: %s" % self.sub_format
                    )
                extra_generic_subtitle_file_kwargs = {}
                if isinstance(parsed_subs, pysubs2.SSAFile):
                    extra_generic_subtitle_file_kwargs.update(
                        styles=parsed_subs.styles,
                        # pysubs2 on Python >= 3.6 doesn't support this
                        fonts_opaque=getattr(parsed_subs, "fonts_opaque", None),
                        info=parsed_subs.info if not self._skip_ssa_info else None,
                    )
                self.subs_ = GenericSubtitlesFile(
                    _preprocess_subs(
                        parsed_subs,
                        max_subtitle_seconds=self.max_subtitle_seconds,
                        start_seconds=self.start_seconds,
                    ),
                    sub_format=self.sub_format,
                    encoding=encoding,
                    **extra_generic_subtitle_file_kwargs,
                )
                self.fit_fname = fit_key
                # Only matters when several candidate encodings were tried;
                # the tuples built above always hold a single entry.
                if len(encodings_to_try) > 1:
                    self.detected_encoding_ = encoding
                    logger.info("detected encoding: %s", self.detected_encoding_)
                return self
            except Exception as e:
                # Remember the failure and move on to the next candidate.
                exc = e
                continue
        raise exc

    def transform(self, *_) -> GenericSubtitlesFile:
        """Return the subtitles stored by the most recent successful ``fit``."""
        return self.subs_


def make_subtitle_parser(
    fmt: str,
    encoding: str = DEFAULT_ENCODING,
    caching: bool = False,
    max_subtitle_seconds: int = DEFAULT_MAX_SUBTITLE_SECONDS,
    start_seconds: int = DEFAULT_START_SECONDS,
    **kwargs: Any,
) -> GenericSubtitleParser:
    """Build a ``GenericSubtitleParser`` from the given settings.

    The named parameters are forwarded unchanged. ``skip_ssa_info`` and
    ``strict`` are looked up in ``kwargs`` and default to ``False``; any other
    extra keyword arguments are ignored.
    """
    parser_settings = {
        "fmt": fmt,
        "encoding": encoding,
        "caching": caching,
        "max_subtitle_seconds": max_subtitle_seconds,
        "start_seconds": start_seconds,
        "skip_ssa_info": kwargs.get("skip_ssa_info", False),
        "strict": kwargs.get("strict", False),
    }
    return GenericSubtitleParser(**parser_settings)
Subsync first implementation (only after download/upload). 5 years ago			`# -- coding: utf-8 --`
			`from datetime import timedelta`
			`import logging`
Improved subtitles synchronisation settings and added a manual sync modal 11 months ago			`from typing import Any, cast, List, Optional`
Subsync first implementation (only after download/upload). 5 years ago
Added fallback to chardet if cchardet is not available. 5 years ago			`try:`
Improved subtitles synchronisation settings and added a manual sync modal 11 months ago			`import cchardet`
			`except: # noqa: E722`
			`cchardet = None`
			`try:`
			`import chardet`
			`except: # noqa: E722`
			`chardet = None`
			`try:`
			`import charset_normalizer`
			`except: # noqa: E722`
			`charset_normalizer = None`
Subsync first implementation (only after download/upload). 5 years ago			`import pysubs2`
Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies. 3 years ago			`from ffsubsync.sklearn_shim import TransformerMixin`
Subsync first implementation (only after download/upload). 5 years ago			`import srt`

Improved subtitles synchronisation settings and added a manual sync modal 11 months ago			`from ffsubsync.constants import (`
			`DEFAULT_ENCODING,`
			`DEFAULT_MAX_SUBTITLE_SECONDS,`
			`DEFAULT_START_SECONDS,`
			`)`
Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies. 3 years ago			`from ffsubsync.file_utils import open_file`
			`from ffsubsync.generic_subtitles import GenericSubtitle, GenericSubtitlesFile, SubsMixin`
Subsync first implementation (only after download/upload). 5 years ago
			`logging.basicConfig(level=logging.INFO)`
Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies. 3 years ago			`logger: logging.Logger = logging.getLogger(__name__)`
Subsync first implementation (only after download/upload). 5 years ago

Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies. 3 years ago			`def _preprocess_subs(`
			`subs,`
			`max_subtitle_seconds: Optional[int] = None,`
			`start_seconds: int = 0,`
			`tolerant: bool = True,`
			`) -> List[GenericSubtitle]:`
Subsync first implementation (only after download/upload). 5 years ago			`subs_list = []`
			`start_time = timedelta(seconds=start_seconds)`
			`max_duration = timedelta(days=1)`
			`if max_subtitle_seconds is not None:`
			`max_duration = timedelta(seconds=max_subtitle_seconds)`
			`subs = iter(subs)`
			`while True:`
			`try:`
			`next_sub = GenericSubtitle.wrap_inner_subtitle(next(subs))`
			`if next_sub.start < start_time:`
			`continue`
			`next_sub.end = min(next_sub.end, next_sub.start + max_duration)`
			`subs_list.append(next_sub)`
			`# We don't catch SRTParseError here b/c that is typically raised when we`
			`# are trying to parse with the wrong encoding, in which case we might`
			`# be able to try another one on the entire set of subtitles elsewhere.`
			`except ValueError as e:`
			`if tolerant:`
			`logger.warning(e)`
			`continue`
			`else:`
			`raise`
			`except StopIteration:`
			`break`
			`return subs_list`


			`class GenericSubtitleParser(SubsMixin, TransformerMixin):`
Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies. 3 years ago			`def __init__(`
			`self,`
			`fmt: str = "srt",`
			`encoding: str = "infer",`
			`caching: bool = False,`
			`max_subtitle_seconds: Optional[int] = None,`
			`start_seconds: int = 0,`
			`skip_ssa_info: bool = False,`
Improved subtitles synchronisation settings and added a manual sync modal 11 months ago			`strict: bool = False,`
Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies. 3 years ago			`) -> None:`
Subsync first implementation (only after download/upload). 5 years ago			`super(self.__class__, self).__init__()`
Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies. 3 years ago			`self.sub_format: str = fmt`
			`self.encoding: str = encoding`
			`self.caching: bool = caching`
			`self.fit_fname: Optional[str] = None`
			`self.detected_encoding_: Optional[str] = None`
			`self.max_subtitle_seconds: Optional[int] = max_subtitle_seconds`
			`self.start_seconds: int = start_seconds`
			`# FIXME: hack to get tests to pass; remove`
			`self._skip_ssa_info: bool = skip_ssa_info`
Improved subtitles synchronisation settings and added a manual sync modal 11 months ago			`self._strict: bool = strict`
Subsync first implementation (only after download/upload). 5 years ago
Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies. 3 years ago			`def fit(self, fname: str, *_) -> "GenericSubtitleParser":`
			`if self.caching and self.fit_fname == ("<stdin>" if fname is None else fname):`
Subsync first implementation (only after download/upload). 5 years ago			`return self`
			`encodings_to_try = (self.encoding,)`
Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies. 3 years ago			`with open_file(fname, "rb") as f:`
Subsync first implementation (only after download/upload). 5 years ago			`subs = f.read()`
Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies. 3 years ago			`if self.encoding == "infer":`
Improved subtitles synchronisation settings and added a manual sync modal 11 months ago			`for chardet_lib in (cchardet, charset_normalizer, chardet):`
			`if chardet_lib is not None:`
			`try:`
			`detected_encoding = cast(`
			`Optional[str], chardet_lib.detect(subs)["encoding"]`
			`)`
			`except: # noqa: E722`
			`continue`
			`if detected_encoding is not None:`
			`self.detected_encoding_ = detected_encoding`
			`encodings_to_try = (detected_encoding,)`
			`break`
			`assert self.detected_encoding_ is not None`
Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies. 3 years ago			`logger.info("detected encoding: %s" % self.detected_encoding_)`
Subsync first implementation (only after download/upload). 5 years ago			`exc = None`
			`for encoding in encodings_to_try:`
			`try:`
Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies. 3 years ago			`decoded_subs = subs.decode(encoding, errors="replace").strip()`
			`if self.sub_format == "srt":`
Improved subtitles synchronisation settings and added a manual sync modal 11 months ago			`parsed_subs = srt.parse(`
			`decoded_subs, ignore_errors=not self._strict`
			`)`
Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies. 3 years ago			`elif self.sub_format in ("ass", "ssa", "sub"):`
Subsync first implementation (only after download/upload). 5 years ago			`parsed_subs = pysubs2.SSAFile.from_string(decoded_subs)`
			`else:`
Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies. 3 years ago			`raise NotImplementedError(`
			`"unsupported format: %s" % self.sub_format`
			`)`
			`extra_generic_subtitle_file_kwargs = {}`
			`if isinstance(parsed_subs, pysubs2.SSAFile):`
			`extra_generic_subtitle_file_kwargs.update(`
			`dict(`
			`styles=parsed_subs.styles,`
			`# pysubs2 on Python >= 3.6 doesn't support this`
			`fonts_opaque=getattr(parsed_subs, "fonts_opaque", None),`
			`info=parsed_subs.info if not self._skip_ssa_info else None,`
			`)`
			`)`
Subsync first implementation (only after download/upload). 5 years ago			`self.subs_ = GenericSubtitlesFile(`
Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies. 3 years ago			`_preprocess_subs(`
			`parsed_subs,`
			`max_subtitle_seconds=self.max_subtitle_seconds,`
			`start_seconds=self.start_seconds,`
			`),`
Subsync first implementation (only after download/upload). 5 years ago			`sub_format=self.sub_format,`
Update ffsubsync and srt module * Update ffsubsync to 0.4.11 * Update srt to 3.4.1 4 years ago			`encoding=encoding,`
Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies. 3 years ago			`**extra_generic_subtitle_file_kwargs,`
Subsync first implementation (only after download/upload). 5 years ago			`)`
Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies. 3 years ago			`self.fit_fname = "<stdin>" if fname is None else fname`
Added on demand subtitles synchronization. 5 years ago			`if len(encodings_to_try) > 1:`
			`self.detected_encoding_ = encoding`
Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies. 3 years ago			`logger.info("detected encoding: %s" % self.detected_encoding_)`
Subsync first implementation (only after download/upload). 5 years ago			`return self`
			`except Exception as e:`
			`exc = e`
			`continue`
			`raise exc`

Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies. 3 years ago			`def transform(self, *_) -> GenericSubtitlesFile:`
Subsync first implementation (only after download/upload). 5 years ago			`return self.subs_`
Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies. 3 years ago

			`def make_subtitle_parser(`
			`fmt: str,`
			`encoding: str = DEFAULT_ENCODING,`
			`caching: bool = False,`
			`max_subtitle_seconds: int = DEFAULT_MAX_SUBTITLE_SECONDS,`
			`start_seconds: int = DEFAULT_START_SECONDS,`
			`**kwargs: Any,`
			`) -> GenericSubtitleParser:`
			`return GenericSubtitleParser(`
			`fmt=fmt,`
			`encoding=encoding,`
			`caching=caching,`
			`max_subtitle_seconds=max_subtitle_seconds,`
			`start_seconds=start_seconds,`
			`skip_ssa_info=kwargs.get("skip_ssa_info", False),`
Improved subtitles synchronisation settings and added a manual sync modal 11 months ago			`strict=kwargs.get("strict", False),`
Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies. 3 years ago			`)`