From a7b40eaf79936dc71b35398311ea4398d520982d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Louis=20V=C3=A9zina?= <5130500+morpheus65535@users.noreply.github.com> Date: Fri, 20 Sep 2019 17:56:33 -0400 Subject: [PATCH] WIP --- bazarr/get_subtitle.py | 3 +- libs/pysubs2/common.py | 2 +- libs/pysubs2/formats.py | 3 ++ libs/pysubs2/ssafile.py | 10 ++++- libs/pysubs2/ssastyle.py | 2 +- libs/pysubs2/subrip.py | 20 ++++----- libs/pysubs2/substation.py | 17 ++++--- libs/pysubs2/time.py | 14 ++++++ libs/pysubs2/txt_generic.py | 45 ------------------- libs/subliminal_patch/core.py | 4 +- libs/subliminal_patch/http.py | 6 +-- libs/subliminal_patch/pitcher.py | 2 +- libs/subliminal_patch/providers/addic7ed.py | 22 ++++----- .../subliminal_patch/providers/hosszupuska.py | 2 +- libs/subliminal_patch/providers/legendastv.py | 4 +- libs/subliminal_patch/providers/mixins.py | 10 +---- libs/subliminal_patch/providers/subscene.py | 8 ++-- 17 files changed, 75 insertions(+), 99 deletions(-) delete mode 100644 libs/pysubs2/txt_generic.py diff --git a/bazarr/get_subtitle.py b/bazarr/get_subtitle.py index 195929d4a..980bf9b35 100644 --- a/bazarr/get_subtitle.py +++ b/bazarr/get_subtitle.py @@ -202,7 +202,7 @@ def download_subtitle(path, language, hi, forced, providers, providers_auth, sce directory=fld, chmod=chmod, # formats=("srt", "vtt") - path_decoder=force_unicode + path_decoder=None ) except Exception as e: logging.exception('BAZARR Error saving subtitles file to disk for this file:' + path) @@ -419,7 +419,6 @@ def manual_download_subtitle(path, language, hi, forced, subtitle, provider, pro if not subtitle.is_valid(): logging.exception('BAZARR No valid subtitles file found for this file: ' + path) return - logging.debug('BAZARR Subtitles file downloaded for this file:' + path) try: score = round(subtitle.score / max_score * 100, 2) fld = get_target_folder(path) diff --git a/libs/pysubs2/common.py b/libs/pysubs2/common.py index 2f95ccf44..4688e5df4 100644 --- a/libs/pysubs2/common.py +++ 
b/libs/pysubs2/common.py @@ -17,7 +17,7 @@ class Color(_Color): return _Color.__new__(cls, r, g, b, a) #: Version of the pysubs2 library. -VERSION = "0.2.3" +VERSION = "0.2.4" PY3 = sys.version_info.major == 3 diff --git a/libs/pysubs2/formats.py b/libs/pysubs2/formats.py index 5c25a6e96..869a3b6c7 100644 --- a/libs/pysubs2/formats.py +++ b/libs/pysubs2/formats.py @@ -4,6 +4,7 @@ from .subrip import SubripFormat from .jsonformat import JSONFormat from .substation import SubstationFormat from .mpl2 import MPL2Format +from .tmp import TmpFormat from .exceptions import * #: Dict mapping file extensions to format identifiers. @@ -13,6 +14,7 @@ FILE_EXTENSION_TO_FORMAT_IDENTIFIER = { ".ssa": "ssa", ".sub": "microdvd", ".json": "json", + ".txt": "tmp", } #: Dict mapping format identifiers to implementations (FormatBase subclasses). @@ -23,6 +25,7 @@ FORMAT_IDENTIFIER_TO_FORMAT_CLASS = { "microdvd": MicroDVDFormat, "json": JSONFormat, "mpl2": MPL2Format, + "tmp": TmpFormat, } def get_format_class(format_): diff --git a/libs/pysubs2/ssafile.py b/libs/pysubs2/ssafile.py index c6a668439..390a31b54 100644 --- a/libs/pysubs2/ssafile.py +++ b/libs/pysubs2/ssafile.py @@ -66,7 +66,14 @@ class SSAFile(MutableSequence): be detected from the file, in which case you don't need to specify it here (when given, this argument overrides autodetection). - kwargs: Extra options for the parser. + keep_unknown_html_tags (bool): This affects SubRip only (SRT), + for other formats this argument is ignored. + By default, HTML tags are converted to equivalent SubStation tags + (eg. ``<i>`` to ``{\\i1}``) and any remaining tags are removed + to keep the text clean. Set this parameter to ``True`` + if you want to pass through these tags (eg. ``<sub>``). + This is useful if your output format is SRT and your player + supports these tags. 
Returns: SSAFile @@ -86,6 +93,7 @@ class SSAFile(MutableSequence): Example: >>> subs1 = pysubs2.load("subrip-subtitles.srt") >>> subs2 = pysubs2.load("microdvd-subtitles.sub", fps=23.976) + >>> subs3 = pysubs2.load("subrip-subtitles-with-fancy-tags.srt", keep_unknown_html_tags=True) """ with open(path, encoding=encoding) as fp: diff --git a/libs/pysubs2/ssastyle.py b/libs/pysubs2/ssastyle.py index 2fcadc7ed..eb59b74b5 100644 --- a/libs/pysubs2/ssastyle.py +++ b/libs/pysubs2/ssastyle.py @@ -56,7 +56,7 @@ class SSAStyle(object): self.encoding = 1 #: Charset for k, v in fields.items(): - if k in self.FIELDS and v is not None: + if k in self.FIELDS: setattr(self, k, v) else: raise ValueError("SSAStyle has no field named %r" % k) diff --git a/libs/pysubs2/subrip.py b/libs/pysubs2/subrip.py index fea4eade6..70cb96fe5 100644 --- a/libs/pysubs2/subrip.py +++ b/libs/pysubs2/subrip.py @@ -31,7 +31,7 @@ class SubripFormat(FormatBase): return "srt" @classmethod - def from_file(cls, subs, fp, format_, **kwargs): + def from_file(cls, subs, fp, format_, keep_unknown_html_tags=False, **kwargs): timestamps = [] # (start, end) following_lines = [] # contains lists of lines following each timestamp @@ -56,15 +56,15 @@ class SubripFormat(FormatBase): # Handle the general case. s = "".join(lines).strip() s = re.sub(r"\n+ *\d+ *$", "", s) # strip number of next subtitle - s = re.sub(r"< *i *>", r"{\i1}", s) - s = re.sub(r"< */ *i *>", r"{\i0}", s) - s = re.sub(r"< *s *>", r"{\s1}", s) - s = re.sub(r"< */ *s *>", r"{\s0}", s) - s = re.sub(r"< *u *>", "{\\u1}", s) # not r" for Python 2.7 compat, triggers unicodeescape - s = re.sub(r"< */ *u *>", "{\\u0}", s) - s = re.sub(r"< */? 
*[a-zA-Z][^>]*>", "", s) # strip other HTML tags - s = re.sub(r"\r", "", s) # convert newlines - s = re.sub(r"\n", r"\N", s) # convert newlines + s = re.sub(r"< *i *>", r"{\\i1}", s) + s = re.sub(r"< */ *i *>", r"{\\i0}", s) + s = re.sub(r"< *s *>", r"{\\s1}", s) + s = re.sub(r"< */ *s *>", r"{\\s0}", s) + s = re.sub(r"< *u *>", "{\\\\u1}", s) # not r" for Python 2.7 compat, triggers unicodeescape + s = re.sub(r"< */ *u *>", "{\\\\u0}", s) + if not keep_unknown_html_tags: + s = re.sub(r"< */? *[a-zA-Z][^>]*>", "", s) # strip other HTML tags + s = re.sub(r"\n", r"\\N", s) # convert newlines return s subs.events = [SSAEvent(start=start, end=end, text=prepare_text(lines)) diff --git a/libs/pysubs2/substation.py b/libs/pysubs2/substation.py index f810a4776..8563f8a0d 100644 --- a/libs/pysubs2/substation.py +++ b/libs/pysubs2/substation.py @@ -145,7 +145,12 @@ class SubstationFormat(FormatBase): def string_to_field(f, v): if f in {"start", "end"}: - return timestamp_to_ms(TIMESTAMP.match(v).groups()) + if v.startswith("-"): + # handle negative timestamps + v = v[1:] + return -timestamp_to_ms(TIMESTAMP.match(v).groups()) + else: + return timestamp_to_ms(TIMESTAMP.match(v).groups()) elif "color" in f: if format_ == "ass": return ass_rgba_to_color(v) @@ -184,22 +189,22 @@ class SubstationFormat(FormatBase): elif inside_info_section or inside_aegisub_section: if line.startswith(";"): continue # skip comments try: - k, v = line.split(": ", 1) + k, v = line.split(":", 1) if inside_info_section: - subs.info[k] = v + subs.info[k] = v.strip() elif inside_aegisub_section: - subs.aegisub_project[k] = v + subs.aegisub_project[k] = v.strip() except ValueError: pass elif line.startswith("Style:"): - _, rest = line.split(": ", 1) + _, rest = line.split(":", 1) buf = rest.strip().split(",") name, raw_fields = buf[0], buf[1:] # splat workaround for Python 2.7 field_dict = {f: string_to_field(f, v) for f, v in zip(STYLE_FIELDS[format_], raw_fields)} sty = SSAStyle(**field_dict) 
subs.styles[name] = sty elif line.startswith("Dialogue:") or line.startswith("Comment:"): - ev_type, rest = line.split(": ", 1) + ev_type, rest = line.split(":", 1) raw_fields = rest.strip().split(",", len(EVENT_FIELDS[format_])-1) field_dict = {f: string_to_field(f, v) for f, v in zip(EVENT_FIELDS[format_], raw_fields)} field_dict["type"] = ev_type diff --git a/libs/pysubs2/time.py b/libs/pysubs2/time.py index 46d349f85..24e9ec077 100644 --- a/libs/pysubs2/time.py +++ b/libs/pysubs2/time.py @@ -49,6 +49,20 @@ def timestamp_to_ms(groups): ms += h * 3600000 return ms +def tmptimestamp_to_ms(groups): + """ + Convert groups from :data:`pysubs2.time.TMPTIMESTAMP` match to milliseconds. + + Example: + >>> tmptimestamp_to_ms(TMPTIMESTAMP.match("0:00:01").groups()) + 1000 + + """ + h, m, s = map(int, groups) + ms = s * 1000 + ms += m * 60000 + ms += h * 3600000 + return ms def times_to_ms(h=0, m=0, s=0, ms=0): """ Convert hours, minutes, seconds to milliseconds. diff --git a/libs/pysubs2/txt_generic.py b/libs/pysubs2/txt_generic.py deleted file mode 100644 index 70bf3e31c..000000000 --- a/libs/pysubs2/txt_generic.py +++ /dev/null @@ -1,45 +0,0 @@ -# coding=utf-8 - -from __future__ import print_function, division, unicode_literals -import re -from numbers import Number - -from pysubs2.time import times_to_ms -from .formatbase import FormatBase -from .ssaevent import SSAEvent -from .ssastyle import SSAStyle - - -# thanks to http://otsaloma.io/gaupol/doc/api/aeidon.files.mpl2_source.html -MPL2_FORMAT = re.compile(r"^(?um)\[(-?\d+)\]\[(-?\d+)\](.*?)$") - - -class TXTGenericFormat(FormatBase): - @classmethod - def guess_format(cls, text): - if MPL2_FORMAT.match(text): - return "mpl2" - - -class MPL2Format(FormatBase): - @classmethod - def guess_format(cls, text): - return TXTGenericFormat.guess_format(text) - - @classmethod - def from_file(cls, subs, fp, format_, **kwargs): - def prepare_text(lines): - out = [] - for s in lines.split("|"): - if s.startswith("/"): - 
out.append(r"{\i1}%s{\i0}" % s[1:]) - continue - out.append(s) - return "\n".join(out) - - subs.events = [SSAEvent(start=times_to_ms(s=float(start) / 10), end=times_to_ms(s=float(end) / 10), - text=prepare_text(text)) for start, end, text in MPL2_FORMAT.findall(fp.getvalue())] - - @classmethod - def to_file(cls, subs, fp, format_, **kwargs): - raise NotImplemented diff --git a/libs/subliminal_patch/core.py b/libs/subliminal_patch/core.py index c5f73fb48..bd5fff217 100644 --- a/libs/subliminal_patch/core.py +++ b/libs/subliminal_patch/core.py @@ -854,8 +854,8 @@ def save_subtitles(file_path, subtitles, single=False, directory=None, chmod=Non logger.debug(u"Saving %r to %r", subtitle, subtitle_path) content = subtitle.get_modified_content(format=format, debug=debug_mods) if content: - with open(subtitle_path, 'w') as f: - f.write(content.decode('utf-8')) + with open(subtitle_path, 'wb') as f: + f.write(content) subtitle.storage_path = subtitle_path else: logger.error(u"Something went wrong when getting modified subtitle for %s", subtitle) diff --git a/libs/subliminal_patch/http.py b/libs/subliminal_patch/http.py index 7ed8ef4ef..5018b3b27 100644 --- a/libs/subliminal_patch/http.py +++ b/libs/subliminal_patch/http.py @@ -148,7 +148,7 @@ class CFSession(CloudScraper): cache_key = "cf_data3_%s" % domain if not self.cookies.get("cf_clearance", "", domain=domain): - cf_data = region.get(cache_key) + cf_data = str(region.get(cache_key)) if cf_data is not NO_VALUE: cf_cookies, hdrs = cf_data logger.debug("Trying to use old cf data for %s: %s", domain, cf_data) @@ -165,9 +165,9 @@ class CFSession(CloudScraper): pass else: if cf_data and "cf_clearance" in cf_data[0] and cf_data[0]["cf_clearance"]: - if cf_data != region.get(cache_key): + if cf_data != str(region.get(cache_key)): logger.debug("Storing cf data for %s: %s", domain, cf_data) - region.set(cache_key, cf_data) + region.set(cache_key, bytearray(cf_data, encoding='utf-8')) elif cf_data[0]["cf_clearance"]: 
logger.debug("CF Live tokens not updated") diff --git a/libs/subliminal_patch/pitcher.py b/libs/subliminal_patch/pitcher.py index df3c89007..be1302943 100644 --- a/libs/subliminal_patch/pitcher.py +++ b/libs/subliminal_patch/pitcher.py @@ -257,4 +257,4 @@ def load_verification(site_name, session, callback=lambda x: None): def store_verification(site_name, session): - region.set("%s_data" % site_name, session.cookies._cookies, session.headers["User-Agent"]) + region.set("%s_data" % site_name, (session.cookies._cookies, session.headers["User-Agent"])) diff --git a/libs/subliminal_patch/providers/addic7ed.py b/libs/subliminal_patch/providers/addic7ed.py index 55b4a9d7b..d3cce89f5 100644 --- a/libs/subliminal_patch/providers/addic7ed.py +++ b/libs/subliminal_patch/providers/addic7ed.py @@ -104,11 +104,11 @@ class Addic7edProvider(_Addic7edProvider): tries = 0 while tries < 3: r = self.session.get(self.server_url + 'login.php', timeout=10, headers={"Referer": self.server_url}) - if "grecaptcha" in r.content: + if "grecaptcha" in r.text: logger.info('Addic7ed: Solving captcha. This might take a couple of minutes, but should only ' 'happen once every so often') - site_key = re.search(r'grecaptcha.execute\(\'(.+?)\',', r.content).group(1) + site_key = re.search(r'grecaptcha.execute\(\'(.+?)\',', r.text).group(1) if not site_key: logger.error("Addic7ed: Captcha site-key not found!") return @@ -127,11 +127,11 @@ class Addic7edProvider(_Addic7edProvider): r = self.session.post(self.server_url + 'dologin.php', data, allow_redirects=False, timeout=10, headers={"Referer": self.server_url + "login.php"}) - if "relax, slow down" in r.content: + if "relax, slow down" in r.text: raise TooManyRequests(self.username) if r.status_code != 302: - if "User doesn't exist" in r.content and tries <= 2: + if "User doesn't exist" in r.text and tries <= 2: logger.info("Addic7ed: Error, trying again. 
(%s/%s)", tries+1, 3) tries += 1 continue @@ -208,8 +208,8 @@ class Addic7edProvider(_Addic7edProvider): if show_cells: soup = ParserBeautifulSoup(b''.join(show_cells), ['lxml', 'html.parser']) else: - # If RegEx fails, fall back to original r.content and use 'html.parser' - soup = ParserBeautifulSoup(r.content, ['html.parser']) + # If RegEx fails, fall back to original r.text and use 'html.parser' + soup = ParserBeautifulSoup(r.text, ['html.parser']) # populate the show ids show_ids = {} @@ -265,7 +265,7 @@ class Addic7edProvider(_Addic7edProvider): r = self.session.get(self.server_url + endpoint, params=params, timeout=10, headers=headers) r.raise_for_status() - if r.content and "Sorry, your search" not in r.content: + if r.text and "Sorry, your search" not in r.text: break time.sleep(4) @@ -273,7 +273,7 @@ class Addic7edProvider(_Addic7edProvider): if r.status_code == 304: raise TooManyRequests() - soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser']) + soup = ParserBeautifulSoup(r.text, ['lxml', 'html.parser']) suggestion = None @@ -315,13 +315,13 @@ class Addic7edProvider(_Addic7edProvider): if r.status_code == 304: raise TooManyRequests() - if not r.content: + if not r.text: # Provider wrongful return a status of 304 Not Modified with an empty content # raise_for_status won't raise exception for that status code logger.error('No data returned from provider') return [] - soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser']) + soup = ParserBeautifulSoup(r.text, ['lxml', 'html.parser']) # loop over subtitle rows subtitles = [] @@ -364,7 +364,7 @@ class Addic7edProvider(_Addic7edProvider): if r.status_code == 304: raise TooManyRequests() - if not r.content: + if not r.text: # Provider wrongful return a status of 304 Not Modified with an empty content # raise_for_status won't raise exception for that status code logger.error('Unable to download subtitle. 
No data returned from provider') diff --git a/libs/subliminal_patch/providers/hosszupuska.py b/libs/subliminal_patch/providers/hosszupuska.py index 5f6724146..62cf77bc8 100644 --- a/libs/subliminal_patch/providers/hosszupuska.py +++ b/libs/subliminal_patch/providers/hosszupuska.py @@ -116,7 +116,7 @@ class HosszupuskaSubtitle(Subtitle): if video.format and self.version and video.format.lower() in self.version.lower(): matches.add('format') # other properties - matches |= guess_matches(video, guessit(self.release_info.encode("utf-8"))) + matches |= guess_matches(video, guessit(self.release_info)) return matches diff --git a/libs/subliminal_patch/providers/legendastv.py b/libs/subliminal_patch/providers/legendastv.py index 3fe71ab1d..cab6867d6 100644 --- a/libs/subliminal_patch/providers/legendastv.py +++ b/libs/subliminal_patch/providers/legendastv.py @@ -199,7 +199,7 @@ class LegendasTVProvider(_LegendasTVProvider): # attempt to get the releases from the cache cache_key = releases_key.format(archive_id=a.id, archive_name=a.name) - releases = region.get(cache_key, expiration_time=expiration_time) + releases = str(region.get(cache_key, expiration_time=expiration_time)) # the releases are not in cache or cache is expired if releases == NO_VALUE: @@ -226,7 +226,7 @@ class LegendasTVProvider(_LegendasTVProvider): releases.append(name) # cache the releases - region.set(cache_key, releases) + region.set(cache_key, bytearray(releases, encoding='utf-8')) # iterate over releases for r in releases: diff --git a/libs/subliminal_patch/providers/mixins.py b/libs/subliminal_patch/providers/mixins.py index 6b13bad0e..98299e02c 100644 --- a/libs/subliminal_patch/providers/mixins.py +++ b/libs/subliminal_patch/providers/mixins.py @@ -158,13 +158,5 @@ class ProviderSubtitleArchiveMixin(object): elif subs_fallback: matching_sub = subs_fallback[0] - try: - matching_sub_unicode = matching_sub.decode("utf-8") - except UnicodeDecodeError: - try: - matching_sub_unicode = 
matching_sub.decode("cp437") - except UnicodeDecodeError: - matching_sub_unicode = matching_sub.decode("utf-8", errors='replace') - - logger.info(u"Using %s from the archive", matching_sub_unicode) + logger.info(u"Using %s from the archive", matching_sub) return fix_line_ending(archive.read(matching_sub)) diff --git a/libs/subliminal_patch/providers/subscene.py b/libs/subliminal_patch/providers/subscene.py index 3ee2609d2..fdfca1af8 100644 --- a/libs/subliminal_patch/providers/subscene.py +++ b/libs/subliminal_patch/providers/subscene.py @@ -141,7 +141,7 @@ class SubsceneProvider(Provider, ProviderSubtitleArchiveMixin): logger.info("Creating session") self.session = RetryingCFSession() - prev_cookies = region.get("subscene_cookies2") + prev_cookies = str(region.get("subscene_cookies2")) if prev_cookies != NO_VALUE: logger.debug("Re-using old subscene cookies: %r", prev_cookies) self.session.cookies.update(prev_cookies) @@ -194,7 +194,7 @@ class SubsceneProvider(Provider, ProviderSubtitleArchiveMixin): del cj[cn] logger.debug("Storing cookies: %r", cj) - region.set("subscene_cookies2", cj) + region.set("subscene_cookies2", bytearray(cj, encoding='utf-8')) return raise ProviderError("Something went wrong when trying to log in #1") @@ -219,9 +219,9 @@ class SubsceneProvider(Provider, ProviderSubtitleArchiveMixin): acc_filters["SelectedIds"] = selected_ids self.filters["LanguageFilter"] = ",".join(acc_filters["SelectedIds"]) - last_filters = region.get("subscene_filters") + last_filters = str(region.get("subscene_filters")) if last_filters != acc_filters: - region.set("subscene_filters", acc_filters) + region.set("subscene_filters", bytearray(acc_filters, encoding='utf-8')) logger.debug("Setting account filters to %r", acc_filters) self.session.post("https://u.subscene.com/filter", acc_filters, allow_redirects=False)