diff --git a/libs/subliminal_patch/subtitle.py b/libs/subliminal_patch/subtitle.py
index 1b3ce002a..8116697bf 100644
--- a/libs/subliminal_patch/subtitle.py
+++ b/libs/subliminal_patch/subtitle.py
@@ -20,6 +20,17 @@ from subliminal import Subtitle as Subtitle_
 from subliminal.subtitle import Episode, Movie, sanitize_release_group, get_equivalent_release_groups
 from subliminal_patch.utils import sanitize
 from ftfy import fix_text
+from codecs import BOM_UTF8, BOM_UTF16_BE, BOM_UTF16_LE, BOM_UTF32_BE, BOM_UTF32_LE
+
+# (BOM, codec name) pairs in priority order. The UTF-32 entries come before UTF-16
+# because BOM_UTF32_LE (FF FE 00 00) begins with BOM_UTF16_LE (FF FE).
+BOMS = (
+    (BOM_UTF8, "UTF-8"),
+    (BOM_UTF32_BE, "UTF-32-BE"),
+    (BOM_UTF32_LE, "UTF-32-LE"),
+    (BOM_UTF16_BE, "UTF-16-BE"),
+    (BOM_UTF16_LE, "UTF-16-LE"),
+)
 
 logger = logging.getLogger(__name__)
 
@@ -106,6 +117,10 @@ class Subtitle(Subtitle_):
         # normalize line endings
         self.content = self.content.replace(b"\r\n", b"\n").replace(b'\r', b'\n')
 
+    def _check_bom(self, data):
+        """Return the codec names from BOMS whose BOM prefixes data, in BOMS order."""
+        return [encoding for bom, encoding in BOMS if data.startswith(bom)]
+
     def guess_encoding(self):
         """Guess encoding using the language, falling back on chardet.
 
@@ -120,6 +135,11 @@ class Subtitle(Subtitle_):
 
         encodings = ['utf-8']
 
+        # check UTF BOMs; keep BOMS priority order (a set() would scramble it)
+        bom_encodings = [enc.lower() for enc in self._check_bom(self.content)]
+        if bom_encodings:
+            encodings = bom_encodings + [enc for enc in encodings if enc not in bom_encodings]
+
         # add language-specific encodings
         # http://scratchpad.wikia.com/wiki/Character_Encoding_Recommendation_for_Languages
 