@ -7,7 +7,8 @@ from zipfile import ZipFile, is_zipfile
from rarfile import RarFile , is_rarfile
from requests import Session
from ftfy import fix_text
import chardet
from bs4 import UnicodeDammit
from subzero . language import Language
from subliminal_patch . providers import Provider
@ -33,7 +34,6 @@ class SubtitriIdSubtitle(Subtitle):
self . year = year
self . imdb_id = imdb_id
self . matches = None
# self.encoding = 'utf-16'
@property
def id ( self ) :
@ -55,6 +55,39 @@ class SubtitriIdSubtitle(Subtitle):
self . matches = matches
return matches
def guess_encoding ( self ) :
# override default subtitle guess_encoding method to not include language-specific encodings guessing
# chardet encoding detection seem to yield better results
""" Guess encoding using chardet.
: return : the guessed encoding .
: rtype : str
"""
if self . _guessed_encoding :
return self . _guessed_encoding
logger . info ( ' Guessing encoding for language %s ' , self . language )
# guess/detect encoding using chardet
encoding = chardet . detect ( self . content ) [ ' encoding ' ]
logger . info ( ' Chardet found encoding %s ' , encoding )
if not encoding :
# fallback on bs4
logger . info ( ' Falling back to bs4 detection ' )
a = UnicodeDammit ( self . content )
logger . info ( " bs4 detected encoding: %s " , a . original_encoding )
if a . original_encoding :
self . _guessed_encoding = a . original_encoding
return a . original_encoding
raise ValueError ( u " Couldn ' t guess the proper encoding for %s " , self )
self . _guessed_encoding = encoding
return encoding
class SubtitriIdProvider ( Provider , ProviderSubtitleArchiveMixin ) :
""" subtitri.id.lv Provider. """
@ -155,7 +188,4 @@ class SubtitriIdProvider(Provider, ProviderSubtitleArchiveMixin):
raise ProviderError ( ' Unidentified archive type ' )
subtitle_content = self . get_subtitle_from_archive ( subtitle , archive )
# fix content encoding (utf-16 encoded by default)
fixed_subtitle_content = fix_text ( subtitle_content . decode ( ' utf-16 ' ) , { ' uncurl_quotes ' : False , ' fix_character_width ' : False } ) . encode ( encoding = ' utf-8 ' )
subtitle . content = fixed_subtitle_content
subtitle . content = self . get_subtitle_from_archive ( subtitle , archive )