Switched to UnicodeDammit instead of chardet, which gives me better results #37

pull/56/merge
morpheus65535 7 years ago
parent 47261c0c91
commit 98de479389

@ -6,7 +6,7 @@ import pycountry
import sqlite3 import sqlite3
import ast import ast
import langdetect import langdetect
import chardet from bs4 import UnicodeDammit
from itertools import islice from itertools import islice
from get_general_settings import * from get_general_settings import *
@ -38,9 +38,9 @@ def store_subtitles(file):
with open(path_replace(os.path.join(os.path.dirname(file), subtitle)), 'r') as f: with open(path_replace(os.path.join(os.path.dirname(file), subtitle)), 'r') as f:
text = list(islice(f, 20)) text = list(islice(f, 20))
text = ' '.join(text) text = ' '.join(text)
encoding = chardet.detect(text)['encoding'] encoding = UnicodeDammit(text)
try: try:
text = text.decode(encoding) text = text.decode(encoding.original_encoding)
except Exception as e: except Exception as e:
logging.exception('Error trying to detect character encoding for this subtitles file: ' + path_replace(os.path.join(os.path.dirname(file), subtitle))) logging.exception('Error trying to detect character encoding for this subtitles file: ' + path_replace(os.path.join(os.path.dirname(file), subtitle)))
else: else:

Loading…
Cancel
Save