Switched to UnicodeDammit instead of chardet, which gives me better results #37

7 years ago · 98de479389
parent 47261c0c91
commit 98de479389
1 changed file with 3 additions and 3 deletions
--- a/list_subtitles.py
+++ b/list_subtitles.py
@@ -6,7 +6,7 @@ import pycountry
 import sqlite3
 import ast
 import langdetect
-import chardet
+from bs4 import UnicodeDammit
 from itertools import islice
 from get_general_settings import *
@@ -38,9 +38,9 @@ def store_subtitles(file):
                with open(path_replace(os.path.join(os.path.dirname(file), subtitle)), 'r') as f:
                    text = list(islice(f, 20))
                    text = ' '.join(text)
-                    encoding = chardet.detect(text)['encoding']
+                    encoding = UnicodeDammit(text)
                    try:
-                        text = text.decode(encoding)
+                        text = text.decode(encoding.original_encoding)
                    except Exception as e:
                        logging.exception('Error trying to detect character encoding for this subtitles file: ' + path_replace(os.path.join(os.path.dirname(file), subtitle)))
                    else: