from functools import lru_cache
from logging import getLogger
from typing import List , Optional
from . constant import (
COMMON_SAFE_ASCII_CHARACTERS ,
TRACE ,
UNICODE_SECONDARY_RANGE_KEYWORD ,
)
from . utils import (
is_accentuated ,
is_arabic ,
is_arabic_isolated_form ,
is_case_variable ,
is_cjk ,
is_emoticon ,
is_hangul ,
is_hiragana ,
is_katakana ,
is_latin ,
is_punctuation ,
is_separator ,
is_symbol ,
is_thai ,
is_unprintable ,
remove_accent ,
unicode_range ,
)
class MessDetectorPlugin :
"""
Base abstract class used for mess detection plugins .
All detectors MUST extend and implement given methods .
"""
def eligible ( self , character : str ) - > bool :
"""
Determine if given character should be fed in .
"""
raise NotImplementedError # pragma: nocover
def feed ( self , character : str ) - > None :
"""
The main routine to be executed upon character .
Insert the logic in witch the text would be considered chaotic .
"""
raise NotImplementedError # pragma: nocover
def reset ( self ) - > None : # pragma: no cover
"""
Permit to reset the plugin to the initial state .
"""
raise NotImplementedError
@property
def ratio ( self ) - > float :
"""
Compute the chaos ratio based on what your feed ( ) has seen .
Must NOT be lower than 0. ; No restriction gt 0.
"""
raise NotImplementedError # pragma: nocover
class TooManySymbolOrPunctuationPlugin ( MessDetectorPlugin ) :
def __init__ ( self ) - > None :
self . _punctuation_count : int = 0
self . _symbol_count : int = 0
self . _character_count : int = 0
self . _last_printable_char : Optional [ str ] = None
self . _frenzy_symbol_in_word : bool = False
def eligible ( self , character : str ) - > bool :
return character . isprintable ( )
def feed ( self , character : str ) - > None :
self . _character_count + = 1
if (
character != self . _last_printable_char
and character not in COMMON_SAFE_ASCII_CHARACTERS
) :
if is_punctuation ( character ) :
self . _punctuation_count + = 1
elif (
character . isdigit ( ) is False
and is_symbol ( character )
and is_emoticon ( character ) is False
) :
self . _symbol_count + = 2
self . _last_printable_char = character
def reset ( self ) - > None : # pragma: no cover
self . _punctuation_count = 0
self . _character_count = 0
self . _symbol_count = 0
@property
def ratio ( self ) - > float :
if self . _character_count == 0 :
return 0.0
ratio_of_punctuation : float = (
self . _punctuation_count + self . _symbol_count
) / self . _character_count
return ratio_of_punctuation if ratio_of_punctuation > = 0.3 else 0.0
class TooManyAccentuatedPlugin ( MessDetectorPlugin ) :
def __init__ ( self ) - > None :
self . _character_count : int = 0
self . _accentuated_count : int = 0
def eligible ( self , character : str ) - > bool :
return character . isalpha ( )
def feed ( self , character : str ) - > None :
self . _character_count + = 1
if is_accentuated ( character ) :
self . _accentuated_count + = 1
def reset ( self ) - > None : # pragma: no cover
self . _character_count = 0
self . _accentuated_count = 0
@property
def ratio ( self ) - > float :
if self . _character_count < 8 :
return 0.0
ratio_of_accentuation : float = self . _accentuated_count / self . _character_count
return ratio_of_accentuation if ratio_of_accentuation > = 0.35 else 0.0
class UnprintablePlugin ( MessDetectorPlugin ) :
def __init__ ( self ) - > None :
self . _unprintable_count : int = 0
self . _character_count : int = 0
def eligible ( self , character : str ) - > bool :
return True
def feed ( self , character : str ) - > None :
if is_unprintable ( character ) :
self . _unprintable_count + = 1
self . _character_count + = 1
def reset ( self ) - > None : # pragma: no cover
self . _unprintable_count = 0
@property
def ratio ( self ) - > float :
if self . _character_count == 0 :
return 0.0
return ( self . _unprintable_count * 8 ) / self . _character_count
class SuspiciousDuplicateAccentPlugin ( MessDetectorPlugin ) :
def __init__ ( self ) - > None :
self . _successive_count : int = 0
self . _character_count : int = 0
self . _last_latin_character : Optional [ str ] = None
def eligible ( self , character : str ) - > bool :
return character . isalpha ( ) and is_latin ( character )
def feed ( self , character : str ) - > None :
self . _character_count + = 1
if (
self . _last_latin_character is not None
and is_accentuated ( character )
and is_accentuated ( self . _last_latin_character )
) :
if character . isupper ( ) and self . _last_latin_character . isupper ( ) :
self . _successive_count + = 1
# Worse if its the same char duplicated with different accent.
if remove_accent ( character ) == remove_accent ( self . _last_latin_character ) :
self . _successive_count + = 1
self . _last_latin_character = character
def reset ( self ) - > None : # pragma: no cover
self . _successive_count = 0
self . _character_count = 0
self . _last_latin_character = None
@property
def ratio ( self ) - > float :
if self . _character_count == 0 :
return 0.0
return ( self . _successive_count * 2 ) / self . _character_count
class SuspiciousRange ( MessDetectorPlugin ) :
def __init__ ( self ) - > None :
self . _suspicious_successive_range_count : int = 0
self . _character_count : int = 0
self . _last_printable_seen : Optional [ str ] = None
def eligible ( self , character : str ) - > bool :
return character . isprintable ( )
def feed ( self , character : str ) - > None :
self . _character_count + = 1
if (
character . isspace ( )
or is_punctuation ( character )
or character in COMMON_SAFE_ASCII_CHARACTERS
) :
self . _last_printable_seen = None
return
if self . _last_printable_seen is None :
self . _last_printable_seen = character
return
unicode_range_a : Optional [ str ] = unicode_range ( self . _last_printable_seen )
unicode_range_b : Optional [ str ] = unicode_range ( character )
if is_suspiciously_successive_range ( unicode_range_a , unicode_range_b ) :
self . _suspicious_successive_range_count + = 1
self . _last_printable_seen = character
def reset ( self ) - > None : # pragma: no cover
self . _character_count = 0
self . _suspicious_successive_range_count = 0
self . _last_printable_seen = None
@property
def ratio ( self ) - > float :
if self . _character_count < = 24 :
return 0.0
ratio_of_suspicious_range_usage : float = (
self . _suspicious_successive_range_count * 2
) / self . _character_count
return ratio_of_suspicious_range_usage
class SuperWeirdWordPlugin ( MessDetectorPlugin ) :
def __init__ ( self ) - > None :
self . _word_count : int = 0
self . _bad_word_count : int = 0
self . _foreign_long_count : int = 0
self . _is_current_word_bad : bool = False
self . _foreign_long_watch : bool = False
self . _character_count : int = 0
self . _bad_character_count : int = 0
self . _buffer : str = " "
self . _buffer_accent_count : int = 0
def eligible ( self , character : str ) - > bool :
return True
def feed ( self , character : str ) - > None :
if character . isalpha ( ) :
self . _buffer + = character
if is_accentuated ( character ) :
self . _buffer_accent_count + = 1
if (
self . _foreign_long_watch is False
and ( is_latin ( character ) is False or is_accentuated ( character ) )
and is_cjk ( character ) is False
and is_hangul ( character ) is False
and is_katakana ( character ) is False
and is_hiragana ( character ) is False
and is_thai ( character ) is False
) :
self . _foreign_long_watch = True
return
if not self . _buffer :
return
if (
character . isspace ( ) or is_punctuation ( character ) or is_separator ( character )
) and self . _buffer :
self . _word_count + = 1
buffer_length : int = len ( self . _buffer )
self . _character_count + = buffer_length
if buffer_length > = 4 :
if self . _buffer_accent_count / buffer_length > 0.34 :
self . _is_current_word_bad = True
# Word/Buffer ending with an upper case accentuated letter are so rare,
# that we will consider them all as suspicious. Same weight as foreign_long suspicious.
if (
is_accentuated ( self . _buffer [ - 1 ] )
and self . _buffer [ - 1 ] . isupper ( )
and all ( _ . isupper ( ) for _ in self . _buffer ) is False
) :
self . _foreign_long_count + = 1
self . _is_current_word_bad = True
if buffer_length > = 24 and self . _foreign_long_watch :
camel_case_dst = [
i
for c , i in zip ( self . _buffer , range ( 0 , buffer_length ) )
if c . isupper ( )
]
probable_camel_cased : bool = False
if camel_case_dst and ( len ( camel_case_dst ) / buffer_length < = 0.3 ) :
probable_camel_cased = True
if not probable_camel_cased :
self . _foreign_long_count + = 1
self . _is_current_word_bad = True
if self . _is_current_word_bad :
self . _bad_word_count + = 1
self . _bad_character_count + = len ( self . _buffer )
self . _is_current_word_bad = False
self . _foreign_long_watch = False
self . _buffer = " "
self . _buffer_accent_count = 0
elif (
character not in { " < " , " > " , " - " , " = " , " ~ " , " | " , " _ " }
and character . isdigit ( ) is False
and is_symbol ( character )
) :
self . _is_current_word_bad = True
self . _buffer + = character
def reset ( self ) - > None : # pragma: no cover
self . _buffer = " "
self . _is_current_word_bad = False
self . _foreign_long_watch = False
self . _bad_word_count = 0
self . _word_count = 0
self . _character_count = 0
self . _bad_character_count = 0
self . _foreign_long_count = 0
@property
def ratio ( self ) - > float :
if self . _word_count < = 10 and self . _foreign_long_count == 0 :
return 0.0
return self . _bad_character_count / self . _character_count
class CjkInvalidStopPlugin ( MessDetectorPlugin ) :
"""
GB ( Chinese ) based encoding often render the stop incorrectly when the content does not fit and
can be easily detected . Searching for the overuse of ' 丅 ' and ' 丄 ' .
"""
def __init__ ( self ) - > None :
self . _wrong_stop_count : int = 0
self . _cjk_character_count : int = 0
def eligible ( self , character : str ) - > bool :
return True
def feed ( self , character : str ) - > None :
if character in { " 丅 " , " 丄 " } :
self . _wrong_stop_count + = 1
return
if is_cjk ( character ) :
self . _cjk_character_count + = 1
def reset ( self ) - > None : # pragma: no cover
self . _wrong_stop_count = 0
self . _cjk_character_count = 0
@property
def ratio ( self ) - > float :
if self . _cjk_character_count < 16 :
return 0.0
return self . _wrong_stop_count / self . _cjk_character_count
class ArchaicUpperLowerPlugin ( MessDetectorPlugin ) :
def __init__ ( self ) - > None :
self . _buf : bool = False
self . _character_count_since_last_sep : int = 0
self . _successive_upper_lower_count : int = 0
self . _successive_upper_lower_count_final : int = 0
self . _character_count : int = 0
self . _last_alpha_seen : Optional [ str ] = None
self . _current_ascii_only : bool = True
def eligible ( self , character : str ) - > bool :
return True
def feed ( self , character : str ) - > None :
is_concerned = character . isalpha ( ) and is_case_variable ( character )
chunk_sep = is_concerned is False
if chunk_sep and self . _character_count_since_last_sep > 0 :
if (
self . _character_count_since_last_sep < = 64
and character . isdigit ( ) is False
and self . _current_ascii_only is False
) :
self . _successive_upper_lower_count_final + = (
self . _successive_upper_lower_count
)
self . _successive_upper_lower_count = 0
self . _character_count_since_last_sep = 0
self . _last_alpha_seen = None
self . _buf = False
self . _character_count + = 1
self . _current_ascii_only = True
return
if self . _current_ascii_only is True and character . isascii ( ) is False :
self . _current_ascii_only = False
if self . _last_alpha_seen is not None :
if ( character . isupper ( ) and self . _last_alpha_seen . islower ( ) ) or (
character . islower ( ) and self . _last_alpha_seen . isupper ( )
) :
if self . _buf is True :
self . _successive_upper_lower_count + = 2
self . _buf = False
else :
self . _buf = True
else :
self . _buf = False
self . _character_count + = 1
self . _character_count_since_last_sep + = 1
self . _last_alpha_seen = character
def reset ( self ) - > None : # pragma: no cover
self . _character_count = 0
self . _character_count_since_last_sep = 0
self . _successive_upper_lower_count = 0
self . _successive_upper_lower_count_final = 0
self . _last_alpha_seen = None
self . _buf = False
self . _current_ascii_only = True
@property
def ratio ( self ) - > float :
if self . _character_count == 0 :
return 0.0
return self . _successive_upper_lower_count_final / self . _character_count
class ArabicIsolatedFormPlugin ( MessDetectorPlugin ) :
def __init__ ( self ) - > None :
self . _character_count : int = 0
self . _isolated_form_count : int = 0
def reset ( self ) - > None : # pragma: no cover
self . _character_count = 0
self . _isolated_form_count = 0
def eligible ( self , character : str ) - > bool :
return is_arabic ( character )
def feed ( self , character : str ) - > None :
self . _character_count + = 1
if is_arabic_isolated_form ( character ) :
self . _isolated_form_count + = 1
@property
def ratio ( self ) - > float :
if self . _character_count < 8 :
return 0.0
isolated_form_usage : float = self . _isolated_form_count / self . _character_count
return isolated_form_usage
@lru_cache ( maxsize = 1024 )
def is_suspiciously_successive_range (
unicode_range_a : Optional [ str ] , unicode_range_b : Optional [ str ]
) - > bool :
"""
Determine if two Unicode range seen next to each other can be considered as suspicious .
"""
if unicode_range_a is None or unicode_range_b is None :
return True
if unicode_range_a == unicode_range_b :
return False
if " Latin " in unicode_range_a and " Latin " in unicode_range_b :
return False
if " Emoticons " in unicode_range_a or " Emoticons " in unicode_range_b :
return False
# Latin characters can be accompanied with a combining diacritical mark
# eg. Vietnamese.
if ( " Latin " in unicode_range_a or " Latin " in unicode_range_b ) and (
" Combining " in unicode_range_a or " Combining " in unicode_range_b
) :
return False
keywords_range_a , keywords_range_b = unicode_range_a . split (
" "
) , unicode_range_b . split ( " " )
for el in keywords_range_a :
if el in UNICODE_SECONDARY_RANGE_KEYWORD :
continue
if el in keywords_range_b :
return False
# Japanese Exception
range_a_jp_chars , range_b_jp_chars = (
unicode_range_a
in (
" Hiragana " ,
" Katakana " ,
) ,
unicode_range_b in ( " Hiragana " , " Katakana " ) ,
)
if ( range_a_jp_chars or range_b_jp_chars ) and (
" CJK " in unicode_range_a or " CJK " in unicode_range_b
) :
return False
if range_a_jp_chars and range_b_jp_chars :
return False
if " Hangul " in unicode_range_a or " Hangul " in unicode_range_b :
if " CJK " in unicode_range_a or " CJK " in unicode_range_b :
return False
if unicode_range_a == " Basic Latin " or unicode_range_b == " Basic Latin " :
return False
# Chinese/Japanese use dedicated range for punctuation and/or separators.
if ( " CJK " in unicode_range_a or " CJK " in unicode_range_b ) or (
unicode_range_a in [ " Katakana " , " Hiragana " ]
and unicode_range_b in [ " Katakana " , " Hiragana " ]
) :
if " Punctuation " in unicode_range_a or " Punctuation " in unicode_range_b :
return False
if " Forms " in unicode_range_a or " Forms " in unicode_range_b :
return False
if unicode_range_a == " Basic Latin " or unicode_range_b == " Basic Latin " :
return False
return True
@lru_cache ( maxsize = 2048 )
def mess_ratio (
decoded_sequence : str , maximum_threshold : float = 0.2 , debug : bool = False
) - > float :
"""
Compute a mess ratio given a decoded bytes sequence . The maximum threshold does stop the computation earlier .
"""
detectors : List [ MessDetectorPlugin ] = [
md_class ( ) for md_class in MessDetectorPlugin . __subclasses__ ( )
]
length : int = len ( decoded_sequence ) + 1
mean_mess_ratio : float = 0.0
if length < 512 :
intermediary_mean_mess_ratio_calc : int = 32
elif length < = 1024 :
intermediary_mean_mess_ratio_calc = 64
else :
intermediary_mean_mess_ratio_calc = 128
for character , index in zip ( decoded_sequence + " \n " , range ( length ) ) :
for detector in detectors :
if detector . eligible ( character ) :
detector . feed ( character )
if (
index > 0 and index % intermediary_mean_mess_ratio_calc == 0
) or index == length - 1 :
mean_mess_ratio = sum ( dt . ratio for dt in detectors )
if mean_mess_ratio > = maximum_threshold :
break
if debug :
logger = getLogger ( " charset_normalizer " )
logger . log (
TRACE ,
" Mess-detector extended-analysis start. "
f " intermediary_mean_mess_ratio_calc= { intermediary_mean_mess_ratio_calc } mean_mess_ratio= { mean_mess_ratio } "
f " maximum_threshold= { maximum_threshold } " ,
)
if len ( decoded_sequence ) > 16 :
logger . log ( TRACE , f " Starting with: { decoded_sequence [ : 16 ] } " )
logger . log ( TRACE , f " Ending with: { decoded_sequence [ - 16 : : ] } " )
for dt in detectors : # pragma: nocover
logger . log ( TRACE , f " { dt . __class__ } : { dt . ratio } " )
return round ( mean_mess_ratio , 3 )