"""
This gives other modules access to the gritty details about characters and the
encodings that use them .
"""
import html
import itertools
import re
import unicodedata
# These are the encodings we will try to fix in ftfy, in the
# order that they should be tried.
CHARMAP_ENCODINGS = [
    "latin-1",
    "sloppy-windows-1252",
    "sloppy-windows-1251",
    "sloppy-windows-1250",
    "sloppy-windows-1253",
    "sloppy-windows-1254",
    "iso-8859-2",
    "macroman",
    "cp437",
]
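
# An illustrative aside (not part of the original source): the "sloppy-"
# codecs are provided by ftfy.bad_codecs, and unlike the standard codecs they
# can decode every byte. Assuming ftfy.bad_codecs has been imported to
# register them:
#
#     >>> b"\x9d".decode("sloppy-windows-1252")  # undefined in real cp1252
#     '\x9d'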

SINGLE_QUOTE_RE = re.compile("[\u02bc\u2018-\u201b]")
DOUBLE_QUOTE_RE = re.compile("[\u201c-\u201f]")
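
# Illustrative examples (not part of the original source): these can be used
# to straighten curly quotation marks.
#
#     >>> SINGLE_QUOTE_RE.sub("'", "it\u2019s")
#     "it's"
#     >>> DOUBLE_QUOTE_RE.sub('"', "\u201cquoted\u201d")
#     '"quoted"'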


def _build_regexes():
    """
    ENCODING_REGEXES contain reasonably fast ways to detect if we
    could represent a given string in a given encoding. The simplest one is
    the 'ascii' detector, which of course just determines if all characters
    are between U+0000 and U+007F.
    """
    # Define a regex that matches ASCII text.
    encoding_regexes = {"ascii": re.compile("^[\x00-\x7f]*$")}

    for encoding in CHARMAP_ENCODINGS:
        # Make a sequence of characters that bytes \x80 to \xFF decode to
        # in each encoding, as well as byte \x1A, which is used to represent
        # the replacement character � in the sloppy-* encodings.
        byte_range = bytes(list(range(0x80, 0x100)) + [0x1A])
        charlist = byte_range.decode(encoding)

        # The rest of the ASCII bytes -- bytes \x00 to \x19 and \x1B
        # to \x7F -- will decode as those ASCII characters in any encoding we
        # support, so we can just include them as ranges. This also lets us
        # not worry about escaping regex special characters, because all of
        # them are in the \x1B to \x7F range.
        regex = "^[\x00-\x19\x1b-\x7f{0}]*$".format(charlist)
        encoding_regexes[encoding] = re.compile(regex)

    return encoding_regexes


ENCODING_REGEXES = _build_regexes()


def _build_html_entities():
    entities = {}
    # Create a dictionary based on the built-in HTML5 entity dictionary.
    # Add a limited set of HTML entities that we'll also decode if they've
    # been case-folded to uppercase, such as decoding &NTILDE; as "Ñ".
    for name, char in html.entities.html5.items():
        if name.endswith(";"):
            entities["&" + name] = char

            # Restrict the set of characters we can attempt to decode if their
            # name has been uppercased. If we tried to handle all entity names,
            # the results would be ambiguous.
            if name == name.lower():
                name_upper = name.upper()
                entity_upper = "&" + name_upper
                if html.unescape(entity_upper) == entity_upper:
                    entities[entity_upper] = char.upper()
    return entities


HTML_ENTITY_RE = re.compile(r"&#?[0-9A-Za-z]{1,24};")
HTML_ENTITIES = _build_html_entities()
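
# Illustrative lookups (not part of the original source): the table contains
# both the standard entities and the safe uppercased variants added above.
#
#     >>> HTML_ENTITIES["&eacute;"]
#     'é'
#     >>> HTML_ENTITIES["&EACUTE;"]
#     'É'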


def possible_encoding(text, encoding):
    """
    Given text and a single-byte encoding, check whether that text could have
    been decoded from that single-byte encoding.

    In other words, check whether it can be encoded in that encoding, possibly
    sloppily.
    """
    return bool(ENCODING_REGEXES[encoding].match(text))
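

# Doctest-style examples (illustrative, not part of the original source):
#
#     >>> possible_encoding("plain ASCII text", "ascii")
#     True
#     >>> possible_encoding("vi\u00f1eta", "latin-1")  # 'viñeta'
#     True
#     >>> possible_encoding("\u201csmart quotes\u201d", "latin-1")
#     False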


def _build_control_char_mapping():
    """
    Build a translate mapping that strips likely-unintended control characters.
    See :func:`ftfy.fixes.remove_control_chars` for a description of these
    codepoint ranges and why they should be removed.
    """
    control_chars = {}

    for i in itertools.chain(
        range(0x00, 0x09),
        [0x0B],
        range(0x0E, 0x20),
        [0x7F],
        range(0x206A, 0x2070),
        [0xFEFF],
        range(0xFFF9, 0xFFFD),
    ):
        control_chars[i] = None

    return control_chars


CONTROL_CHARS = _build_control_char_mapping()
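
# Example usage (illustrative, not part of the original source): str.translate
# removes every codepoint that maps to None.
#
#     >>> "hello\x00 world\ufeff".translate(CONTROL_CHARS)
#     'hello world'
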
# Recognize UTF-8 sequences that would be valid if it weren't for a b'\xa0'
# that some Windows-1252 program converted to a plain space.
#
# The smaller values are included on a case-by-case basis, because we don't want
# to decode likely input sequences to unlikely characters. These are the ones
# that *do* form likely characters before 0xa0:
#
# 0xc2 -> U+A0 NO-BREAK SPACE
# 0xc3 -> U+E0 LATIN SMALL LETTER A WITH GRAVE
# 0xc5 -> U+160 LATIN CAPITAL LETTER S WITH CARON
# 0xce -> U+3A0 GREEK CAPITAL LETTER PI
# 0xd0 -> U+420 CYRILLIC CAPITAL LETTER ER
# 0xd9 -> U+660 ARABIC-INDIC DIGIT ZERO
#
# In three-character sequences, we exclude some lead bytes in some cases.
#
# When the lead byte is immediately followed by 0xA0, we shouldn't accept
# a space there, because it leads to some less-likely character ranges:
#
# 0xe0 -> Samaritan script
# 0xe1 -> Mongolian script (corresponds to Latin-1 'á' which is too common)
#
# We accept 0xe2 and 0xe3, which cover many scripts. Bytes 0xe4 and
# higher point mostly to CJK characters, which we generally don't want to
# decode near Latin lowercase letters.
#
# In four-character sequences, the lead byte must be F0, because that accounts
# for almost all of the usage of high-numbered codepoints (tag characters whose
# UTF-8 starts with the byte F3 are only used in some rare new emoji sequences).
#
# This is meant to be applied to encodings of text that tests true for `is_bad`.
# Any of these could represent characters that legitimately appear surrounded by
# spaces, particularly U+C5 (Å), which is a word in multiple languages!
#
# We should consider checking for b'\x85' being converted to ... in the future.
# I've seen it once, but the text still wasn't recoverable.
ALTERED_UTF8_RE = re.compile(
    b"[\xc2\xc3\xc5\xce\xd0\xd9][ ]"
    b"|[\xe2\xe3][ ][\x80-\x84\x86-\x9f\xa1-\xbf]"
    b"|[\xe0-\xe3][\x80-\x84\x86-\x9f\xa1-\xbf][ ]"
    b"|[\xf0][ ][\x80-\xbf][\x80-\xbf]"
    b"|[\xf0][\x80-\xbf][ ][\x80-\xbf]"
    b"|[\xf0][\x80-\xbf][\x80-\xbf][ ]"
)
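
# For example (an illustrative check, not from the original source): the
# UTF-8 encoding of 'à' is b'\xc3\xa0'. If the \xa0 was flattened to a plain
# space, this regex still recognizes the two-byte sequence:
#
#     >>> ALTERED_UTF8_RE.search(b"voil\xc3 ").group(0)
#     b'\xc3 '
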
# This expression matches UTF-8 and CESU-8 sequences where some of the
# continuation bytes have been lost. The byte 0x1a (sometimes written as ^Z) is
# used within ftfy to represent a byte that produced the replacement character
# \ufffd. We don't know which byte it was, but we can at least decode the UTF-8
# sequence as \ufffd instead of failing to re-decode it at all.
#
# In some cases, we allow the ASCII '?' in place of \ufffd, but at most once per
# sequence.
LOSSY_UTF8_RE = re.compile(
    b"[\xc2-\xdf][\x1a]"
    b"|[\xc2-\xc3][?]"
    b"|\xed[\xa0-\xaf][\x1a?]\xed[\xb0-\xbf][\x1a?\x80-\xbf]"
    b"|\xed[\xa0-\xaf][\x1a?\x80-\xbf]\xed[\xb0-\xbf][\x1a?]"
    b"|[\xe0-\xef][\x1a?][\x1a\x80-\xbf]"
    b"|[\xe0-\xef][\x1a\x80-\xbf][\x1a?]"
    b"|[\xf0-\xf4][\x1a?][\x1a\x80-\xbf][\x1a\x80-\xbf]"
    b"|[\xf0-\xf4][\x1a\x80-\xbf][\x1a?][\x1a\x80-\xbf]"
    b"|[\xf0-\xf4][\x1a\x80-\xbf][\x1a\x80-\xbf][\x1a?]"
    b"|\x1a"
)
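
# An illustrative match (not from the original source): a two-byte UTF-8
# sequence whose continuation byte became a replacement character,
# represented within ftfy as \x1a:
#
#     >>> LOSSY_UTF8_RE.match(b"\xc3\x1a").group(0)
#     b'\xc3\x1a'
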
# This regex matches C1 control characters, which occupy some of the positions
# in the Latin-1 character map that Windows assigns to other characters instead.
C1_CONTROL_RE = re.compile(r"[\x80-\x9f]")
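
# For example (illustrative, not in the original source): C1_CONTROL_RE
# matches "\x85", a C1 control character whose byte, in Windows-1252, would
# instead mean '…'.
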
# A translate mapping that breaks ligatures made of Latin letters. While
# ligatures may be important to the representation of other languages, in Latin
# letters they tend to represent a copy/paste error. It omits ligatures such
# as æ that are frequently used intentionally.
#
# This list additionally includes some Latin digraphs that represent two
# characters for legacy encoding reasons, not for typographical reasons.
#
# Ligatures and digraphs may also be separated by NFKC normalization, but that
# is sometimes more normalization than you want.
LIGATURES = {
    ord("Ĳ"): "IJ",  # Dutch ligatures
    ord("ĳ"): "ij",
    ord("ŉ"): "ʼn",  # Afrikaans digraph meant to avoid auto-curled quote
    ord("Ǳ"): "DZ",  # Serbian/Croatian digraphs for Cyrillic conversion
    ord("ǲ"): "Dz",
    ord("ǳ"): "dz",
    ord("Ǆ"): "DŽ",
    ord("ǅ"): "Dž",
    ord("ǆ"): "dž",
    ord("Ǉ"): "LJ",
    ord("ǈ"): "Lj",
    ord("ǉ"): "lj",
    ord("Ǌ"): "NJ",
    ord("ǋ"): "Nj",
    ord("ǌ"): "nj",
    ord("ﬀ"): "ff",  # Latin typographical ligatures
    ord("ﬁ"): "fi",
    ord("ﬂ"): "fl",
    ord("ﬃ"): "ffi",
    ord("ﬄ"): "ffl",
    ord("ﬅ"): "ſt",
    ord("ﬆ"): "st",
}
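
# Example usage (illustrative, not part of the original source):
#
#     >>> "di\ufb03cult \ufb02ight".translate(LIGATURES)
#     'difficult flight'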


def _build_width_map():
    """
    Build a translate mapping that replaces halfwidth and fullwidth forms
    with their standard-width forms.
    """
    # Though it's not listed as a fullwidth character, we'll want to convert
    # U+3000 IDEOGRAPHIC SPACE to U+20 SPACE on the same principle, so start
    # with that in the dictionary.
    width_map = {0x3000: " "}
    for i in range(0xFF01, 0xFFF0):
        char = chr(i)
        alternate = unicodedata.normalize("NFKC", char)
        if alternate != char:
            width_map[i] = alternate
    return width_map


WIDTH_MAP = _build_width_map()
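
# Example usage (illustrative, not part of the original source): fullwidth
# forms and the ideographic space become their standard-width counterparts.
#
#     >>> "\uff28\uff45\uff4c\uff4c\uff4f\u3000\uff01".translate(WIDTH_MAP)
#     'Hello !'
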
# Character classes that help us pinpoint embedded mojibake. These can
# include common characters, because we'll also check them for 'badness'.
UTF8_CLUES = {
    # Letters that decode to 0xC2 - 0xDF in a Latin-1-like encoding
    "utf8_first_of_2": (
        "ÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßĂĆČĎĐĘĚĞİĹŃŇŐŘŞŢŮŰ"
        "ΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩΪΫάέήίВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ"
    ),
    # Letters that decode to 0xE0 - 0xEF in a Latin-1-like encoding
    "utf8_first_of_3": ("àáâãäåæçèéêëìíîïăćčďęěĺŕΰαβγδεζηθικλμνξοабвгдежзийклмноп"),
    # Letters that decode to 0xF0 or 0xF3 in a Latin-1-like encoding.
    # (Other leading bytes correspond only to unassigned codepoints)
    "utf8_first_of_4": ("ðóđğπσру"),
    # Letters that decode to 0x80 - 0xBF in a Latin-1-like encoding,
    # including a space standing in for 0xA0
    "utf8_continuation": (
        "\x80-\xbf"
        "ĄąĽľŁłŒœŚśŞşŠšŤťŸŹźŻżŽžƒˆˇ˘˛˜˝΄΅"
        "ΆΈΉΊΌΎΏЁЂЃЄЅІЇЈЉЊЋЌЎЏёђѓєѕіїјљњћќўџҐґ"
        "–—―‘’‚“”„†‡•…‰‹›€№™"
        " "
    ),
    # Letters that decode to 0x80 - 0xBF in a Latin-1-like encoding,
    # and don't usually stand for themselves when adjacent to mojibake.
    # This excludes spaces, dashes, quotation marks, and ellipses.
    "utf8_continuation_strict": (
        "\x80-\xbf"
        "ĄąĽľŁłŒœŚśŞşŠšŤťŸŹźŻżŽžƒˆˇ˘˛˜˝΄΅"
        "ΆΈΉΊΌΎΏЁЂЃЄЅІЇЈЉЊЋЌЎЏёђѓєѕіїјљњћќўџҐґ"
        "†‡•‰‹›€№™"
    ),
}

# This regex uses UTF8_CLUES to find sequences of likely mojibake.
# It matches them with + so that several adjacent UTF-8-looking sequences
# get coalesced into one, allowing them to be fixed more efficiently
# and not requiring every individual subsequence to be detected as 'badness'.
#
# We accept spaces in place of "utf8_continuation", because spaces might have
# been intended to be U+A0 NO-BREAK SPACE.
#
# We do a lookbehind to make sure the previous character isn't a
# "utf8_continuation_strict" character, so that we don't fix just a few
# characters in a huge garble and make the situation worse.
#
# Unfortunately, the matches to this regular expression won't show their
# surrounding context, and including context would make the expression much
# less efficient. The 'badness' rules that require context, such as a preceding
# lowercase letter, will prevent some cases of inconsistent UTF-8 from being
# fixed when they don't see it.
UTF8_DETECTOR_RE = re.compile(
    """
    (?<! [{utf8_continuation_strict}])
    (
        [{utf8_first_of_2}] [{utf8_continuation}]
        |
        [{utf8_first_of_3}] [{utf8_continuation}]{{2}}
        |
        [{utf8_first_of_4}] [{utf8_continuation}]{{3}}
    )+
    """.format(**UTF8_CLUES),
    re.VERBOSE,
)
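
# A small illustration (added here; not part of the original source): "Ã©" is
# what the UTF-8 encoding of "é" looks like after being mis-decoded as
# Latin-1, and the detector picks it out of surrounding ASCII:
#
#     >>> UTF8_DETECTOR_RE.search("caf\u00c3\u00a9").group(0)
#     'Ã©'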