"""
The ` ftfy . fixes ` module contains the individual fixes that : func : ` ftfy . fix_text `
can perform , and provides the functions that are named in " explanations "
such as the output of : func : ` ftfy . fix_and_explain ` .
Two of these functions are particularly useful on their own , as more robust
versions of functions in the Python standard library :
- : func : ` ftfy . fixes . decode_escapes `
- : func : ` ftfy . fixes . unescape_html `
"""
import codecs
import html
import re
import warnings
import ftfy
from ftfy.chardata import (
    ALTERED_UTF8_RE,
    C1_CONTROL_RE,
    CONTROL_CHARS,
    DOUBLE_QUOTE_RE,
    HTML_ENTITIES,
    HTML_ENTITY_RE,
    LIGATURES,
    LOSSY_UTF8_RE,
    SINGLE_QUOTE_RE,
    UTF8_DETECTOR_RE,
    WIDTH_MAP,
)
from ftfy.badness import is_bad

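# For example, the two standalone helpers behave roughly like this (see their
# doctests below for the verified behavior):
#
#     unescape_html("P&eacute;rez")   # -> 'Pérez'
#     decode_escapes(r"caf\u00e9")    # -> 'café'
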
def fix_encoding_and_explain(text):
    """
    Deprecated copy of `ftfy.fix_encoding_and_explain()`.
    """
    warnings.warn(
        "`fix_encoding_and_explain()` has moved to the main module of ftfy.",
        DeprecationWarning,
    )
    return ftfy.fix_encoding_and_explain(text)


def fix_encoding(text):
    """
    Deprecated copy of `ftfy.fix_encoding()`.
    """
    warnings.warn(
        "`fix_encoding()` has moved to the main module of ftfy.", DeprecationWarning
    )
    return ftfy.fix_encoding(text)


def apply_plan(text, plan):
    """
    Deprecated copy of `ftfy.apply_plan()`.
    """
    warnings.warn(
        "`apply_plan()` has moved to the main module of ftfy.", DeprecationWarning
    )
    return ftfy.apply_plan(text, plan)


def _unescape_fixup(match):
    """
    Replace one matched HTML entity with the character it represents,
    if possible.
    """
    text = match.group(0)
    if text in HTML_ENTITIES:
        return HTML_ENTITIES[text]
    elif text.startswith("&#"):
        unescaped = html.unescape(text)

        # If html.unescape only decoded part of the string, that's not what
        # we want. The semicolon should be consumed.
        if ";" in unescaped:
            return text
        else:
            return unescaped
    else:
        return text

def unescape_html(text):
    """
    Decode HTML entities and character references, including some nonstandard
    ones written in all-caps.

    Python has a built-in called `html.unescape` that can decode HTML escapes,
    including a bunch of messy edge cases such as decoding escapes without
    semicolons such as "&amp".

    If you know you've got HTML-escaped text, applying `html.unescape` is the
    right way to convert it to plain text. But in ambiguous situations, that
    would create false positives. For example, the informally written text
    "this&not that" should not automatically be decoded as "this¬ that".

    In this function, we decode the escape sequences that appear in the
    `html.entities.html5` dictionary, as long as they are the unambiguous ones
    that end in semicolons.

    We also decode all-caps versions of Latin letters and common symbols.
    If a database contains the name 'P&EACUTE;REZ', we can read that and intuit
    that it was supposed to say 'PÉREZ'. This is limited to a smaller set of
    entities, because there are many instances where entity names are
    case-sensitive in complicated ways.

    >>> unescape_html('&lt;tag&gt;')
    '<tag>'

    >>> unescape_html('&Jscr;ohn &HilbertSpace;ancock')
    '𝒥ohn ℋancock'

    >>> unescape_html('&checkmark;')
    '✓'

    >>> unescape_html('P&eacute;rez')
    'Pérez'

    >>> unescape_html('P&EACUTE;REZ')
    'PÉREZ'

    >>> unescape_html('BUNDESSTRA&SZLIG;E')
    'BUNDESSTRASSE'

    >>> unescape_html('&ntilde; &Ntilde; &NTILDE; &nTILDE;')
    'ñ Ñ Ñ &nTILDE;'
    """
    return HTML_ENTITY_RE.sub(_unescape_fixup, text)

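# This matches ANSI/VT100 "CSI" escape sequences: the ESC character, a literal
# "[", a run of digits and semicolons (the numeric parameters, group 1), and a
# final letter naming the command (group 2), such as "m" for color changes.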
ANSI_RE = re.compile("\033\\[((?:\\d|;)*)([a-zA-Z])")


def remove_terminal_escapes(text):
    r"""
    Strip out "ANSI" terminal escape sequences, such as those that produce
    colored text on Unix.

    >>> print(remove_terminal_escapes(
    ...     "\033[36;44mI'm blue, da ba dee da ba doo...\033[0m"
    ... ))
    I'm blue, da ba dee da ba doo...
    """
    return ANSI_RE.sub("", text)

def uncurl_quotes(text):
    r"""
    Replace curly quotation marks with straight equivalents.

    >>> print(uncurl_quotes('\u201chere\u2019s a test\u201d'))
    "here's a test"
    """
    return SINGLE_QUOTE_RE.sub("'", DOUBLE_QUOTE_RE.sub('"', text))

def fix_latin_ligatures(text):
    """
    Replace single-character ligatures of Latin letters, such as 'ﬁ', with the
    characters that they contain, as in 'fi'. Latin ligatures are usually not
    intended in text strings (though they're lovely in *rendered* text). If
    you have such a ligature in your string, it is probably a result of a
    copy-and-paste glitch.

    We leave ligatures in other scripts alone to be safe. They may be intended,
    and removing them may lose information. If you want to take apart nearly
    all ligatures, use NFKC normalization.

    >>> print(fix_latin_ligatures("ﬂuﬃest"))
    fluffiest
    """
    return text.translate(LIGATURES)

def fix_character_width(text):
    """
    The ASCII characters, katakana, and Hangul characters have alternate
    "halfwidth" or "fullwidth" forms that help text line up in a grid.

    If you don't need these width properties, you probably want to replace
    these characters with their standard form, which is what this function
    does.

    Note that this replaces the ideographic space, U+3000, with the ASCII
    space, U+20.

    >>> print(fix_character_width("ＬＯＵＤ　ＮＯＩＳＥＳ"))
    LOUD NOISES
    >>> print(fix_character_width("Ｕターン"))   # this means "U-turn"
    Uターン
    """
    return text.translate(WIDTH_MAP)

def fix_line_breaks(text):
    r"""
    Convert all line breaks to Unix style.

    This will convert the following sequences into the standard \n
    line break:

    - CRLF (\r\n), used on Windows and in some communication protocols
    - CR (\r), once used on Mac OS Classic, and now kept alive by misguided
      software such as Microsoft Office for Mac
    - LINE SEPARATOR (\u2028) and PARAGRAPH SEPARATOR (\u2029), defined by
      Unicode and used to sow confusion and discord
    - NEXT LINE (\x85), a C1 control character that is certainly not what you
      meant

    The NEXT LINE character is a bit of an odd case, because it
    usually won't show up if `fix_encoding` is also being run.
    \x85 is very common mojibake for \u2026, HORIZONTAL ELLIPSIS.

    >>> print(fix_line_breaks(
    ...     "This string is made of two things:\u2029"
    ...     "1. Unicode\u2028"
    ...     "2. Spite"
    ... ))
    This string is made of two things:
    1. Unicode
    2. Spite

    For further testing and examples, let's define a function to make sure
    we can see the control characters in their escaped form:

    >>> def eprint(text):
    ...     print(text.encode('unicode-escape').decode('ascii'))

    >>> eprint(fix_line_breaks("Content-type: text/plain\r\n\r\nHi."))
    Content-type: text/plain\n\nHi.

    >>> eprint(fix_line_breaks("This is how Microsoft \r trolls Mac users"))
    This is how Microsoft \n trolls Mac users

    >>> eprint(fix_line_breaks("What is this \x85 I don't even"))
    What is this \n I don't even
    """
    return (
        text.replace("\r\n", "\n")
        .replace("\r", "\n")
        .replace("\u2028", "\n")
        .replace("\u2029", "\n")
        .replace("\u0085", "\n")
    )

SURROGATE_RE = re.compile("[\ud800-\udfff]")
SURROGATE_PAIR_RE = re.compile("[\ud800-\udbff][\udc00-\udfff]")

def convert_surrogate_pair(match):
    """
    Convert a surrogate pair to the single codepoint it represents.

    This implements the formula described at:
    http://en.wikipedia.org/wiki/Universal_Character_Set_characters#Surrogates
    """
    pair = match.group(0)
    codept = 0x10000 + (ord(pair[0]) - 0xD800) * 0x400 + (ord(pair[1]) - 0xDC00)
    return chr(codept)

def fix_surrogates(text):
    """
    Replace 16-bit surrogate codepoints with the characters they represent
    (when properly paired), or with \ufffd otherwise.

    >>> high_surrogate = chr(0xd83d)
    >>> low_surrogate = chr(0xdca9)
    >>> print(fix_surrogates(high_surrogate + low_surrogate))
    💩
    >>> print(fix_surrogates(low_surrogate + high_surrogate))
    ��

    The above doctest had to be very carefully written, because even putting
    the Unicode escapes of the surrogates in the docstring was causing
    various tools to fail, which I think just goes to show why this fixer is
    necessary.
    """
    if SURROGATE_RE.search(text):
        text = SURROGATE_PAIR_RE.sub(convert_surrogate_pair, text)
        text = SURROGATE_RE.sub("\ufffd", text)
    return text

def remove_control_chars(text):
    """
    Remove various control characters that you probably didn't intend to be in
    your text. Many of these characters appear in the table of "Characters not
    suitable for use with markup" at
    http://www.unicode.org/reports/tr20/tr20-9.html.

    This includes:

    - ASCII control characters, except for the important whitespace characters
      (U+00 to U+08, U+0B, U+0E to U+1F, U+7F)
    - Deprecated Arabic control characters (U+206A to U+206F)
    - Interlinear annotation characters (U+FFF9 to U+FFFB)
    - The Object Replacement Character (U+FFFC)
    - The byte order mark (U+FEFF)

    However, these similar characters are left alone:

    - Control characters that produce whitespace (U+09, U+0A, U+0C, U+0D,
      U+2028, and U+2029)
    - C1 control characters (U+80 to U+9F) -- even though they are basically
      never used intentionally, they are important clues about what mojibake
      has happened
    - Control characters that affect glyph rendering, such as joiners and
      right-to-left marks (U+200C to U+200F, U+202A to U+202E)
    - Musical notation control characters (U+1D173 to U+1D17A) because wow if
      you're using those you probably have a good reason
    - Tag characters, because they are now used in emoji sequences such as
      "Flag of Wales"
    """
    return text.translate(CONTROL_CHARS)


def remove_bom(text):
    r"""
    Remove a byte-order mark that was accidentally decoded as if it were part
    of the text.

    >>> print(remove_bom(chr(0xfeff) + "Where do you want to go today?"))
    Where do you want to go today?
    """
    return text.lstrip(chr(0xFEFF))

# Define a regex to match valid escape sequences in Python string literals.
ESCAPE_SEQUENCE_RE = re.compile(
    r"""
    ( \\U........      # 8-digit hex escapes
    | \\u....          # 4-digit hex escapes
    | \\x..            # 2-digit hex escapes
    | \\[0-7]{1,3}     # Octal escapes
    | \\N\{[^}]+\}     # Unicode characters by name
    | \\[\\'"abfnrtv]  # Single-character escapes
    )""",
    re.UNICODE | re.VERBOSE,
)

def decode_escapes(text):
    r"""
    Decode backslashed escape sequences, including \x, \u, and \U character
    references, even in the presence of other Unicode.

    This function has to be called specifically. It's not run automatically by
    ftfy, because escaped text is not necessarily a mistake, and there is no
    way to distinguish when it is.

    This is what Python's "string-escape" and "unicode-escape" codecs were
    meant to do, but in contrast, this actually works. It will decode the
    string exactly the same way that the Python interpreter decodes its string
    literals.

    >>> factoid = '\\u20a1 is the currency symbol for the colón.'
    >>> print(factoid[1:])
    u20a1 is the currency symbol for the colón.
    >>> print(decode_escapes(factoid))
    ₡ is the currency symbol for the colón.

    Even though Python itself can read string literals with a combination of
    escapes and literal Unicode -- you're looking at one right now -- the
    "unicode-escape" codec doesn't work on literal Unicode. (See
    http://stackoverflow.com/a/24519338/773754 for more details.)

    Instead, this function searches for just the parts of a string that
    represent escape sequences, and decodes them, leaving the rest alone. All
    valid escape sequences are made of ASCII characters, and this allows
    "unicode-escape" to work correctly.
    """

    def decode_match(match):
        "Given a regex match, decode the escape sequence it contains."
        return codecs.decode(match.group(0), "unicode-escape")

    return ESCAPE_SEQUENCE_RE.sub(decode_match, text)

# This regex implements an exception to restore_byte_a0, so we can decode the
# very common mojibake of (for example) "Ã la mode" as "à la mode", not "àla
# mode".
#
# If byte C3 appears with a single space after it -- most commonly this shows
# up as " Ã " appearing as an entire word -- we'll insert \xa0 while keeping
# the space. Without this change, we would decode "à" as the start of the next
# word, such as "àla". It's almost always intended to be a separate word, as in
# "à la", but when mojibake turns this into "Ã\xa0 la", the two kinds of spaces
# get coalesced into "Ã la".
#
# We make exceptions for the Portuguese words "às", "àquele", "àquela",
# "àquilo" and their plurals -- these are contractions of, for example, "a
# aquele" and are very common. Note that the final letter is important to
# distinguish this case from French "à quel point".
#
# Other instances in Portuguese, such as "àfrica", seem to be typos (intended
# to be "África" with the accent in the other direction).
#
# Unfortunately, "à" is a common letter in Catalan, and mojibake of words that
# contain it will end up with inserted spaces. We can't do the right thing with
# every word. The cost is that the mojibake text "fÃ cil" will be interpreted as
# "fà cil", not "fàcil".
A_GRAVE_WORD_RE = re.compile(b"\xc3 (?! |quele|quela|quilo|s )")
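# For example, in the mojibake bytes b"\xc3 la mode", the "\xc3 " matches and is
# rewritten to b"\xc3\xa0 la mode", which later decodes as "à la mode". For the
# Portuguese exceptions such as "\xc3 s " or "\xc3 quele" (mojibake of "às" and
# "àquele"), the lookahead blocks this rule, and the generic replacement in
# restore_byte_a0 below should instead turn the space itself back into \xa0,
# yielding "às" with no extra space.
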
def restore_byte_a0(byts):
    """
    Some mojibake has been additionally altered by a process that said "hmm,
    byte A0, that's basically a space!" and replaced it with an ASCII space.
    When the A0 is part of a sequence that we intend to decode as UTF-8,
    changing byte A0 to 20 would make it fail to decode.

    This process finds sequences that would convincingly decode as UTF-8 if
    byte 20 were changed to A0, and puts back the A0. For the purpose of
    deciding whether this is a good idea, this step gets a cost of twice
    the number of bytes that are changed.

    This is used as a step within `fix_encoding`.
    """
    byts = A_GRAVE_WORD_RE.sub(b"\xc3\xa0 ", byts)

    def replacement(match):
        "The function to apply when this regex matches."
        return match.group(0).replace(b"\x20", b"\xa0")

    return ALTERED_UTF8_RE.sub(replacement, byts)

def replace_lossy_sequences(byts):
    """
    This function identifies sequences where information has been lost in
    a "sloppy" codec, indicated by byte 1A, and if they would otherwise look
    like a UTF-8 sequence, it replaces them with the UTF-8 sequence for U+FFFD.

    A further explanation:

    ftfy can now fix text in a few cases that it would previously fix
    incompletely, because of the fact that it can't successfully apply the fix
    to the entire string. A very common case of this is when characters have
    been erroneously decoded as windows-1252, but instead of the "sloppy"
    windows-1252 that passes through unassigned bytes, the unassigned bytes get
    turned into U+FFFD (�), so we can't tell what they were.

    This most commonly happens with curly quotation marks that appear
    ``â€œ like this â€�``.

    We can do better by building on ftfy's "sloppy codecs" to let them handle
    less-sloppy but more-lossy text. When they encounter the character ``�``,
    instead of refusing to encode it, they encode it as byte 1A -- an
    ASCII control code called SUBSTITUTE that once was meant for about the same
    purpose. We can then apply a fixer that looks for UTF-8 sequences where
    some continuation bytes have been replaced by byte 1A, and decode the whole
    sequence as �; if that doesn't work, it'll just turn the byte back into �
    itself.

    As a result, the above text ``â€œ like this â€�`` will decode as
    ``“ like this �``.

    If U+1A was actually in the original string, then the sloppy codecs will
    not be used, and this function will not be run, so your weird control
    character will be left alone but wacky fixes like this won't be possible.

    This is used as a transcoder within `fix_encoding`.
    """
    return LOSSY_UTF8_RE.sub("\ufffd".encode("utf-8"), byts)

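# A byte-level sketch of the example above: "“" is UTF-8 E2 80 9C and "”" is
# E2 80 9D. Decoded as windows-1252, E2 80 9C becomes "â€œ", but 9D is
# unassigned, so the closing quote becomes "â€" plus U+FFFD. The lossy sloppy
# codec re-encodes that U+FFFD as byte 1A, giving E2 80 1A, which LOSSY_UTF8_RE
# replaces with the UTF-8 encoding of U+FFFD, so the whole mark decodes as "�".
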
def decode_inconsistent_utf8(text):
    """
    Sometimes, text from one encoding ends up embedded within text from a
    different one. This is common enough that we need to be able to fix it.

    This is used as a transcoder within `fix_encoding`.
    """
    def fix_embedded_mojibake(match):
        substr = match.group(0)

        # Require the match to be shorter, so that this doesn't recurse infinitely
        if len(substr) < len(text) and is_bad(substr):
            return ftfy.fix_encoding(substr)
        else:
            return substr

    return UTF8_DETECTOR_RE.sub(fix_embedded_mojibake, text)

def _c1_fixer(match):
    return match.group(0).encode("latin-1").decode("sloppy-windows-1252")

def fix_c1_controls(text):
    """
    If text still contains C1 control characters, treat them as their
    Windows-1252 equivalents. This matches what Web browsers do.
    """
    return C1_CONTROL_RE.sub(_c1_fixer, text)