"""
ftfy: fixes text for you

This is a module for making text less broken. See the `fix_text` function
for more information.
"""
import unicodedata
import warnings
from typing import List , NamedTuple , Optional , Tuple , Union
from ftfy import bad_codecs
from ftfy import chardata , fixes
from ftfy . badness import is_bad
from ftfy . formatting import display_ljust
# The package version, as a plain version string with no surrounding
# whitespace so it can be compared and parsed directly.
__version__ = "6.0.3"

# Though this function does nothing, it lets linters know that we're using
# ftfy.bad_codecs. See the docstring in `bad_codecs/__init__.py` for more.
bad_codecs.ok()
class ExplainedText(NamedTuple):
    """
    A (text, explanation) pair: the result type of the ftfy functions that can
    report which steps they applied, such as :func:`fix_and_explain()`.

    `text` is the fixed text. `explanation` is a list of `(operation, arg)`
    tuples describing the steps that were applied. When the 'explain' option
    is disabled, the same type is returned, but `explanation` is None.
    """

    text: str
    explanation: Optional[List[Tuple[str, str]]]
class TextFixerConfig(NamedTuple):
    r"""
    A TextFixerConfig object stores configuration options for ftfy.

    It's implemented as a namedtuple with defaults, so you can instantiate
    it by providing the values to change from their defaults as keyword
    arguments. For example, to disable 'unescape_html' and keep the rest of
    the defaults::

        TextFixerConfig(unescape_html=False)

    Here are the options and their default values:

    - `unescape_html`: "auto"

      Configures whether to replace HTML entities such as &amp; with the
      character they represent. "auto" says to do this by default, but disable
      it when a literal < character appears, indicating that the input is
      actual HTML and entities should be preserved. The value can be True, to
      always enable this fixer, or False, to always disable it.

    - `remove_terminal_escapes`: True

      Removes "ANSI" terminal escapes, such as for changing the color of text
      in a terminal window.

    - `fix_encoding`: True

      Detect mojibake and attempt to fix it by decoding the text in a
      different encoding standard.

      The following four options affect how `fix_encoding` works, and do
      nothing if `fix_encoding` is False:

      - `restore_byte_a0`: True

        Allow a literal space (U+20) to be interpreted as a non-breaking space
        (U+A0) when that would make it part of a fixable mojibake string.

        Because spaces are very common characters, this could lead to false
        positives, but we try to apply it only when there's strong evidence
        for mojibake. Disabling `restore_byte_a0` is safer from false
        positives, but creates false negatives.

      - `replace_lossy_sequences`: True

        Detect mojibake that has been partially replaced by the characters
        '�' (U+FFFD) or '?'. If the mojibake could be decoded otherwise,
        replace the detected sequence with '�'.

      - `decode_inconsistent_utf8`: True

        When we see sequences that distinctly look like UTF-8 mojibake, but
        there's no consistent way to reinterpret the string in a new encoding,
        replace the mojibake with the appropriate UTF-8 characters anyway.

        This helps to decode strings that are concatenated from different
        encodings.

      - `fix_c1_controls`: True

        Replace C1 control characters (the useless characters U+80 - U+9F that
        come from Latin-1) with their Windows-1252 equivalents, like HTML5
        does, even if the whole string doesn't decode as Latin-1.

    - `fix_latin_ligatures`: True

      Replace common Latin-alphabet ligatures, such as ``ﬁ``, with the
      letters they're made of.

    - `fix_character_width`: True

      Replace fullwidth Latin characters and halfwidth Katakana with
      their more standard widths.

    - `uncurl_quotes`: True

      Replace curly quotes with straight quotes.

    - `fix_line_breaks`: True

      Replace various forms of line breaks with the standard Unix line
      break, ``\n``.

    - `fix_surrogates`: True

      Replace sequences of UTF-16 surrogate codepoints with the character
      they were meant to encode. This fixes text that was decoded with the
      obsolete UCS-2 standard, and allows it to support high-numbered
      codepoints such as emoji.

    - `remove_control_chars`: True

      Remove certain control characters that have no displayed effect on text.

    - `normalization`: "NFC"

      Choose what kind of Unicode normalization is applied. Usually, we apply
      NFC normalization, so that letters followed by combining characters
      become single combined characters.

      Changing this to "NFKC" applies more compatibility conversions, such as
      replacing the 'micro sign' with a standard Greek lowercase mu, which
      looks identical. However, some NFKC normalizations change the meaning of
      text, such as converting "10³" to "103".

      `normalization` can be None, to apply no normalization.

    - `max_decode_length`: 1_000_000

      The maximum size of "segment" that ftfy will try to fix all at once.

    - `explain`: True

      Whether to compute 'explanations', lists describing what ftfy changed.
      When this is False, the explanation will be None, and the code that
      builds the explanation will be skipped, possibly saving time.

      Functions that accept TextFixerConfig and don't return an explanation
      will automatically set `explain` to False.
    """

    unescape_html: Union[str, bool] = "auto"
    remove_terminal_escapes: bool = True
    fix_encoding: bool = True
    restore_byte_a0: bool = True
    replace_lossy_sequences: bool = True
    decode_inconsistent_utf8: bool = True
    fix_c1_controls: bool = True
    fix_latin_ligatures: bool = True
    fix_character_width: bool = True
    uncurl_quotes: bool = True
    fix_line_breaks: bool = True
    fix_surrogates: bool = True
    remove_control_chars: bool = True
    normalization: Optional[str] = "NFC"
    max_decode_length: int = 1_000_000
    explain: bool = True
def _config_from_kwargs(config: TextFixerConfig, kwargs: dict):
    """
    Handle parameters provided as keyword arguments to ftfy's top-level
    functions, converting them into a TextFixerConfig.

    `config` is the starting configuration, and `kwargs` maps option names to
    the values that should override it. The deprecated name `fix_entities` is
    accepted as an alias for `unescape_html`, with a DeprecationWarning.

    Returns a new configuration with the overrides applied. The caller's
    `kwargs` dictionary is never modified. Option names that are not fields
    of the configuration are rejected by its `_replace` method.
    """
    if "fix_entities" in kwargs:
        warnings.warn(
            "`fix_entities` has been renamed to `unescape_html`",
            DeprecationWarning,
            # Attribute the warning to the code that called the public ftfy
            # function, not to this helper or to the ftfy function itself.
            stacklevel=3,
        )
        # Work on a copy so that renaming the key doesn't mutate the
        # dictionary that the caller handed us.
        kwargs = kwargs.copy()
        kwargs["unescape_html"] = kwargs.pop("fix_entities")
    return config._replace(**kwargs)
" unescape_html " : fixes . unescape_html ,
" remove_terminal_escapes " : fixes . remove_terminal_escapes ,
" restore_byte_a0 " : fixes . restore_byte_a0 ,
" replace_lossy_sequences " : fixes . replace_lossy_sequences ,
" decode_inconsistent_utf8 " : fixes . decode_inconsistent_utf8 ,
" fix_c1_controls " : fixes . fix_c1_controls ,
" fix_latin_ligatures " : fixes . fix_latin_ligatures ,
" fix_character_width " : fixes . fix_character_width ,
" uncurl_quotes " : fixes . uncurl_quotes ,
" fix_line_breaks " : fixes . fix_line_breaks ,
" fix_surrogates " : fixes . fix_surrogates ,
" remove_control_chars " : fixes . remove_control_chars ,
# The message of the UnicodeError raised when a text-fixing function is
# given bytes instead of a Unicode string.
BYTES_ERROR_TEXT = """Hey wait, this isn't Unicode.

ftfy is designed to fix problems with text. Treating bytes like they're
interchangeable with Unicode text is usually something that introduces
problems with text.

You should first decode these bytes from the encoding you think they're in.
If you're not sure what encoding they're in:

- First, try to find out. 'utf-8' is a good assumption.
- If the encoding is simply unknowable, try running your bytes through
  ftfy.guess_bytes. As the name implies, this may not always be accurate.

For more information on the distinction between bytes and text, read the
Python Unicode HOWTO:

    http://docs.python.org/3/howto/unicode.html
"""
def _try_fix(
    fixer_name: str, text: str, config: TextFixerConfig, steps: Optional[list]
) -> str:
    """
    Run one named fixer over `text` if the configuration enables it.

    `fixer_name` is both the name of the option to look up on `config` and the
    key of the function to call in `FIXERS`. When the fixer is disabled, the
    text is returned untouched. When it runs and changes the text, and `steps`
    is a list rather than None, an `("apply", fixer_name)` entry is appended
    to `steps`.
    """
    if not getattr(config, fixer_name):
        return text

    result = FIXERS[fixer_name](text)
    made_a_change = result != text
    if made_a_change and steps is not None:
        steps.append(("apply", fixer_name))
    return result
def fix_text(text: str, config: Optional[TextFixerConfig] = None, **kwargs) -> str:
    r"""
    Given Unicode text as input, fix inconsistencies and glitches in it,
    such as mojibake (text that was decoded in the wrong encoding).

    Let's start with some examples:

        >>> fix_text('âœ” No problems')
        '✔ No problems'

        >>> print(fix_text("&macr;\\_(ã\x83\x84)_/&macr;"))
        ¯\_(ツ)_/¯

        >>> fix_text('Broken text&hellip; it&#x2019;s flubberific!',
        ...          normalization='NFKC')
        "Broken text... it's flubberific!"

        >>> fix_text('ＬＯＵＤ　ＮＯＩＳＥＳ')
        'LOUD NOISES'

    ftfy applies a number of different fixes to the text, and can accept
    configuration to select which fixes to apply.

    The configuration takes the form of a :class:`TextFixerConfig` object,
    and you can see a description of the options in that class's docstring
    or in the full documentation at ftfy.readthedocs.org.

    For convenience and backward compatibility, the configuration can also
    take the form of keyword arguments, which will set the equivalently-named
    fields of the TextFixerConfig object.

    For example, here are two ways to fix text but skip the "uncurl_quotes"
    step::

        fix_text(text, TextFixerConfig(uncurl_quotes=False))
        fix_text(text, uncurl_quotes=False)

    This function fixes text in independent segments, which are usually lines
    of text, or arbitrarily broken up every 1 million codepoints (configurable
    with `config.max_decode_length`) if there aren't enough line breaks. The
    bound on segment lengths helps to avoid unbounded slowdowns.

    ftfy can also provide an 'explanation', a list of transformations it applied
    to the text that would fix more text like it. This function doesn't provide
    explanations (because there may be different fixes for different segments
    of text).

    To get an explanation, use the :func:`fix_and_explain()` function, which
    fixes the string in one segment and explains what it fixed.

    Raises UnicodeError if `text` is bytes, and ValueError if
    `max_decode_length` is not a positive number.
    """
    if config is None:
        config = TextFixerConfig(explain=False)
    config = _config_from_kwargs(config, kwargs)
    if isinstance(text, bytes):
        raise UnicodeError(BYTES_ERROR_TEXT)
    if config.max_decode_length < 1:
        # With a limit of zero or less, every segment would be cut down to
        # nothing and `pos` would never move forward, so the loop below would
        # never finish. Reject the setting instead of hanging.
        raise ValueError("max_decode_length must be a positive integer")

    out = []
    pos = 0
    while pos < len(text):
        # A segment normally runs through the next line break, inclusive.
        # `find` returns -1 when there is none, which the `+ 1` turns into 0.
        textbreak = text.find("\n", pos) + 1
        if textbreak == 0:
            textbreak = len(text)
        if (textbreak - pos) > config.max_decode_length:
            textbreak = pos + config.max_decode_length

        segment = text[pos:textbreak]
        # Once a literal "<" shows up, "auto" HTML unescaping is switched off
        # for this segment and for every segment after it, because `config`
        # is rebound here.
        if config.unescape_html == "auto" and "<" in segment:
            config = config._replace(unescape_html=False)
        fixed_segment, _ = fix_and_explain(segment, config)
        out.append(fixed_segment)
        pos = textbreak
    return "".join(out)
def fix_and_explain(
    text: str, config: Optional[TextFixerConfig] = None, **kwargs
) -> ExplainedText:
    """
    Fix text as a single segment, returning the fixed text and an explanation
    of what was fixed.

    The explanation is a list of steps that can be applied with
    :func:`apply_plan`, or if config.explain is False, it will be None.

    Options can be given as a :class:`TextFixerConfig`, as keyword arguments,
    or both, in which case the keyword arguments override the config.

    Raises UnicodeError if `text` is bytes.
    """
    if config is None:
        config = TextFixerConfig()
    if isinstance(text, bytes):
        raise UnicodeError(BYTES_ERROR_TEXT)
    config = _config_from_kwargs(config, kwargs)

    # In "auto" mode, a literal "<" is taken as a sign that the input is
    # actual HTML, so its entities are left alone.
    if config.unescape_html == "auto" and "<" in text:
        config = config._replace(unescape_html=False)

    if config.explain:
        steps: Optional[List[Tuple[str, str]]] = []
    else:
        # If explanations aren't desired, `steps` will be None
        steps = None

    # Keep applying the whole sequence of fixes until a full pass leaves the
    # text unchanged, since one fix can expose text that another can fix.
    while True:
        origtext = text

        text = _try_fix("unescape_html", text, config, steps)

        if config.fix_encoding:
            if steps is None:
                # Pass the config along, so that the encoding-related options
                # (`restore_byte_a0`, `replace_lossy_sequences`,
                # `decode_inconsistent_utf8`, ...) are honored whether or not
                # an explanation was requested.
                text = fix_encoding(text, config)
            else:
                text, encoding_steps = fix_encoding_and_explain(text, config)
                steps.extend(encoding_steps)

        for fixer in [
            "fix_c1_controls",
            "fix_latin_ligatures",
            "fix_character_width",
            "uncurl_quotes",
            "fix_line_breaks",
            "fix_surrogates",
            "remove_terminal_escapes",
            "remove_control_chars",
        ]:
            text = _try_fix(fixer, text, config, steps)

        if config.normalization is not None:
            fixed = unicodedata.normalize(config.normalization, text)
            if steps is not None and fixed != text:
                steps.append(("normalize", config.normalization))
            text = fixed

        if text == origtext:
            return ExplainedText(text, steps)
def fix_encoding_and_explain(
    text: str, config: Optional[TextFixerConfig] = None, **kwargs
) -> ExplainedText:
    """
    Detect mojibake in the text and repair it, reporting how it was repaired.

    Only the encoding-related part of ftfy runs here: re-encoding and
    re-decoding the text in different encodings, plus the subordinate fixes
    `restore_byte_a0`, `replace_lossy_sequences`, `decode_inconsistent_utf8`,
    and `fix_c1_controls`.

    Returns the fixed text together with the list of steps that fixed it.
    The list is empty when nothing was changed.

    Examples::

        >>> fix_encoding_and_explain("sÃ³")
        ExplainedText(text='só', explanation=[('encode', 'latin-1'), ('decode', 'utf-8')])

        >>> result = fix_encoding_and_explain("voilÃ  le travail")
        >>> result.text
        'voilà le travail'
        >>> result.explanation
        [('encode', 'latin-1'), ('transcode', 'restore_byte_a0'), ('decode', 'utf-8')]

    Raises UnicodeError if `text` is bytes.
    """
    if config is None:
        config = TextFixerConfig()
    if isinstance(text, bytes):
        raise UnicodeError(BYTES_ERROR_TEXT)
    config = _config_from_kwargs(config, kwargs)

    if not config.fix_encoding:
        # A weird trivial case: we're asked to fix the encoding, but skip
        # fixing the encoding
        return ExplainedText(text, [])

    # Apply single repair steps until one of them is a no-op, collecting the
    # plan of every step along the way.
    accumulated_plan: List[Tuple[str, str]] = []
    while True:
        outcome = _fix_encoding_one_step_and_explain(text, config)
        accumulated_plan.extend(outcome.explanation)
        if outcome.text == text:
            return ExplainedText(outcome.text, accumulated_plan)
        text = outcome.text
def _fix_encoding_one_step_and_explain(
    text: str, config: TextFixerConfig
) -> ExplainedText:
    """
    Perform one step of fixing the encoding of text.

    Returns the text after this one step, along with the list of plan steps
    that produced it. When no fix applies, the text is returned unchanged
    with an empty list, which is how the caller knows to stop iterating.
    """
    if config is None:
        config = TextFixerConfig()

    if len(text) == 0:
        return ExplainedText(text, [])

    # The first plan is to return ASCII text unchanged, as well as text
    # that doesn't look like it contains mojibake
    if chardata.possible_encoding(text, "ascii") or not is_bad(text):
        return ExplainedText(text, [])

    # As we go through the next step, remember the possible encodings
    # that we encounter but don't successfully fix yet. We may need them
    # later.
    possible_1byte_encodings = []

    # Suppose the text was supposed to be UTF-8, but it was decoded using
    # a single-byte encoding instead. When these cases can be fixed, they
    # are usually the correct thing to do, so try them next.
    for encoding in chardata.CHARMAP_ENCODINGS:
        if chardata.possible_encoding(text, encoding):
            possible_1byte_encodings.append(encoding)
            encoded_bytes = text.encode(encoding)
            encode_step = ("encode", encoding)
            transcode_steps = []

            # Now, find out if it's UTF-8 (or close enough). Otherwise,
            # remember the encoding for later.
            try:
                decoding = "utf-8"
                # Check encoded_bytes for sequences that would be UTF-8,
                # except they have b' ' where b'\xa0' would belong.
                if config.restore_byte_a0 and chardata.ALTERED_UTF8_RE.search(
                    encoded_bytes
                ):
                    replaced_bytes = fixes.restore_byte_a0(encoded_bytes)
                    if replaced_bytes != encoded_bytes:
                        transcode_steps.append(("transcode", "restore_byte_a0"))
                        encoded_bytes = replaced_bytes

                # Replace sequences where information has been lost
                if config.replace_lossy_sequences and encoding.startswith("sloppy"):
                    replaced_bytes = fixes.replace_lossy_sequences(encoded_bytes)
                    if replaced_bytes != encoded_bytes:
                        transcode_steps.append(("transcode", "replace_lossy_sequences"))
                        encoded_bytes = replaced_bytes

                # Bytes 0xED and 0xC0 are decoded with the more lenient
                # "utf-8-variants" codec instead of strict UTF-8.
                if 0xED in encoded_bytes or 0xC0 in encoded_bytes:
                    decoding = "utf-8-variants"

                decode_step = ("decode", decoding)
                steps = [encode_step] + transcode_steps + [decode_step]
                fixed = encoded_bytes.decode(decoding)
                return ExplainedText(fixed, steps)

            except UnicodeDecodeError:
                # Not UTF-8 under this encoding. The encoding has already been
                # remembered above; move on to the next candidate.
                pass

    # Look for a-hat-euro sequences that remain, and fix them in isolation.
    if config.decode_inconsistent_utf8 and chardata.UTF8_DETECTOR_RE.search(text):
        steps = [("apply", "decode_inconsistent_utf8")]
        fixed = fixes.decode_inconsistent_utf8(text)
        if fixed != text:
            return ExplainedText(fixed, steps)

    # The next most likely case is that this is Latin-1 that was intended to
    # be read as Windows-1252, because those two encodings in particular are
    # easily confused.
    if "latin-1" in possible_1byte_encodings:
        if "windows-1252" in possible_1byte_encodings:
            # This text is in the intersection of Latin-1 and
            # Windows-1252, so it's probably legit.
            return ExplainedText(text, [])
        else:
            # Otherwise, it means we have characters that are in Latin-1 but
            # not in Windows-1252. Those are C1 control characters. Nobody
            # wants those. Assume they were meant to be Windows-1252.
            try:
                fixed = text.encode("latin-1").decode("windows-1252")
                if fixed != text:
                    steps = [("encode", "latin-1"), ("decode", "windows-1252")]
                    return ExplainedText(fixed, steps)
            except UnicodeDecodeError:
                # Some byte has no meaning in Windows-1252; fall through to
                # the character-by-character fix below.
                pass

    # Fix individual characters of Latin-1 with a less satisfying explanation
    if config.fix_c1_controls and chardata.C1_CONTROL_RE.search(text):
        steps = [("transcode", "fix_c1_controls")]
        fixed = fixes.fix_c1_controls(text)
        return ExplainedText(fixed, steps)

    # The cases that remain are mixups between two different single-byte
    # encodings, and not the common case of Latin-1 vs. Windows-1252.
    #
    # With the new heuristic in 6.0, it's possible that we're closer to solving
    # these in some cases. It would require a lot of testing and tuning, though.
    # For now, we leave the text unchanged in these cases.
    return ExplainedText(text, [])
def fix_encoding(
    text: str, config: Optional[TextFixerConfig] = None, **kwargs
) -> str:
    """
    Apply just the encoding-fixing steps of ftfy to this text. Returns the
    fixed text, discarding the explanation.

    Options can be given as a :class:`TextFixerConfig`, as keyword arguments,
    or both, in which case the keyword arguments override the config.

        >>> fix_encoding("Ã³")
        'ó'

        >>> fix_encoding("&ATILDE;&SUP3;")
        '&ATILDE;&SUP3;'
    """
    if config is None:
        config = TextFixerConfig(explain=False)
    config = _config_from_kwargs(config, kwargs)
    fixed, _explan = fix_encoding_and_explain(text, config)
    return fixed
# An alternate name for the main function: `ftfy` is the same function
# object as `fix_text`.
ftfy = fix_text
def fix_text_segment(
    text: str, config: Optional[TextFixerConfig] = None, **kwargs
) -> str:
    """
    Fix text as a single segment, with a consistent sequence of steps that
    are applied to fix the text. Discard the explanation.

    Options can be given as a :class:`TextFixerConfig`, as keyword arguments,
    or both, in which case the keyword arguments override the config.
    """
    if config is None:
        config = TextFixerConfig(explain=False)
    config = _config_from_kwargs(config, kwargs)
    fixed, _explan = fix_and_explain(text, config)
    return fixed
def fix_file(input_file, encoding=None, config=None, **kwargs):
    """
    Fix the text found in a file, one line at a time.

    Lines that arrive as Unicode text are used as they are. Lines that arrive
    as bytes are decoded with `encoding` if one was supplied. If not, the
    encoding has to be guessed from the first line of bytes, and that guess
    is then used for the lines that follow. We'll try a few common encodings,
    but we make no promises. See the `guess_bytes` function for how this is
    done.

    This is a generator: the output is a stream of fixed lines of text.
    """
    settings = TextFixerConfig() if config is None else config
    settings = _config_from_kwargs(settings, kwargs)

    for raw_line in input_file:
        line = raw_line
        if isinstance(line, bytes):
            if encoding is not None:
                line = line.decode(encoding)
            else:
                line, encoding = guess_bytes(line)

        # In "auto" mode, the first line containing "<" turns HTML unescaping
        # off for that line and for the rest of the file.
        looks_like_html = settings.unescape_html == "auto" and "<" in line
        if looks_like_html:
            settings = settings._replace(unescape_html=False)

        yield fix_and_explain(line, settings).text
def guess_bytes(bstring):
    """
    NOTE: Using `guess_bytes` is not the recommended way of using ftfy. ftfy
    is not designed to be an encoding detector.

    In the unfortunate situation that you have some bytes in an unknown
    encoding, ftfy can guess a reasonable strategy for decoding them, by trying
    a few common encodings that can be distinguished from each other.

    Unlike the rest of ftfy, this may not be accurate, and it may *create*
    Unicode problems instead of solving them!

    The encodings we try here are:

    - UTF-16 with a byte order mark, because a UTF-16 byte order mark looks
      like nothing else
    - UTF-8, because it's the global standard, which has been used by a
      majority of the Web since 2008
    - "utf-8-variants", or buggy implementations of UTF-8
    - MacRoman, because Microsoft Office thinks it's still a thing, and it
      can be distinguished by its line breaks. (If there are no line breaks in
      the string, though, you're out of luck.)
    - "sloppy-windows-1252", the Latin-1-like encoding that is the most common
      single-byte encoding.

    Returns a tuple of the decoded text and the name of the encoding that
    was used to decode it. Raises UnicodeError if given a `str`.
    """
    if isinstance(bstring, str):
        raise UnicodeError(
            "This string was already decoded as Unicode. You should pass "
            "bytes to guess_bytes, not Unicode."
        )

    # Either byte order of the UTF-16 byte order mark settles the question.
    if bstring.startswith((b"\xfe\xff", b"\xff\xfe")):
        return bstring.decode("utf-16"), "utf-16"

    byteset = set(bstring)
    try:
        if 0xED in byteset or 0xC0 in byteset:
            # Byte 0xed can be used to encode a range of codepoints that
            # are UTF-16 surrogates. UTF-8 does not use UTF-16 surrogates,
            # so when we see 0xed, it's very likely we're being asked to
            # decode CESU-8, the variant that encodes UTF-16 surrogates
            # instead of the original characters themselves.
            #
            # This will occasionally trigger on standard UTF-8, as there
            # are some Korean characters that also use byte 0xed, but that's
            # not harmful because standard UTF-8 characters will decode the
            # same way in our 'utf-8-variants' codec.
            #
            # Byte 0xc0 is impossible because, numerically, it would only
            # encode characters lower than U+0040. Those already have
            # single-byte representations, and UTF-8 requires using the
            # shortest possible representation. However, Java hides the null
            # codepoint, U+0000, in a non-standard longer representation -- it
            # encodes it as 0xc0 0x80 instead of 0x00, guaranteeing that 0x00
            # will never appear in the encoded bytes.
            #
            # The 'utf-8-variants' decoder can handle both of these cases, as
            # well as standard UTF-8, at the cost of a bit of speed.
            return bstring.decode("utf-8-variants"), "utf-8-variants"
        else:
            return bstring.decode("utf-8"), "utf-8"
    except UnicodeDecodeError:
        # Not any kind of UTF-8; fall back to the single-byte guesses below.
        pass

    if 0x0D in byteset and 0x0A not in byteset:
        # Files that contain CR and not LF are likely to be MacRoman.
        return bstring.decode("macroman"), "macroman"

    return bstring.decode("sloppy-windows-1252"), "sloppy-windows-1252"
def apply_plan(text: str, plan: List[Tuple[str, str]]):
    """
    Apply a plan for fixing the encoding of text.

    The plan is a list of tuples of the form (operation, arg).

    `operation` is one of:

    - `'encode'`: convert a string to bytes, using `arg` as the encoding
    - `'decode'`: convert bytes to a string, using `arg` as the encoding
    - `'transcode'`: convert bytes to bytes, using the function named `arg`
    - `'apply'`: convert a string to a string, using the function named `arg`
    - `'normalize'`: convert a string to a string, applying the Unicode
      normalization form named `arg` (such as "NFC")

    The functions that can be applied by 'transcode' and 'apply' are
    specifically those that appear in the dictionary named `FIXERS`. They
    can also be imported from the `ftfy.fixes` module.

    Returns the result of running every step in order, starting from `text`.
    Raises ValueError for an operation, or a function name, that isn't known.

    Example::

        >>> mojibake = "schÃ¶n"
        >>> text, plan = fix_and_explain(mojibake)
        >>> apply_plan(mojibake, plan)
        'schön'
    """
    obj = text
    for operation, arg in plan:
        if operation == "encode":
            obj = obj.encode(arg)
        elif operation == "decode":
            obj = obj.decode(arg)
        elif operation in ("transcode", "apply"):
            if arg in FIXERS:
                obj = FIXERS[arg](obj)
            else:
                raise ValueError("Unknown function to apply: %s" % arg)
        elif operation == "normalize":
            # `fix_and_explain` records normalization as a plan step, so a
            # plan it produces has to be replayable here.
            obj = unicodedata.normalize(arg, obj)
        else:
            raise ValueError("Unknown plan step: %s" % operation)

    return obj
def explain_unicode ( text : str ) :
A utility method that ' s useful for debugging mysterious Unicode.
It breaks down a string , showing you for each codepoint its number in
hexadecimal , its glyph , its category in the Unicode standard , and its name
in the Unicode standard .
>> > explain_unicode ( ' (╯°□°)╯︵ ┻━┻ ' )
U + 0028 ( [ Ps ] LEFT PARENTHESIS
U + 00 B0 ° [ So ] DEGREE SIGN
U + 25 A1 □ [ So ] WHITE SQUARE
U + 00 B0 ° [ So ] DEGREE SIGN
U + 0020 [ Zs ] SPACE
for char in text :
if char . isprintable ( ) :
display = char
else :
display = char . encode ( " unicode-escape " ) . decode ( " ascii " )
print (
" U+ {code:04X} {display} [ {category} ] {name} " . format (
display = display_ljust ( display , 7 ) ,
code = ord ( char ) ,
category = unicodedata . category ( char ) ,
name = unicodedata . name ( char , " <unknown> " ) ,