bazarr/libs/ftfy/__init__.py

"""
ftfy: fixes text for you

This is a module for making text less broken. See the `fix_text` function
for more information.
"""

import unicodedata
import warnings
from typing import List, NamedTuple, Optional, Tuple, Union, no_type_check

from ftfy import bad_codecs
from ftfy import chardata, fixes
from ftfy.badness import is_bad
from ftfy.formatting import display_ljust

# The version of ftfy that this package provides.
__version__ = "6.1.1"


# Though this function does nothing, it lets linters know that we're using
# ftfy.bad_codecs. See the docstring in `bad_codecs/__init__.py` for more.
# NOTE(review): presumably importing ftfy.bad_codecs is also what makes the
# 'utf-8-variants' and 'sloppy-*' codec names used in this module available
# -- confirm in bad_codecs.
bad_codecs.ok()


class ExplainedText(NamedTuple):
    """
    The return type from ftfy's functions that provide an "explanation" of which
    steps it applied to fix the text, such as :func:`fix_and_explain()`.

    When the 'explain' option is disabled, these functions return the same
    type, but the `explanation` will be None.

    Because this is a NamedTuple, a result can be unpacked as
    ``text, explanation = result`` or read by attribute.
    """

    # The text after fixing.
    text: str
    # The steps that were applied, as (operation, argument) pairs, or None
    # when no explanation was collected.
    explanation: Optional[List[Tuple[str, str]]]


class TextFixerConfig(NamedTuple):
    r"""
    A TextFixerConfig object stores configuration options for ftfy.

    It's implemented as a namedtuple with defaults, so you can instantiate
    it by providing the values to change from their defaults as keyword arguments.
    For example, to disable 'unescape_html' and keep the rest of the defaults::

        TextFixerConfig(unescape_html=False)

    Because it is a namedtuple, a config is immutable; use `_replace()` to get
    a modified copy.

    Here are the options and their default values:

    - `unescape_html`: "auto"

      Configures whether to replace HTML entities such as &amp; with the character
      they represent. "auto" says to do this by default, but disable it when a
      literal < character appears, indicating that the input is actual HTML and
      entities should be preserved. The value can be True, to always enable this
      fixer, or False, to always disable it.

    - `remove_terminal_escapes`: True

      Removes "ANSI" terminal escapes, such as for changing the color of text in a
      terminal window.

    - `fix_encoding`: True

      Detect mojibake and attempt to fix it by decoding the text in a different
      encoding standard.

      The following four options affect how `fix_encoding` works, and do nothing
      if `fix_encoding` is False:

      - `restore_byte_a0`: True

        Allow a literal space (U+20) to be interpreted as a non-breaking space
        (U+A0) when that would make it part of a fixable mojibake string.

        Because spaces are very common characters, this could lead to false
        positives, but we try to apply it only when there's strong evidence for
        mojibake. Disabling `restore_byte_a0` is safer from false positives,
        but creates false negatives.

      - `replace_lossy_sequences`: True

        Detect mojibake that has been partially replaced by the Unicode
        replacement character (U+FFFD) or by '?'. If the mojibake could be
        decoded otherwise, replace the detected sequence with the replacement
        character (U+FFFD).

      - `decode_inconsistent_utf8`: True

        When we see sequences that distinctly look like UTF-8 mojibake, but
        there's no consistent way to reinterpret the string in a new encoding,
        replace the mojibake with the appropriate UTF-8 characters anyway.

        This helps to decode strings that are concatenated from different
        encodings.

      - `fix_c1_controls`: True

        Replace C1 control characters (the useless characters U+80 - U+9F that
        come from Latin-1) with their Windows-1252 equivalents, like HTML5 does,
        even if the whole string doesn't decode as Latin-1.

    - `fix_latin_ligatures`: True

      Replace common Latin-alphabet ligatures, such as ``ﬁ``, with the
      letters they're made of.

    - `fix_character_width`: True

      Replace fullwidth Latin characters and halfwidth Katakana with
      their more standard widths.

    - `uncurl_quotes`: True

      Replace curly quotes with straight quotes.

    - `fix_line_breaks`: True

      Replace various forms of line breaks with the standard Unix line
      break, ``\n``.

    - `fix_surrogates`: True

      Replace sequences of UTF-16 surrogate codepoints with the character
      they were meant to encode. This fixes text that was decoded with the
      obsolete UCS-2 standard, and allows it to support high-numbered
      codepoints such as emoji.

    - `remove_control_chars`: True

      Remove certain control characters that have no displayed effect on text.

    - `normalization`: "NFC"

      Choose what kind of Unicode normalization is applied. Usually, we apply
      NFC normalization, so that letters followed by combining characters become
      single combined characters.

      Changing this to "NFKC" applies more compatibility conversions, such as
      replacing the 'micro sign' with a standard Greek lowercase mu, which looks
      identical. However, some NFKC normalizations change the meaning of text,
      such as converting "10³" to "103".

      `normalization` can be None, to apply no normalization.

    - `max_decode_length`: 1_000_000

      The maximum size of "segment" that ftfy will try to fix all at once.

    - `explain`: True

      Whether to compute 'explanations', lists describing what ftfy changed.
      When this is False, the explanation will be None, and the code that
      builds the explanation will be skipped, possibly saving time.

      Functions that don't return an explanation, such as `fix_text`, default
      to `explain=False` when they are called without a config object.
    """
    unescape_html: Union[str, bool] = "auto"
    remove_terminal_escapes: bool = True
    fix_encoding: bool = True
    restore_byte_a0: bool = True
    replace_lossy_sequences: bool = True
    decode_inconsistent_utf8: bool = True
    fix_c1_controls: bool = True
    fix_latin_ligatures: bool = True
    fix_character_width: bool = True
    uncurl_quotes: bool = True
    fix_line_breaks: bool = True
    fix_surrogates: bool = True
    remove_control_chars: bool = True
    normalization: Optional[str] = "NFC"
    max_decode_length: int = 1000000
    explain: bool = True


def _config_from_kwargs(config: TextFixerConfig, kwargs: dict) -> TextFixerConfig:
    """
    Merge keyword-argument overrides into a TextFixerConfig.

    ftfy's top-level functions accept configuration fields as keyword
    arguments; this returns a copy of `config` with those fields replaced.

    The deprecated name `fix_entities` is still accepted: it triggers a
    DeprecationWarning and is treated as `unescape_html`. The caller's
    `kwargs` dictionary is never modified.
    """
    overrides = kwargs
    if "fix_entities" in overrides:
        warnings.warn(
            "`fix_entities` has been renamed to `unescape_html`", DeprecationWarning
        )
        # Rename the key on a private copy, leaving the caller's dict alone.
        overrides = dict(overrides)
        overrides["unescape_html"] = overrides.pop("fix_entities")
    return config._replace(**overrides)


# Maps the name of each fixer to the function in `ftfy.fixes` that implements
# it. `_try_fix` looks fixers up here using the name of a config option, and
# `apply_plan` uses the same table to resolve the argument of 'transcode' and
# 'apply' steps.
FIXERS = {
    "unescape_html": fixes.unescape_html,
    "remove_terminal_escapes": fixes.remove_terminal_escapes,
    "restore_byte_a0": fixes.restore_byte_a0,
    "replace_lossy_sequences": fixes.replace_lossy_sequences,
    "decode_inconsistent_utf8": fixes.decode_inconsistent_utf8,
    "fix_c1_controls": fixes.fix_c1_controls,
    "fix_latin_ligatures": fixes.fix_latin_ligatures,
    "fix_character_width": fixes.fix_character_width,
    "uncurl_quotes": fixes.uncurl_quotes,
    "fix_line_breaks": fixes.fix_line_breaks,
    "fix_surrogates": fixes.fix_surrogates,
    "remove_control_chars": fixes.remove_control_chars,
}


# The message of the UnicodeError that the text-fixing functions in this
# module raise when they are given `bytes` instead of `str`.
BYTES_ERROR_TEXT = """Hey wait, this isn't Unicode.

ftfy is designed to fix problems with text. Treating bytes like they're
interchangeable with Unicode text is usually something that introduces
problems with text.

You should first decode these bytes from the encoding you think they're in.
If you're not sure what encoding they're in:

- First, try to find out. 'utf-8' is a good assumption.
- If the encoding is simply unknowable, try running your bytes through
  ftfy.guess_bytes. As the name implies, this may not always be accurate.

For more information on the distinction between bytes and text, read the
Python Unicode HOWTO:

    http://docs.python.org/3/howto/unicode.html
"""


def _try_fix(
    fixer_name: str, text: str, config: TextFixerConfig, steps: Optional[list]
) -> str:
    """
    Run the fixer called `fixer_name` on `text`, if `config` enables it.

    `fixer_name` is both the name of the config option that switches the
    fixer on and the key of the fixer function in `FIXERS`.

    Returns the fixed text, or `text` itself when the fixer is disabled.
    If `steps` is a list and the fixer changed the text, the step
    `("apply", fixer_name)` is appended to it; pass None to skip recording.
    """
    if not getattr(config, fixer_name):
        return text

    result = FIXERS[fixer_name](text)
    if steps is not None and result != text:
        steps.append(("apply", fixer_name))
    return result


def fix_text(text: str, config: Optional[TextFixerConfig] = None, **kwargs) -> str:
    r"""
    Given Unicode text as input, fix inconsistencies and glitches in it,
    such as mojibake (text that was decoded in the wrong encoding).

    Let's start with some examples:

        >>> fix_text('âœ” No problems')
        '✔ No problems'

        >>> print(fix_text("&macr;\\_(ã\x83\x84)_/&macr;"))
        ¯\_(ツ)_/¯

        >>> fix_text('Broken text&hellip; it&#x2019;s ﬂubberiﬁc!')
        "Broken text... it's flubberific!"

        >>> fix_text('ＬＯＵＤ　ＮＯＩＳＥＳ')
        'LOUD NOISES'

    ftfy applies a number of different fixes to the text, and can accept
    configuration to select which fixes to apply.

    The configuration takes the form of a :class:`TextFixerConfig` object,
    and you can see a description of the options in that class's docstring
    or in the full documentation at ftfy.readthedocs.org.

    For convenience and backward compatibility, the configuration can also
    take the form of keyword arguments, which will set the equivalently-named
    fields of the TextFixerConfig object.

    For example, here are two ways to fix text but skip the "uncurl_quotes"
    step::

        fix_text(text, TextFixerConfig(uncurl_quotes=False))
        fix_text(text, uncurl_quotes=False)

    This function fixes text in independent segments, which are usually lines
    of text, or arbitrarily broken up every 1 million codepoints (configurable
    with `config.max_decode_length`) if there aren't enough line breaks. The
    bound on segment lengths helps to avoid unbounded slowdowns.

    When `unescape_html` is "auto", the first segment that contains a literal
    ``<`` turns HTML unescaping off for that segment and for every segment
    after it.

    ftfy can also provide an 'explanation', a list of transformations it applied
    to the text that would fix more text like it. This function doesn't provide
    explanations (because there may be different fixes for different segments
    of text).

    To get an explanation, use the :func:`fix_and_explain()` function, which
    fixes the string in one segment and explains what it fixed.

    Raises UnicodeError if `text` is bytes, and ValueError if there is text
    to fix but `max_decode_length` is less than 1.
    """

    if config is None:
        config = TextFixerConfig(explain=False)
    config = _config_from_kwargs(config, kwargs)
    if isinstance(text, bytes):
        raise UnicodeError(BYTES_ERROR_TEXT)
    if text and config.max_decode_length < 1:
        # With a segment length below 1, `pos` would never advance and the
        # loop below would never terminate.
        raise ValueError("max_decode_length must be at least 1")

    out = []
    pos = 0
    while pos < len(text):
        # A segment normally ends just after the next line break...
        textbreak = text.find("\n", pos) + 1
        if textbreak == 0:
            # ...or at the end of the text when no line break remains...
            textbreak = len(text)
        if (textbreak - pos) > config.max_decode_length:
            # ...but is cut short if it would exceed the length limit.
            textbreak = pos + config.max_decode_length

        segment = text[pos:textbreak]
        if config.unescape_html == "auto" and "<" in segment:
            # Rebinding `config` makes this stick for all later segments.
            config = config._replace(unescape_html=False)
        fixed_segment, _ = fix_and_explain(segment, config)
        out.append(fixed_segment)
        pos = textbreak
    return "".join(out)


def fix_and_explain(
    text: str, config: Optional[TextFixerConfig] = None, **kwargs
) -> ExplainedText:
    """
    Fix text as a single segment, returning the fixed text and an explanation
    of what was fixed.

    The explanation is a list of steps that can be applied with
    :func:`apply_plan`, or if config.explain is False, it will be None.

    `config` defaults to `TextFixerConfig()`, and keyword arguments override
    individual fields of it. When `unescape_html` is "auto", HTML unescaping
    is skipped if the text contains a literal ``<``.

    The whole sequence of fixers is applied repeatedly, until a pass leaves
    the text unchanged.

    Raises UnicodeError if `text` is bytes.
    """
    if config is None:
        config = TextFixerConfig()
    if isinstance(text, bytes):
        raise UnicodeError(BYTES_ERROR_TEXT)
    config = _config_from_kwargs(config, kwargs)

    if config.unescape_html == "auto" and "<" in text:
        config = config._replace(unescape_html=False)

    if config.explain:
        steps: Optional[List[Tuple[str, str]]] = []
    else:
        # If explanations aren't desired, `steps` will be None
        steps = None

    while True:
        origtext = text

        text = _try_fix("unescape_html", text, config, steps)

        if config.fix_encoding:
            if steps is None:
                # Pass the config along, so that the options that tune
                # encoding fixes (such as `restore_byte_a0`) are honored
                # even when no explanation is being collected.
                text = fix_encoding(text, config)
            else:
                text, encoding_steps = fix_encoding_and_explain(text, config)
                if encoding_steps is not None:
                    steps.extend(encoding_steps)

        for fixer in [
            "fix_c1_controls",
            "fix_latin_ligatures",
            "fix_character_width",
            "uncurl_quotes",
            "fix_line_breaks",
            "fix_surrogates",
            "remove_terminal_escapes",
            "remove_control_chars",
        ]:
            text = _try_fix(fixer, text, config, steps)

        if config.normalization is not None:
            fixed = unicodedata.normalize(config.normalization, text)
            if steps is not None and fixed != text:
                steps.append(("normalize", config.normalization))
            text = fixed

        # A pass that changed nothing means there is nothing left to fix.
        if text == origtext:
            return ExplainedText(text, steps)


def fix_encoding_and_explain(
    text: str, config: Optional[TextFixerConfig] = None, **kwargs
) -> ExplainedText:
    """
    Detect mojibake in the text and fix it, returning the fixed text along
    with a list of the steps that fixed it.

    Only the encoding-related fixes are applied here: re-encoding and decoding
    the text in different encodings, plus the subordinate fixes
    `restore_byte_a0`, `replace_lossy_sequences`, `decode_inconsistent_utf8`,
    and `fix_c1_controls`. Single fixing steps are repeated until one of them
    leaves the text unchanged.

    `config` defaults to `TextFixerConfig()`, and keyword arguments override
    individual fields of it. If `fix_encoding` is disabled in the config, the
    text comes back unchanged with an empty list of steps.

    Raises UnicodeError if `text` is bytes.

    Examples::

        >>> fix_encoding_and_explain("sÃ³")
        ExplainedText(text='só', explanation=[('encode', 'latin-1'), ('decode', 'utf-8')])

        >>> result = fix_encoding_and_explain("voilÃ le travail")
        >>> result.text
        'voilà le travail'
        >>> result.explanation
        [('encode', 'latin-1'), ('transcode', 'restore_byte_a0'), ('decode', 'utf-8')]

    """
    if config is None:
        config = TextFixerConfig()
    if isinstance(text, bytes):
        raise UnicodeError(BYTES_ERROR_TEXT)
    config = _config_from_kwargs(config, kwargs)

    full_plan: List[Tuple[str, str]] = []
    if not config.fix_encoding:
        # A weird trivial case: we're asked to fix the encoding, but the
        # config says not to.
        return ExplainedText(text, full_plan)

    current = text
    while True:
        result, step_plan = _fix_encoding_one_step_and_explain(current, config)
        if step_plan is not None:
            full_plan.extend(step_plan)
        if result == current:
            return ExplainedText(result, full_plan)
        current = result


def _fix_encoding_one_step_and_explain(
    text: str, config: TextFixerConfig
) -> ExplainedText:
    """
    Perform one step of fixing the encoding of text.

    Returns an ExplainedText holding the text after this step and the list of
    `(operation, argument)` steps that produced it. When nothing applies, the
    text is returned unchanged with an empty list.

    The strategies below are tried in order, and the first one that produces
    a result is returned:

    1. Leave the text alone if it is empty, if `chardata.possible_encoding`
       accepts it as ASCII, or if `is_bad` doesn't flag it.
    2. For each encoding in `chardata.CHARMAP_ENCODINGS` that the text could
       be in, encode the text with it and try to decode the bytes as UTF-8,
       optionally applying `restore_byte_a0` and `replace_lossy_sequences`
       to the bytes first.
    3. Apply `decode_inconsistent_utf8` to the text.
    4. Re-decode text that fits Latin-1 but not Windows-1252 as Windows-1252.
    5. Apply `fix_c1_controls` to the text.
    """
    if config is None:
        config = TextFixerConfig()

    if len(text) == 0:
        return ExplainedText(text, [])

    # The first plan is to return ASCII text unchanged, as well as text
    # that doesn't look like it contains mojibake
    if chardata.possible_encoding(text, "ascii") or not is_bad(text):
        return ExplainedText(text, [])

    # As we go through the next step, remember the possible encodings
    # that we encounter but don't successfully fix yet. We may need them
    # later.
    possible_1byte_encodings = []

    # Suppose the text was supposed to be UTF-8, but it was decoded using
    # a single-byte encoding instead. When these cases can be fixed, they
    # are usually the correct thing to do, so try them next.
    for encoding in chardata.CHARMAP_ENCODINGS:
        if chardata.possible_encoding(text, encoding):
            possible_1byte_encodings.append(encoding)
            encoded_bytes = text.encode(encoding)
            encode_step = ("encode", encoding)
            # Byte-level repairs applied between encoding and decoding; they
            # are reported between those two steps in the returned plan.
            transcode_steps = []

            # Now, find out if it's UTF-8 (or close enough). Otherwise,
            # remember the encoding for later.
            try:
                decoding = "utf-8"
                # Check encoded_bytes for sequences that would be UTF-8,
                # except they have b' ' where b'\xa0' would belong.
                if config.restore_byte_a0 and chardata.ALTERED_UTF8_RE.search(
                    encoded_bytes
                ):
                    replaced_bytes = fixes.restore_byte_a0(encoded_bytes)
                    if replaced_bytes != encoded_bytes:
                        transcode_steps.append(("transcode", "restore_byte_a0"))
                        encoded_bytes = replaced_bytes

                # Replace sequences where information has been lost
                if config.replace_lossy_sequences and encoding.startswith("sloppy"):
                    replaced_bytes = fixes.replace_lossy_sequences(encoded_bytes)
                    if replaced_bytes != encoded_bytes:
                        transcode_steps.append(("transcode", "replace_lossy_sequences"))
                        encoded_bytes = replaced_bytes

                # Bytes 0xED and 0xC0 select the more lenient 'utf-8-variants'
                # decoder (see the comment in `guess_bytes` about what these
                # two bytes indicate).
                if 0xED in encoded_bytes or 0xC0 in encoded_bytes:
                    decoding = "utf-8-variants"

                decode_step = ("decode", decoding)
                steps = [encode_step] + transcode_steps + [decode_step]
                fixed = encoded_bytes.decode(decoding)
                return ExplainedText(fixed, steps)

            except UnicodeDecodeError:
                # These bytes aren't UTF-8 in this encoding; the encoding
                # stays in `possible_1byte_encodings` and we try the next one.
                pass

    # Look for a-hat-euro sequences that remain, and fix them in isolation.
    if config.decode_inconsistent_utf8 and chardata.UTF8_DETECTOR_RE.search(text):
        steps = [("apply", "decode_inconsistent_utf8")]
        fixed = fixes.decode_inconsistent_utf8(text)
        # If the fixer changed nothing, fall through to the next strategy.
        if fixed != text:
            return ExplainedText(fixed, steps)

    # The next most likely case is that this is Latin-1 that was intended to
    # be read as Windows-1252, because those two encodings in particular are
    # easily confused.
    if "latin-1" in possible_1byte_encodings:
        if "windows-1252" in possible_1byte_encodings:
            # This text is in the intersection of Latin-1 and
            # Windows-1252, so it's probably legit.
            return ExplainedText(text, [])
        else:
            # Otherwise, it means we have characters that are in Latin-1 but
            # not in Windows-1252. Those are C1 control characters. Nobody
            # wants those. Assume they were meant to be Windows-1252.
            try:
                fixed = text.encode("latin-1").decode("windows-1252")
                if fixed != text:
                    steps = [("encode", "latin-1"), ("decode", "windows-1252")]
                    return ExplainedText(fixed, steps)
            except UnicodeDecodeError:
                # Some byte has no meaning in Windows-1252; fall through to
                # the character-by-character fix below.
                pass

    # Fix individual characters of Latin-1 with a less satisfying explanation
    if config.fix_c1_controls and chardata.C1_CONTROL_RE.search(text):
        steps = [("transcode", "fix_c1_controls")]
        fixed = fixes.fix_c1_controls(text)
        return ExplainedText(fixed, steps)

    # The cases that remain are mixups between two different single-byte
    # encodings, and not the common case of Latin-1 vs. Windows-1252.
    #
    # With the new heuristic in 6.0, it's possible that we're closer to solving
    # these in some cases. It would require a lot of testing and tuning, though.
    # For now, we leave the text unchanged in these cases.
    return ExplainedText(text, [])


def fix_encoding(
    text: str, config: Optional[TextFixerConfig] = None, **kwargs
) -> str:
    """
    Apply just the encoding-fixing steps of ftfy to this text. Returns the
    fixed text, discarding the explanation.

    `config` defaults to `TextFixerConfig(explain=False)`, and keyword
    arguments override individual fields of it. The work is done by
    :func:`fix_encoding_and_explain`, which raises UnicodeError if `text`
    is bytes.

        >>> fix_encoding("Ã³")
        'ó'
        >>> fix_encoding("&ATILDE;&SUP3;")
        '&ATILDE;&SUP3;'
    """
    if config is None:
        config = TextFixerConfig(explain=False)
    config = _config_from_kwargs(config, kwargs)
    return fix_encoding_and_explain(text, config).text


# An alternate name for the main function: `ftfy.ftfy` is `ftfy.fix_text`.
ftfy = fix_text


def fix_text_segment(
    text: str, config: Optional[TextFixerConfig] = None, **kwargs
) -> str:
    """
    Fix text as a single segment, with a consistent sequence of steps that
    are applied to fix the text. Discard the explanation.

    `config` defaults to `TextFixerConfig(explain=False)`, and keyword
    arguments override individual fields of it. The work is done by
    :func:`fix_and_explain`, which raises UnicodeError if `text` is bytes.
    """
    if config is None:
        config = TextFixerConfig(explain=False)
    config = _config_from_kwargs(config, kwargs)
    return fix_and_explain(text, config).text


def fix_file(input_file, encoding=None, config=None, **kwargs):
    """
    Fix the text found in a file, one line at a time.

    `input_file` is iterated to get its lines. Lines that are already Unicode
    text are used as they are. Lines that are bytes are decoded with
    `encoding` if one was supplied. If not, unfortunately, we have to guess
    the encoding: the first line of bytes is passed to `guess_bytes`, and the
    encoding it reports is then used for the lines that follow. We make no
    promises about that guess; see the `guess_bytes` function for how it is
    done.

    `config` defaults to `TextFixerConfig()`, and keyword arguments override
    individual fields of it. When `unescape_html` is "auto", the first line
    containing a literal ``<`` turns HTML unescaping off for that line and
    every line after it.

    The output is a stream of fixed lines of text: this is a generator.
    """
    settings = TextFixerConfig() if config is None else config
    settings = _config_from_kwargs(settings, kwargs)

    for raw_line in input_file:
        if not isinstance(raw_line, bytes):
            line_text = raw_line
        elif encoding is None:
            # Remember the guessed encoding for the lines that follow.
            line_text, encoding = guess_bytes(raw_line)
        else:
            line_text = raw_line.decode(encoding)

        if settings.unescape_html == "auto" and "<" in line_text:
            settings = settings._replace(unescape_html=False)

        yield fix_and_explain(line_text, settings).text


def guess_bytes(bstring):
    """
    NOTE: Using `guess_bytes` is not the recommended way of using ftfy. ftfy
    is not designed to be an encoding detector.

    When all you have is bytes in an unknown encoding, this function guesses
    a reasonable way to decode them, by trying a few common encodings that
    can be told apart from each other. It returns a pair of the decoded text
    and the name of the encoding that was used.

    Unlike the rest of ftfy, this may not be accurate, and it may *create*
    Unicode problems instead of solving them!

    The encodings we try here are, in order:

    - UTF-16 with a byte order mark, because a UTF-16 byte order mark looks
      like nothing else
    - UTF-8, because it's the global standard, which has been used by a
      majority of the Web since 2008
    - "utf-8-variants", or buggy implementations of UTF-8
    - MacRoman, because Microsoft Office thinks it's still a thing, and it
      can be distinguished by its line breaks. (If there are no line breaks in
      the string, though, you're out of luck.)
    - "sloppy-windows-1252", the Latin-1-like encoding that is the most common
      single-byte encoding.

    Raises UnicodeError if given a `str`, which is already decoded.
    """
    if isinstance(bstring, str):
        raise UnicodeError(
            "This string was already decoded as Unicode. You should pass "
            "bytes to guess_bytes, not Unicode."
        )

    if bstring.startswith((b"\xfe\xff", b"\xff\xfe")):
        return bstring.decode("utf-16"), "utf-16"

    seen_bytes = set(bstring)

    # Byte 0xed can be used to encode a range of codepoints that are UTF-16
    # surrogates. UTF-8 does not use UTF-16 surrogates, so when we see 0xed,
    # it's very likely we're being asked to decode CESU-8, the variant that
    # encodes UTF-16 surrogates instead of the original characters themselves.
    #
    # This will occasionally trigger on standard UTF-8, as there are some
    # Korean characters that also use byte 0xed, but that's not harmful
    # because standard UTF-8 characters will decode the same way in our
    # 'utf-8-variants' codec.
    #
    # Byte 0xc0 is impossible because, numerically, it would only encode
    # characters lower than U+0040. Those already have single-byte
    # representations, and UTF-8 requires using the shortest possible
    # representation. However, Java hides the null codepoint, U+0000, in a
    # non-standard longer representation -- it encodes it as 0xc0 0x80
    # instead of 0x00, guaranteeing that 0x00 will never appear in the
    # encoded bytes.
    #
    # The 'utf-8-variants' decoder can handle both of these cases, as well
    # as standard UTF-8, at the cost of a bit of speed.
    if 0xED in seen_bytes or 0xC0 in seen_bytes:
        utf8_codec = "utf-8-variants"
    else:
        utf8_codec = "utf-8"

    try:
        return bstring.decode(utf8_codec), utf8_codec
    except UnicodeDecodeError:
        # Not any kind of UTF-8; fall back to the single-byte encodings.
        pass

    # Files that contain CR and not LF are likely to be MacRoman.
    if 0x0D in seen_bytes and 0x0A not in seen_bytes:
        return bstring.decode("macroman"), "macroman"

    return bstring.decode("sloppy-windows-1252"), "sloppy-windows-1252"


@no_type_check
def apply_plan(text: str, plan: List[Tuple[str, str]]):
    """
    Apply a plan for fixing the encoding of text.

    The plan is a list of tuples of the form (operation, arg), such as the
    explanation returned by :func:`fix_and_explain`. The steps are applied
    in order, each one to the result of the step before it, and the final
    result is returned.

    `operation` is one of:

    - `'encode'`: convert a string to bytes, using `arg` as the encoding
    - `'decode'`: convert bytes to a string, using `arg` as the encoding
    - `'transcode'`: convert bytes to bytes, using the function named `arg`
    - `'apply'`: convert a string to a string, using the function named `arg`
    - `'normalize'`: convert a string to a string, applying the Unicode
      normalization form named `arg`, such as "NFC"

    The functions that can be applied by 'transcode' and 'apply' are
    specifically those that appear in the dictionary named `FIXERS`. They
    can also be imported from the `ftfy.fixes` module.

    Raises ValueError if the plan contains an unknown operation, or an
    'apply' or 'transcode' step whose function is not in `FIXERS`.

    Example::

        >>> mojibake = "schÃ¶n"
        >>> text, plan = fix_and_explain(mojibake)
        >>> apply_plan(mojibake, plan)
        'schön'
    """
    obj = text
    for operation, arg in plan:
        if operation == "encode":
            obj = obj.encode(arg)
        elif operation == "decode":
            obj = obj.decode(arg)
        elif operation == "normalize":
            # `fix_and_explain` records its normalization step this way, so
            # it has to be accepted here for its plans to be replayable.
            obj = unicodedata.normalize(arg, obj)
        elif operation in ("transcode", "apply"):
            if arg not in FIXERS:
                raise ValueError("Unknown function to apply: %s" % arg)
            obj = FIXERS[arg](obj)
        else:
            raise ValueError("Unknown plan step: %s" % operation)

    return obj


def explain_unicode(text: str):
    """
    A utility method that's useful for debugging mysterious Unicode.

    For each codepoint in the string, it prints one line showing the
    codepoint's number in hexadecimal, its glyph, its category in the Unicode
    standard, and its name in the Unicode standard. Characters that aren't
    printable are shown as an escape sequence instead of a glyph, and
    characters that have no name are shown as ``<unknown>``.

        >>> explain_unicode('(╯°□°)╯︵ ┻━┻')
        U+0028  (       [Ps] LEFT PARENTHESIS
        U+256F  ╯       [So] BOX DRAWINGS LIGHT ARC UP AND LEFT
        U+00B0  °       [So] DEGREE SIGN
        U+25A1  □       [So] WHITE SQUARE
        U+00B0  °       [So] DEGREE SIGN
        U+0029  )       [Pe] RIGHT PARENTHESIS
        U+256F  ╯       [So] BOX DRAWINGS LIGHT ARC UP AND LEFT
        U+FE35  ︵      [Ps] PRESENTATION FORM FOR VERTICAL LEFT PARENTHESIS
        U+0020          [Zs] SPACE
        U+253B  ┻       [So] BOX DRAWINGS HEAVY UP AND HORIZONTAL
        U+2501  ━       [So] BOX DRAWINGS HEAVY HORIZONTAL
        U+253B  ┻       [So] BOX DRAWINGS HEAVY UP AND HORIZONTAL
    """
    for char in text:
        if char.isprintable():
            glyph = char
        else:
            glyph = char.encode("unicode-escape").decode("ascii")

        # Pad the glyph column to 7 cells so the later columns line up.
        glyph_column = display_ljust(glyph, 7)
        category = unicodedata.category(char)
        name = unicodedata.name(char, "<unknown>")
        print(f"U+{ord(char):04X}  {glyph_column} [{category}] {name}")
-												update deps

											
										
										
											6 years ago
+								"""
 								ftfy: fixes text for you
 								This is a module for making text less broken. See the `fix_text` function
 								for more information.
 								"""
 								import unicodedata
-												Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies.

											
										
										
											3 years ago
+								import warnings
-												Updated vendored dependencies.

											
										
										
											2 years ago
+								from typing import List, NamedTuple, Optional, Tuple, Union, no_type_check
-												Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies.

											
										
										
											3 years ago
 								from ftfy import bad_codecs
 								from ftfy import chardata, fixes
 								from ftfy.badness import is_bad
-												update deps

											
										
										
											6 years ago
+								from ftfy.formatting import display_ljust
-												Updated vendored dependencies.

											
										
										
											2 years ago
+								__version__ = "6.1.1"
-												Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies.

											
										
										
											3 years ago
 								# Though this function does nothing, it lets linters know that we're using
 								# ftfy.bad_codecs. See the docstring in `bad_codecs/__init__.py` for more.
 								bad_codecs.ok()
-												update deps

											
										
										
											6 years ago
-												Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies.

											
										
										
											3 years ago
+								class ExplainedText(NamedTuple):
 								    """
 								    The return type from ftfy's functions that provide an "explanation" of which
 								    steps it applied to fix the text, such as :func:`fix_and_explain()`.
 								    When the 'explain' option is disabled, these functions return the same
 								    type, but the `explanation` will be None.
 								    """
-												Updated vendored dependencies.

											
										
										
											2 years ago
-												Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies.

											
										
										
											3 years ago
+								    text: str
 								    explanation: Optional[List[Tuple[str, str]]]
-												update deps

											
										
										
											6 years ago
-												Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies.

											
										
										
											3 years ago
class TextFixerConfig(NamedTuple):
    r"""
    Immutable bundle of the options that control which fixes ftfy applies.

    Every field has a default, so a config is normally built by naming only
    the options that should differ from the defaults. For instance, this turns
    off 'unescape_html' and leaves everything else alone::

        TextFixerConfig(unescape_html=False)

    The options, with their defaults:

    - `unescape_html`: "auto"

      Whether HTML entities such as &amp; are replaced by the characters they
      stand for. With "auto", entities are unescaped unless the text contains
      a literal < character, which suggests the input is real HTML whose
      entities should be left alone. True always unescapes; False never does.

    - `remove_terminal_escapes`: True

      Strip "ANSI" terminal escape sequences, such as those that change the
      color of text in a terminal window.

    - `fix_encoding`: True

      Look for mojibake and try to repair it by decoding the text with a
      different encoding standard.

      The next four options adjust how `fix_encoding` works. (`fix_c1_controls`
      is additionally applied as a standalone step by `fix_and_explain`.)

      - `restore_byte_a0`: True

        Let a plain space (U+20) stand in for a non-breaking space (U+A0)
        when doing so completes a fixable mojibake sequence. Spaces are so
        common that this can cause false positives, although it is only
        applied when the evidence for mojibake is strong. Turning it off
        avoids those false positives at the price of false negatives.

      - `replace_lossy_sequences`: True

        Recognize mojibake in which some characters were already replaced by
        '�' (U+FFFD) or '?'. If the rest of the sequence would otherwise
        decode, the whole sequence is replaced with '�'.

      - `decode_inconsistent_utf8`: True

        When a sequence clearly looks like UTF-8 mojibake but the string as a
        whole cannot be consistently reinterpreted in another encoding,
        replace that sequence with the UTF-8 characters it represents anyway.
        This helps with strings concatenated from differently-encoded parts.

      - `fix_c1_controls`: True

        Replace C1 control characters inherited from Latin-1 with their
        Windows-1252 equivalents, as HTML5 does, even when the string as a
        whole does not decode as Latin-1.

    - `fix_latin_ligatures`: True

      Expand common Latin-alphabet ligatures, such as ``ﬁ``, into their
      component letters.

    - `fix_character_width`: True

      Convert fullwidth Latin characters and halfwidth Katakana to their
      more standard widths.

    - `uncurl_quotes`: True

      Turn curly quotes into straight quotes.

    - `fix_line_breaks`: True

      Convert the various kinds of line break to the standard Unix line
      break, ``\n``.

    - `fix_surrogates`: True

      Replace sequences of UTF-16 surrogate codepoints with the character
      they were meant to encode. This repairs text decoded with the obsolete
      UCS-2 standard, so that high-numbered codepoints such as emoji work.

    - `remove_control_chars`: True

      Drop certain control characters that have no displayed effect on text.

    - `normalization`: "NFC"

      Which Unicode normalization form to apply. The default, NFC, merges
      letters followed by combining characters into single combined
      characters.

      "NFKC" applies more compatibility conversions, such as replacing the
      'micro sign' with the identical-looking Greek lowercase mu. Some NFKC
      conversions change meaning, though: "10³" becomes "103".

      Use None to skip normalization entirely.

    - `max_decode_length`: 1_000_000

      The largest "segment" that ftfy will try to fix in one go.

    - `explain`: True

      Whether to build 'explanations': lists describing what ftfy changed.
      When False, the explanation is None and the code that would build it
      is skipped, which may save time.

      Functions that accept a TextFixerConfig but return no explanation
      default to `explain=False` when no config is supplied.
    """

    # NOTE: field order is part of the tuple's public shape; do not reorder.
    unescape_html: Union[str, bool] = "auto"
    remove_terminal_escapes: bool = True
    fix_encoding: bool = True
    restore_byte_a0: bool = True
    replace_lossy_sequences: bool = True
    decode_inconsistent_utf8: bool = True
    fix_c1_controls: bool = True
    fix_latin_ligatures: bool = True
    fix_character_width: bool = True
    uncurl_quotes: bool = True
    fix_line_breaks: bool = True
    fix_surrogates: bool = True
    remove_control_chars: bool = True
    normalization: Optional[str] = "NFC"
    max_decode_length: int = 1_000_000
    explain: bool = True
-												Updated vendored dependencies.

											
										
										
											2 years ago
+								def _config_from_kwargs(config: TextFixerConfig, kwargs: dict) -> TextFixerConfig:
-												Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies.

											
										
										
											3 years ago
+								    """
 								    Handle parameters provided as keyword arguments to ftfy's top-level
 								    functions, converting them into a TextFixerConfig.
 								    """
-												Updated vendored dependencies.

											
										
										
											2 years ago
+								    if "fix_entities" in kwargs:
-												Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies.

											
										
										
											3 years ago
+								        warnings.warn(
-												Updated vendored dependencies.

											
										
										
											2 years ago
+								            "`fix_entities` has been renamed to `unescape_html`", DeprecationWarning
-												Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies.

											
										
										
											3 years ago
+								        )
 								        kwargs = kwargs.copy()
-												Updated vendored dependencies.

											
										
										
											2 years ago
+								        kwargs["unescape_html"] = kwargs["fix_entities"]
 								        del kwargs["fix_entities"]
-												Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies.

											
										
										
											3 years ago
+								    config = config._replace(**kwargs)
 								    return config
 								FIXERS = {
 								    "unescape_html": fixes.unescape_html,
 								    "remove_terminal_escapes": fixes.remove_terminal_escapes,
 								    "restore_byte_a0": fixes.restore_byte_a0,
 								    "replace_lossy_sequences": fixes.replace_lossy_sequences,
 								    "decode_inconsistent_utf8": fixes.decode_inconsistent_utf8,
 								    "fix_c1_controls": fixes.fix_c1_controls,
 								    "fix_latin_ligatures": fixes.fix_latin_ligatures,
 								    "fix_character_width": fixes.fix_character_width,
 								    "uncurl_quotes": fixes.uncurl_quotes,
 								    "fix_line_breaks": fixes.fix_line_breaks,
 								    "fix_surrogates": fixes.fix_surrogates,
 								    "remove_control_chars": fixes.remove_control_chars,
 								}
# Message of the UnicodeError raised by the text-fixing entry points when they
# are handed `bytes` instead of `str`.
BYTES_ERROR_TEXT = """Hey wait, this isn't Unicode.
ftfy is designed to fix problems with text. Treating bytes like they're
interchangeable with Unicode text is usually something that introduces
problems with text.
You should first decode these bytes from the encoding you think they're in.
If you're not sure what encoding they're in:
- First, try to find out. 'utf-8' is a good assumption.
- If the encoding is simply unknowable, try running your bytes through
  ftfy.guess_bytes. As the name implies, this may not always be accurate.
For more information on the distinction between bytes and text, read the
Python Unicode HOWTO:
    http://docs.python.org/3/howto/unicode.html
"""
-												Updated vendored dependencies.

											
										
										
											2 years ago
-												Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies.

											
										
										
											3 years ago
def _try_fix(
    fixer_name: str, text: str, config: TextFixerConfig, steps: Optional[list]
) -> str:
    """
    Run the fixer called `fixer_name` on `text` if `config` enables it.

    Shared by several 'fixer' steps. When the fixer changes the text and
    `steps` is a list, an ``("apply", fixer_name)`` entry is appended to it.
    Returns the (possibly unchanged) text.
    """
    if not getattr(config, fixer_name):
        # This fixer is switched off in the config.
        return text

    result = FIXERS[fixer_name](text)
    if steps is not None and result != text:
        steps.append(("apply", fixer_name))
    return result
 								def fix_text(text: str, config: Optional[TextFixerConfig] = None, **kwargs) -> str:
 								    r"""
 								    Given Unicode text as input, fix inconsistencies and glitches in it,
 								    such as mojibake (text that was decoded in the wrong encoding).
 								    Let's start with some examples:
 								        >>> fix_text('âœ” No problems')
 								        '✔ No problems'
-												update deps

											
										
										
											6 years ago
 								        >>> print(fix_text("&macr;\\_(ã\x83\x84)_/&macr;"))
 								        ¯\_(ツ)_/¯
-												Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies.

											
										
										
											3 years ago
+								        >>> fix_text('Broken text&hellip; it&#x2019;s ﬂubberiﬁc!')
 								        "Broken text... it's flubberific!"
 								        >>> fix_text('ＬＯＵＤ　ＮＯＩＳＥＳ')
 								        'LOUD NOISES'
 								    ftfy applies a number of different fixes to the text, and can accept
 								    configuration to select which fixes to apply.
 								    The configuration takes the form of a :class:`TextFixerConfig` object,
 								    and you can see a description of the options in that class's docstring
 								    or in the full documentation at ftfy.readthedocs.org.
 								    For convenience and backward compatibility, the configuration can also
 								    take the form of keyword arguments, which will set the equivalently-named
 								    fields of the TextFixerConfig object.
 								    For example, here are two ways to fix text but skip the "uncurl_quotes"
 								    step::
 								        fix_text(text, TextFixerConfig(uncurl_quotes=False))
 								        fix_text(text, uncurl_quotes=False)
 								    This function fixes text in independent segments, which are usually lines
 								    of text, or arbitrarily broken up every 1 million codepoints (configurable
 								    with `config.max_decode_length`) if there aren't enough line breaks. The
 								    bound on segment lengths helps to avoid unbounded slowdowns.
 								    ftfy can also provide an 'explanation', a list of transformations it applied
 								    to the text that would fix more text like it. This function doesn't provide
 								    explanations (because there may be different fixes for different segments
 								    of text).
 								    To get an explanation, use the :func:`fix_and_explain()` function, which
 								    fixes the string in one segment and explains what it fixed.
-												update deps

											
										
										
											6 years ago
+								    """
-												Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies.

											
										
										
											3 years ago
 								    if config is None:
 								        config = TextFixerConfig(explain=False)
 								    config = _config_from_kwargs(config, kwargs)
-												update deps

											
										
										
											6 years ago
+								    if isinstance(text, bytes):
-												Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies.

											
										
										
											3 years ago
+								        raise UnicodeError(BYTES_ERROR_TEXT)
-												update deps

											
										
										
											6 years ago
 								    out = []
 								    pos = 0
 								    while pos < len(text):
-												Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies.

											
										
										
											3 years ago
+								        textbreak = text.find("\n", pos) + 1
-												update deps

											
										
										
											6 years ago
+								        if textbreak == 0:
 								            textbreak = len(text)
-												Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies.

											
										
										
											3 years ago
+								        if (textbreak - pos) > config.max_decode_length:
 								            textbreak = pos + config.max_decode_length
 								        segment = text[pos:textbreak]
 								        if config.unescape_html == "auto" and "<" in segment:
 								            config = config._replace(unescape_html=False)
 								        fixed_segment, _ = fix_and_explain(segment, config)
 								        out.append(fixed_segment)
-												update deps

											
										
										
											6 years ago
+								        pos = textbreak
-												Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies.

											
										
										
											3 years ago
+								    return "".join(out)
 								def fix_and_explain(
 								    text: str, config: Optional[TextFixerConfig] = None, **kwargs
 								) -> ExplainedText:
 								    """
 								    Fix text as a single segment, returning the fixed text and an explanation
 								    of what was fixed.
 								    The explanation is a list of steps that can be applied with
 								    :func:`apply_plan`, or if config.explain is False, it will be None.
 								    """
 								    if config is None:
 								        config = TextFixerConfig()
 								    if isinstance(text, bytes):
 								        raise UnicodeError(BYTES_ERROR_TEXT)
 								    config = _config_from_kwargs(config, kwargs)
 								    if config.unescape_html == "auto" and "<" in text:
 								        config = config._replace(unescape_html=False)
 								    if config.explain:
 								        steps: Optional[List[Tuple[str, str]]] = []
 								    else:
 								        # If explanations aren't desired, `steps` will be None
 								        steps = None
 								    while True:
 								        origtext = text
 								        text = _try_fix("unescape_html", text, config, steps)
 								        if config.fix_encoding:
 								            if steps is None:
 								                text = fix_encoding(text)
 								            else:
 								                text, encoding_steps = fix_encoding_and_explain(text, config)
-												Updated vendored dependencies.

											
										
										
											2 years ago
+								                if encoding_steps is not None:
 								                    steps.extend(encoding_steps)
-												Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies.

											
										
										
											3 years ago
 								        for fixer in [
 								            "fix_c1_controls",
 								            "fix_latin_ligatures",
 								            "fix_character_width",
 								            "uncurl_quotes",
 								            "fix_line_breaks",
 								            "fix_surrogates",
 								            "remove_terminal_escapes",
 								            "remove_control_chars",
 								        ]:
 								            text = _try_fix(fixer, text, config, steps)
 								        if config.normalization is not None:
 								            fixed = unicodedata.normalize(config.normalization, text)
 								            if steps is not None and fixed != text:
 								                steps.append(("normalize", config.normalization))
 								            text = fixed
 								        if text == origtext:
 								            return ExplainedText(text, steps)
 								def fix_encoding_and_explain(
 								    text: str, config: Optional[TextFixerConfig] = None, **kwargs
 								) -> ExplainedText:
 								    """
 								    Apply the steps of ftfy that detect mojibake and fix it. Returns the fixed
 								    text and a list explaining what was fixed.
 								    This includes fixing text by encoding and decoding it in different encodings,
 								    as well as the subordinate fixes `restore_byte_a0`, `replace_lossy_sequences`,
 								    `decode_inconsistent_utf8`, and `fix_c1_controls`.
 								    Examples::
 								        >>> fix_encoding_and_explain("sÃ³")
 								        ExplainedText(text='só', explanation=[('encode', 'latin-1'), ('decode', 'utf-8')])
 								        >>> result = fix_encoding_and_explain("voilÃ le travail")
 								        >>> result.text
 								        'voilà le travail'
 								        >>> result.explanation
 								        [('encode', 'latin-1'), ('transcode', 'restore_byte_a0'), ('decode', 'utf-8')]
 								    """
 								    if config is None:
 								        config = TextFixerConfig()
 								    if isinstance(text, bytes):
 								        raise UnicodeError(BYTES_ERROR_TEXT)
 								    config = _config_from_kwargs(config, kwargs)
 								    if not config.fix_encoding:
 								        # A weird trivial case: we're asked to fix the encoding, but skip
 								        # fixing the encoding
 								        return ExplainedText(text, [])
 								    plan_so_far: List[Tuple[str, str]] = []
 								    while True:
 								        prevtext = text
 								        text, plan = _fix_encoding_one_step_and_explain(text, config)
-												Updated vendored dependencies.

											
										
										
											2 years ago
+								        if plan is not None:
 								            plan_so_far.extend(plan)
-												Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies.

											
										
										
											3 years ago
+								        if text == prevtext:
 								            return ExplainedText(text, plan_so_far)
def _fix_encoding_one_step_and_explain(
    text: str, config: TextFixerConfig
) -> ExplainedText:
    """
    Perform one step of fixing the encoding of text.

    The strategies below are tried in order, and the first one that applies
    decides the result:

    1. Empty text, ASCII-only text, and text that `is_bad` does not flag are
       returned unchanged.
    2. For each encoding in `chardata.CHARMAP_ENCODINGS` that can represent
       the text, encode it and try to decode the bytes as UTF-8 (or
       'utf-8-variants'), optionally applying `restore_byte_a0` and
       `replace_lossy_sequences` to the bytes first.
    3. Decode isolated UTF-8-looking sequences (`decode_inconsistent_utf8`).
    4. Re-decode Latin-1 text as Windows-1252.
    5. Replace individual C1 control characters (`fix_c1_controls`).

    Returns an ExplainedText holding the resulting text and the list of
    steps that were applied; the list is empty when nothing was changed.
    """
    if config is None:
        config = TextFixerConfig()

    if len(text) == 0:
        return ExplainedText(text, [])

    # The first plan is to return ASCII text unchanged, as well as text
    # that doesn't look like it contains mojibake
    if chardata.possible_encoding(text, "ascii") or not is_bad(text):
        return ExplainedText(text, [])

    # As we go through the next step, remember the possible encodings
    # that we encounter but don't successfully fix yet. We may need them
    # later.
    possible_1byte_encodings = []

    # Suppose the text was supposed to be UTF-8, but it was decoded using
    # a single-byte encoding instead. When these cases can be fixed, they
    # are usually the correct thing to do, so try them next.
    for encoding in chardata.CHARMAP_ENCODINGS:
        if chardata.possible_encoding(text, encoding):
            possible_1byte_encodings.append(encoding)
            encoded_bytes = text.encode(encoding)
            encode_step = ("encode", encoding)
            transcode_steps = []

            # Now, find out if it's UTF-8 (or close enough). Otherwise,
            # remember the encoding for later.
            try:
                decoding = "utf-8"
                # Check encoded_bytes for sequences that would be UTF-8,
                # except they have b' ' where b'\xa0' would belong.
                if config.restore_byte_a0 and chardata.ALTERED_UTF8_RE.search(
                    encoded_bytes
                ):
                    replaced_bytes = fixes.restore_byte_a0(encoded_bytes)
                    if replaced_bytes != encoded_bytes:
                        transcode_steps.append(("transcode", "restore_byte_a0"))
                        encoded_bytes = replaced_bytes

                # Replace sequences where information has been lost
                if config.replace_lossy_sequences and encoding.startswith("sloppy"):
                    replaced_bytes = fixes.replace_lossy_sequences(encoded_bytes)
                    if replaced_bytes != encoded_bytes:
                        transcode_steps.append(("transcode", "replace_lossy_sequences"))
                        encoded_bytes = replaced_bytes

                # Bytes 0xED and 0xC0 select the more lenient
                # 'utf-8-variants' decoder instead of plain UTF-8.
                if 0xED in encoded_bytes or 0xC0 in encoded_bytes:
                    decoding = "utf-8-variants"

                decode_step = ("decode", decoding)
                steps = [encode_step] + transcode_steps + [decode_step]
                fixed = encoded_bytes.decode(decoding)
                return ExplainedText(fixed, steps)

            except UnicodeDecodeError:
                # Not decodable this way; move on to the next candidate
                # encoding. It stays in `possible_1byte_encodings`.
                pass

    # Look for a-hat-euro sequences that remain, and fix them in isolation.
    if config.decode_inconsistent_utf8 and chardata.UTF8_DETECTOR_RE.search(text):
        steps = [("apply", "decode_inconsistent_utf8")]
        fixed = fixes.decode_inconsistent_utf8(text)
        if fixed != text:
            return ExplainedText(fixed, steps)

    # The next most likely case is that this is Latin-1 that was intended to
    # be read as Windows-1252, because those two encodings in particular are
    # easily confused.
    if "latin-1" in possible_1byte_encodings:
        if "windows-1252" in possible_1byte_encodings:
            # This text is in the intersection of Latin-1 and
            # Windows-1252, so it's probably legit.
            return ExplainedText(text, [])
        else:
            # Otherwise, it means we have characters that are in Latin-1 but
            # not in Windows-1252. Those are C1 control characters. Nobody
            # wants those. Assume they were meant to be Windows-1252.
            try:
                fixed = text.encode("latin-1").decode("windows-1252")
                if fixed != text:
                    steps = [("encode", "latin-1"), ("decode", "windows-1252")]
                    return ExplainedText(fixed, steps)
            except UnicodeDecodeError:
                # Some byte has no Windows-1252 meaning; fall through to the
                # character-by-character fix below.
                pass

    # Fix individual characters of Latin-1 with a less satisfying explanation
    if config.fix_c1_controls and chardata.C1_CONTROL_RE.search(text):
        steps = [("transcode", "fix_c1_controls")]
        fixed = fixes.fix_c1_controls(text)
        return ExplainedText(fixed, steps)

    # The cases that remain are mixups between two different single-byte
    # encodings, and not the common case of Latin-1 vs. Windows-1252.
    #
    # With the new heuristic in 6.0, it's possible that we're closer to solving
    # these in some cases. It would require a lot of testing and tuning, though.
    # For now, we leave the text unchanged in these cases.
    return ExplainedText(text, [])
 								def fix_encoding(text: str, config: TextFixerConfig = None, **kwargs):
 								    """
 								    Apply just the encoding-fixing steps of ftfy to this text. Returns the
 								    fixed text, discarding the explanation.
 								        >>> fix_encoding("Ã³")
 								        'ó'
 								        >>> fix_encoding("&ATILDE;&SUP3;")
 								        '&ATILDE;&SUP3;'
 								    """
 								    if config is None:
 								        config = TextFixerConfig(explain=False)
 								    config = _config_from_kwargs(config, kwargs)
 								    fixed, _explan = fix_encoding_and_explain(text, config)
 								    return fixed
-												update deps

											
										
										
											6 years ago
# An alternate name for the main function: `ftfy` is bound to the same
# function object as `fix_text`.
ftfy = fix_text
-												Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies.

											
										
										
											3 years ago
 								def fix_text_segment(text: str, config: TextFixerConfig = None, **kwargs):
 								    """
 								    Fix text as a single segment, with a consistent sequence of steps that
 								    are applied to fix the text. Discard the explanation.
 								    """
 								    if config is None:
 								        config = TextFixerConfig(explain=False)
 								    config = _config_from_kwargs(config, kwargs)
 								    fixed, _explan = fix_and_explain(text, config)
 								    return fixed
 								def fix_file(input_file, encoding=None, config=None, **kwargs):
-												update deps

											
										
										
											6 years ago
+								    """
 								    Fix text that is found in a file.
 								    If the file is being read as Unicode text, use that. If it's being read as
 								    bytes, then we hope an encoding was supplied. If not, unfortunately, we
 								    have to guess what encoding it is. We'll try a few common encodings, but we
 								    make no promises. See the `guess_bytes` function for how this is done.
 								    The output is a stream of fixed lines of text.
 								    """
-												Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies.

											
										
										
											3 years ago
+								    if config is None:
 								        config = TextFixerConfig()
 								    config = _config_from_kwargs(config, kwargs)
-												update deps

											
										
										
											6 years ago
+								    for line in input_file:
 								        if isinstance(line, bytes):
 								            if encoding is None:
 								                line, encoding = guess_bytes(line)
 								            else:
 								                line = line.decode(encoding)
-												Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies.

											
										
										
											3 years ago
+								        if config.unescape_html == "auto" and "<" in line:
 								            config = config._replace(unescape_html=False)
-												update deps

											
										
										
											6 years ago
-												Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies.

											
										
										
											3 years ago
+								        fixed_line, _explan = fix_and_explain(line, config)
 								        yield fixed_line
-												update deps

											
										
										
											6 years ago
 								def guess_bytes(bstring):
 								    """
 								    NOTE: Using `guess_bytes` is not the recommended way of using ftfy. ftfy
 								    is not designed to be an encoding detector.
 								    In the unfortunate situation that you have some bytes in an unknown
 								    encoding, ftfy can guess a reasonable strategy for decoding them, by trying
 								    a few common encodings that can be distinguished from each other.
 								    Unlike the rest of ftfy, this may not be accurate, and it may *create*
 								    Unicode problems instead of solving them!
 								    The encodings we try here are:
 								    - UTF-16 with a byte order mark, because a UTF-16 byte order mark looks
 								      like nothing else
 								    - UTF-8, because it's the global standard, which has been used by a
 								      majority of the Web since 2008
-												Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies.

											
										
										
											3 years ago
+								    - "utf-8-variants", or buggy implementations of UTF-8
-												update deps

											
										
										
											6 years ago
+								    - MacRoman, because Microsoft Office thinks it's still a thing, and it
 								      can be distinguished by its line breaks. (If there are no line breaks in
 								      the string, though, you're out of luck.)
 								    - "sloppy-windows-1252", the Latin-1-like encoding that is the most common
-												Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies.

											
										
										
											3 years ago
+								      single-byte encoding.
-												update deps

											
										
										
											6 years ago
+								    """
-												Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies.

											
										
										
											3 years ago
+								    if isinstance(bstring, str):
-												update deps

											
										
										
											6 years ago
+								        raise UnicodeError(
 								            "This string was already decoded as Unicode. You should pass "
 								            "bytes to guess_bytes, not Unicode."
 								        )
-												Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies.

											
										
										
											3 years ago
+								    if bstring.startswith(b"\xfe\xff") or bstring.startswith(b"\xff\xfe"):
 								        return bstring.decode("utf-16"), "utf-16"
-												update deps

											
										
										
											6 years ago
-												Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies.

											
										
										
											3 years ago
+								    byteset = set(bstring)
-												update deps

											
										
										
											6 years ago
+								    try:
-												Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies.

											
										
										
											3 years ago
+								        if 0xED in byteset or 0xC0 in byteset:
-												update deps

											
										
										
											6 years ago
+								            # Byte 0xed can be used to encode a range of codepoints that
 								            # are UTF-16 surrogates. UTF-8 does not use UTF-16 surrogates,
 								            # so when we see 0xed, it's very likely we're being asked to
 								            # decode CESU-8, the variant that encodes UTF-16 surrogates
 								            # instead of the original characters themselves.
 								            #
 								            # This will occasionally trigger on standard UTF-8, as there
 								            # are some Korean characters that also use byte 0xed, but that's
-												Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies.

											
										
										
											3 years ago
+								            # not harmful because standard UTF-8 characters will decode the
 								            # same way in our 'utf-8-variants' codec.
-												update deps

											
										
										
											6 years ago
+								            #
 								            # Byte 0xc0 is impossible because, numerically, it would only
 								            # encode characters lower than U+0040. Those already have
 								            # single-byte representations, and UTF-8 requires using the
 								            # shortest possible representation. However, Java hides the null
 								            # codepoint, U+0000, in a non-standard longer representation -- it
 								            # encodes it as 0xc0 0x80 instead of 0x00, guaranteeing that 0x00
 								            # will never appear in the encoded bytes.
 								            #
 								            # The 'utf-8-variants' decoder can handle both of these cases, as
 								            # well as standard UTF-8, at the cost of a bit of speed.
-												Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies.

											
										
										
											3 years ago
+								            return bstring.decode("utf-8-variants"), "utf-8-variants"
-												update deps

											
										
										
											6 years ago
+								        else:
-												Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies.

											
										
										
											3 years ago
+								            return bstring.decode("utf-8"), "utf-8"
-												update deps

											
										
										
											6 years ago
+								    except UnicodeDecodeError:
 								        pass
-												Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies.

											
										
										
											3 years ago
+								    if 0x0D in byteset and 0x0A not in byteset:
 								        # Files that contain CR and not LF are likely to be MacRoman.
 								        return bstring.decode("macroman"), "macroman"
 								    return bstring.decode("sloppy-windows-1252"), "sloppy-windows-1252"
-												Updated vendored dependencies.

											
										
										
											2 years ago
@no_type_check
def apply_plan(text: str, plan: List[Tuple[str, str]]):
    """
    Apply a plan for fixing the encoding of text.

    The plan is a list of tuples of the form (operation, arg).

    `operation` is one of:

    - `'encode'`: convert a string to bytes, using `arg` as the encoding
    - `'decode'`: convert bytes to a string, using `arg` as the encoding
    - `'transcode'`: convert bytes to bytes, using the function named `arg`
    - `'apply'`: convert a string to a string, using the function named `arg`

    The functions that can be applied by 'transcode' and 'apply' are
    specifically those that appear in the dictionary named `FIXERS`. They
    can also be imported from the `ftfy.fixes` module.

    The steps are applied in order, each one receiving the result of the
    previous step. An empty plan returns `text` unchanged.

    Raises `ValueError` if a step names an unknown operation, or if a
    'transcode' or 'apply' step names a function that is not in `FIXERS`.

    Example::

        >>> mojibake = "schÃ¶n"
        >>> text, plan = fix_and_explain(mojibake)
        >>> apply_plan(mojibake, plan)
        'schön'
    """
    # `obj` alternates between str and bytes as the plan is carried out,
    # which is why this function opts out of type checking.
    obj = text
    for operation, encoding in plan:
        if operation == "encode":
            obj = obj.encode(encoding)
        elif operation == "decode":
            obj = obj.decode(encoding)
        elif operation in ("transcode", "apply"):
            # For these two operations, the second element of the step is
            # the name of a fixer function rather than a codec name.
            if encoding in FIXERS:
                obj = FIXERS[encoding](obj)
            else:
                raise ValueError("Unknown function to apply: %s" % encoding)
        else:
            raise ValueError("Unknown plan step: %s" % operation)

    return obj
 								def explain_unicode(text: str):
-												update deps

											
										
										
											6 years ago
+								    """
 								    A utility method that's useful for debugging mysterious Unicode.
 								    It breaks down a string, showing you for each codepoint its number in
 								    hexadecimal, its glyph, its category in the Unicode standard, and its name
 								    in the Unicode standard.
 								        >>> explain_unicode('(╯°□°)╯︵ ┻━┻')
 								        U+0028  (       [Ps] LEFT PARENTHESIS
 								        U+256F  ╯       [So] BOX DRAWINGS LIGHT ARC UP AND LEFT
 								        U+00B0  °       [So] DEGREE SIGN
 								        U+25A1  □       [So] WHITE SQUARE
 								        U+00B0  °       [So] DEGREE SIGN
 								        U+0029  )       [Pe] RIGHT PARENTHESIS
 								        U+256F  ╯       [So] BOX DRAWINGS LIGHT ARC UP AND LEFT
 								        U+FE35  ︵      [Ps] PRESENTATION FORM FOR VERTICAL LEFT PARENTHESIS
 								        U+0020          [Zs] SPACE
 								        U+253B  ┻       [So] BOX DRAWINGS HEAVY UP AND HORIZONTAL
 								        U+2501  ━       [So] BOX DRAWINGS HEAVY HORIZONTAL
 								        U+253B  ┻       [So] BOX DRAWINGS HEAVY UP AND HORIZONTAL
 								    """
 								    for char in text:
-												Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies.

											
										
										
											3 years ago
+								        if char.isprintable():
-												update deps

											
										
										
											6 years ago
+								            display = char
 								        else:
-												Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies.

											
										
										
											3 years ago
+								            display = char.encode("unicode-escape").decode("ascii")
 								        print(
 								            "U+{code:04X}  {display} [{category}] {name}".format(
 								                display=display_ljust(display, 7),
 								                code=ord(char),
 								                category=unicodedata.category(char),
 								                name=unicodedata.name(char, "<unknown>"),
 								            )
 								        )