|
|
|
"""
A command-line utility for fixing text found in a file.
"""
|
|
|
|
import os
|
|
|
|
import sys
|
|
|
|
|
|
|
|
from ftfy import __version__, fix_file, TextFixerConfig
|
|
|
|
|
|
|
|
# Written to stderr by main() when writing fixed text to the output stream
# raises UnicodeEncodeError on any platform other than win32.
ENCODE_ERROR_TEXT_UNIX = """ftfy error:
Unfortunately, this output stream does not support Unicode.

Your system locale may be very old or misconfigured. You should use a locale
that supports UTF-8. One way to do this is to `export LANG=C.UTF-8`.
"""
|
|
|
|
|
|
|
|
# Written to stderr by main() when writing fixed text to the output stream
# raises UnicodeEncodeError and sys.platform is 'win32'.
ENCODE_ERROR_TEXT_WINDOWS = """ftfy error:
Unfortunately, this output stream does not support Unicode.

You might be trying to output to the Windows Command Prompt (cmd.exe), which
does not fully support Unicode for historical reasons. In general, we recommend
finding a way to run Python without using cmd.exe.

You can work around this problem by using the '-o filename' option in ftfy to
output to a file instead.
"""
|
|
|
|
|
|
|
|
# %-format template written to stderr by main() when reading the input raises
# UnicodeDecodeError. main() fills %r with the encoding it was using and %s
# with the exception itself.
DECODE_ERROR_TEXT = """ftfy error:
This input couldn't be decoded as %r. We got the following error:

%s

ftfy works best when its input is in a known encoding. You can use `ftfy -g`
to guess, if you're desperate. Otherwise, give the encoding name with the
`-e` option, such as `ftfy -e latin-1`.
"""
|
|
|
|
|
|
|
|
# Written to stderr by main() when the output path resolves to the same file
# as the input path.
SAME_FILE_ERROR_TEXT = """ftfy error:
Can't read and write the same file. Please output to a new file instead.
"""
|
|
|
|
|
|
|
|
|
|
|
|
def main():
    """
    Run ftfy as a command-line utility.

    Parses the command-line arguments, reads bytes from the named input file
    (or standard input when the filename is '-'), passes them through
    `fix_file`, and writes everything `fix_file` yields to the output file
    (or standard output when the output is '-').

    Any file this function opens is closed before it returns or exits.

    Exits with status 1, after writing an explanation to standard error, when:

    - the output path resolves to the same file as the input path,
    - a UnicodeDecodeError is raised while reading the input, or
    - a UnicodeEncodeError is raised while writing to the output stream.
    """
    # Imported here rather than at module level, so that merely importing
    # this module does not load them.
    import argparse
    from contextlib import ExitStack

    parser = argparse.ArgumentParser(
        description="ftfy (fixes text for you), version %s" % __version__
    )
    parser.add_argument(
        'filename',
        default='-',
        nargs='?',
        help='The file whose Unicode is to be fixed. Defaults '
        'to -, meaning standard input.',
    )
    parser.add_argument(
        '-o',
        '--output',
        type=str,
        default='-',
        help='The file to output to. Defaults to -, meaning '
        'standard output.',
    )
    parser.add_argument(
        '-g',
        '--guess',
        action='store_true',
        help="Ask ftfy to guess the encoding of your input. "
        "This is risky. Overrides -e.",
    )
    parser.add_argument(
        '-e',
        '--encoding',
        type=str,
        default='utf-8',
        help='The encoding of the input. Defaults to UTF-8.',
    )
    parser.add_argument(
        '-n',
        '--normalization',
        type=str,
        default='NFC',
        help='The normalization of Unicode to apply. '
        'Defaults to NFC. Can be "none".',
    )
    parser.add_argument(
        '--preserve-entities',
        action='store_true',
        help="Leave HTML entities as they are. The default "
        "is to decode them, as long as no HTML tags "
        "have appeared in the file.",
    )

    args = parser.parse_args()

    # -g overrides -e: the encoding passed to fix_file becomes None.
    encoding = args.encoding
    if args.guess:
        encoding = None

    # The ExitStack closes whichever files we open ourselves, on normal
    # completion and on every sys.exit() path alike. The standard streams are
    # never registered with it, so they are left open.
    with ExitStack() as stack:
        reading_stdin = args.filename == '-'
        if reading_stdin:
            # Get a standard input stream made of bytes, so we can decode it
            # as whatever encoding is necessary.
            file = sys.stdin.buffer
        else:
            file = stack.enter_context(open(args.filename, 'rb'))

        if args.output == '-':
            outfile = sys.stdout
        else:
            # Refuse to overwrite the file being read. When the input is
            # standard input, '-' is not a path, so there is nothing to
            # compare against (an output file literally named '-' is fine).
            if not reading_stdin and (
                os.path.realpath(args.output) == os.path.realpath(args.filename)
            ):
                sys.stderr.write(SAME_FILE_ERROR_TEXT)
                sys.exit(1)
            outfile = stack.enter_context(
                open(args.output, 'w', encoding='utf-8')
            )

        # The literal word "none" (in any case) turns normalization off.
        normalization = args.normalization
        if normalization.lower() == 'none':
            normalization = None

        if args.preserve_entities:
            unescape_html = False
        else:
            unescape_html = 'auto'

        config = TextFixerConfig(
            unescape_html=unescape_html,
            normalization=normalization
        )

        try:
            for line in fix_file(
                file,
                encoding=encoding,
                config=config
            ):
                try:
                    outfile.write(line)
                except UnicodeEncodeError:
                    # The output stream can't represent the fixed text; the
                    # advice we give depends on the platform.
                    if sys.platform == 'win32':
                        sys.stderr.write(ENCODE_ERROR_TEXT_WINDOWS)
                    else:
                        sys.stderr.write(ENCODE_ERROR_TEXT_UNIX)
                    sys.exit(1)
        except UnicodeDecodeError as err:
            sys.stderr.write(DECODE_ERROR_TEXT % (encoding, err))
            sys.exit(1)
|
|
|
|
|
|
|
|
|
|
|
|
# Run the command-line interface only when this file is executed as a script,
# not when it is imported.
if __name__ == '__main__':
    main()
|