You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
48 lines
1.7 KiB
48 lines
1.7 KiB
# coding: utf-8
|
|
"""
|
|
This file defines a general method for evaluating ftfy using data that arrives
|
|
in a stream. A concrete implementation of it is found in `twitter_tester.py`.
|
|
"""
|
|
from __future__ import print_function, unicode_literals
|
|
from ftfy import fix_text
|
|
from ftfy.fixes import fix_encoding, unescape_html
|
|
from ftfy.chardata import possible_encoding
|
|
|
|
|
|
class StreamTester:
    """
    Run a stream of texts through ftfy, one at a time, and print every text
    that ftfy would alter. Running totals are kept so that progress markers
    and a "fixed/seen" summary can be printed at regular intervals.
    """

    def __init__(self):
        # Number of texts checked so far, and how many of those ftfy changed.
        self.count = 0
        self.num_fixed = 0

    def check_ftfy(self, text, encoding_only=True):
        """
        Check a single text and report whether ftfy changes it.

        HTML entities are unescaped first. A text that `possible_encoding`
        accepts as 'ascii' is counted but not examined any further. Any other
        text is passed to `fix_encoding` when `encoding_only` is true, or to
        `fix_text` (with quote uncurling and character-width fixing switched
        off) when it is false.

        A text that comes back different is printed next to its fixed form
        and added to `num_fixed`. A text that comes back unchanged but still
        contains one of the marker substrings tested below (presumably
        leftover mojibake) is printed as "Not fixed".

        Nothing is returned; all results go to standard output.
        """
        self.count += 1
        text = unescape_html(text)

        if not possible_encoding(text, 'ascii'):
            # Only one of the two fixers is ever invoked for a given text.
            repaired = (
                fix_encoding(text)
                if encoding_only
                else fix_text(
                    text, uncurl_quotes=False, fix_character_width=False
                )
            )
            was_changed = text != repaired

            if was_changed:
                # possibly filter common bots before printing
                report = '\nText:\t{text!r}\nFixed:\t{fixed!r}\n'.format(
                    text=text, fixed=repaired
                )
                print(report)
                self.num_fixed += 1
            elif any(marker in text for marker in ('â€', '\x80')):
                print('\nNot fixed:\t{text!r}'.format(text=text))

        # Progress output: a dot every 100 texts, a summary every 10000.
        seen = self.count
        if seen % 100 == 0:
            print('.', end='', flush=True)
        if seen % 10000 == 0:
            print('\n%d/%d fixed' % (self.num_fixed, seen))
|