"""Guess the natural language of a text
"""
# © 2012 spirit <hiddenspirit@gmail.com>
# https://bitbucket.org/spirit/guess_language
#
# Original Python package:
# Copyright (c) 2008, Kent S Johnson
# http://code.google.com/p/guess-language/
#
# Original C++ version for KDE:
# Copyright (c) 2006 Jacob R Rideout <kde@jacobrideout.net>
# http://websvn.kde.org/branches/work/sonnet-refactoring/common/nlp/guesslanguage.cpp?view=markup
#
# Original Language::Guess Perl module:
# Copyright (c) 2004-2006 Maciej Ceglowski
# http://web.archive.org/web/20090228163219/http://languid.cantbedone.org/
#
# Note: Language::Guess is GPL-licensed. KDE developers received permission
# from the author to distribute their port under LGPL:
# http://lists.kde.org/?l=kde-sonnet&m=116910092228811&w=2
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU Lesser General Public License as published
# by the Free Software Foundation, either version 3 of the License,
# or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty
# of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
|
|
|
|
|
|
import functools
import re
import warnings

from collections import Counter, OrderedDict, defaultdict

from .data import BLOCKS, BLOCK_RSHIFT
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Names exported by ``from guess_language import *``.
__all__ = ["guess_language", "use_enchant"]
|
|
|
|
|
|
|
|
|
|
# Only the first MAX_LENGTH characters of a text are examined.
MAX_LENGTH = 4096
# Samples shorter than MIN_LENGTH characters are not matched against models.
MIN_LENGTH = 20
# Number of leading trigrams compared by distance(); also the penalty added
# for a trigram that is missing from the known model.
MAX_GRAMS = 300
# A word is a run of letters (no digits, no underscores) and apostrophes.
WORD_RE = re.compile(r"(?:[^\W\d_]|['’])+", flags=re.UNICODE)
# Dotted prefix of the modules holding the per-language trigram models.
MODEL_ROOT = __name__ + ".data.models."
# Locale tag used when the current locale cannot be determined.
FALLBACK_LANGUAGE = "en_US"
|
|
|
|
|
|
|
|
|
|
# Candidate language tags, grouped by the script they are written in.
BASIC_LATIN = {
    "ceb", "en", "eu", "ha", "haw", "id",
    "la", "nr", "nso", "so", "ss", "st",
    "sw", "tlh", "tn", "ts", "xh", "zu",
}
EXTENDED_LATIN = {
    "af", "az", "ca", "cs", "cy", "da", "de", "eo",
    "es", "et", "fi", "fr", "hr", "hu", "is", "it",
    "lt", "lv", "nb", "nl", "pl", "pt", "ro", "sk",
    "sl", "sq", "sv", "tl", "tr", "ve", "vi",
}
ALL_LATIN = BASIC_LATIN | EXTENDED_LATIN
CYRILLIC = {"bg", "kk", "ky", "mk", "mn", "ru", "sr", "uk", "uz"}
ARABIC = {"ar", "fa", "ps", "ur"}
DEVANAGARI = {"hi", "ne"}
# Regional variants tried once a text has been identified as Portuguese.
PT = {"pt_BR", "pt_PT"}
|
|
|
|
|
|
|
|
|
|
# Unicode blocks that identify one language: (block name, language tag).
# NOTE: Mongolian appears twice, as "mn-Mong" here (Mongolian script) and
# as "mn" in CYRILLIC.
SINGLETONS = [
    ("Armenian", "hy"), ("Hebrew", "he"), ("Bengali", "bn"),
    ("Gurmukhi", "pa"), ("Greek", "el"), ("Gujarati", "gu"),
    ("Oriya", "or"), ("Tamil", "ta"), ("Telugu", "te"),
    ("Kannada", "kn"), ("Malayalam", "ml"), ("Sinhala", "si"),
    ("Thai", "th"), ("Lao", "lo"), ("Tibetan", "bo"),
    ("Burmese", "my"), ("Georgian", "ka"), ("Mongolian", "mn-Mong"),
    ("Khmer", "km"),
]
|
|
|
|
|
|
|
|
|
|
# Language tag -> English language name (looked up by guess_language_name()).
NAME_MAP = {
    "ab": "Abkhazian", "af": "Afrikaans", "ar": "Arabic", "az": "Azeri",
    "be": "Byelorussian", "bg": "Bulgarian", "bn": "Bengali",
    "bo": "Tibetan", "br": "Breton",
    "ca": "Catalan", "ceb": "Cebuano", "cs": "Czech", "cy": "Welsh",
    "da": "Danish", "de": "German",
    "el": "Greek", "en": "English", "eo": "Esperanto", "es": "Spanish",
    "et": "Estonian", "eu": "Basque",
    "fa": "Farsi", "fi": "Finnish", "fo": "Faroese", "fr": "French",
    "fy": "Frisian",
    "gd": "Scots Gaelic", "gl": "Galician", "gu": "Gujarati",
    "ha": "Hausa", "haw": "Hawaiian", "he": "Hebrew", "hi": "Hindi",
    "hr": "Croatian", "hu": "Hungarian", "hy": "Armenian",
    "id": "Indonesian", "is": "Icelandic", "it": "Italian",
    "ja": "Japanese",
    "ka": "Georgian", "kk": "Kazakh", "km": "Cambodian", "ko": "Korean",
    "ku": "Kurdish", "ky": "Kyrgyz",
    "la": "Latin", "lt": "Lithuanian", "lv": "Latvian",
    "mg": "Malagasy", "mk": "Macedonian", "ml": "Malayalam",
    "mn": "Mongolian", "mr": "Marathi", "ms": "Malay",
    "nd": "Ndebele", "ne": "Nepali", "nl": "Dutch", "nn": "Nynorsk",
    "no": "Norwegian", "nso": "Sepedi",
    "pa": "Punjabi", "pl": "Polish", "ps": "Pashto", "pt": "Portuguese",
    "pt_PT": "Portuguese (Portugal)", "pt_BR": "Portuguese (Brazil)",
    "ro": "Romanian", "ru": "Russian",
    "sa": "Sanskrit", "sh": "Serbo-Croatian", "sk": "Slovak",
    "sl": "Slovene", "so": "Somali", "sq": "Albanian", "sr": "Serbian",
    "sv": "Swedish", "sw": "Swahili",
    "ta": "Tamil", "te": "Telugu", "th": "Thai", "tl": "Tagalog",
    "tlh": "Klingon", "tn": "Setswana", "tr": "Turkish", "ts": "Tsonga",
    "tw": "Twi",
    "uk": "Ukrainian", "ur": "Urdu", "uz": "Uzbek",
    "ve": "Venda", "vi": "Vietnamese",
    "xh": "Xhosa",
    "zh": "Chinese", "zh_TW": "Traditional Chinese (Taiwan)", "zu": "Zulu",
}
|
|
|
|
|
|
|
|
|
|
# Language tag -> numeric language ID (looked up by guess_language_id()).
IANA_MAP = {
    "ab": 12026, "af": 40, "ar": 26020, "az": 26030,
    "be": 11890, "bg": 26050, "bn": 26040, "bo": 26601, "br": 1361,
    "ca": 3, "ceb": 26060, "cs": 26080, "cy": 26560,
    "da": 26090, "de": 26160,
    "el": 26165, "en": 26110, "eo": 11933, "es": 26460, "et": 26120,
    "eu": 1232,
    "fa": 26130, "fi": 26140, "fo": 11817, "fr": 26150, "fy": 1353,
    "gd": 65555, "gl": 1252, "gu": 26599,
    "ha": 26170, "haw": 26180, "he": 26592, "hi": 26190, "hr": 26070,
    "hu": 26200, "hy": 26597,
    "id": 26220, "is": 26210, "it": 26230,
    "ja": 26235,
    "ka": 26600, "kk": 26240, "km": 1222, "ko": 26255, "ku": 11815,
    "ky": 26260,
    "la": 26280, "lt": 26300, "lv": 26290,
    "mg": 1362, "mk": 26310, "ml": 26598, "mn": 26320, "mr": 1201,
    "ms": 1147,
    "ne": 26330, "nl": 26100, "nn": 172, "no": 26340,
    "pa": 65550, "pl": 26380, "ps": 26350, "pt": 26390,
    "ro": 26400, "ru": 26410,
    "sa": 1500, "sh": 1399, "sk": 26430, "sl": 26440, "so": 26450,
    "sq": 26010, "sr": 26420, "sv": 26480, "sw": 26470,
    "ta": 26595, "te": 26596, "th": 26594, "tl": 26490, "tlh": 26250,
    "tn": 65578, "tr": 26500, "tw": 1499,
    "uk": 26520, "ur": 26530, "uz": 26540,
    "vi": 26550,
    "zh": 26065, "zh_TW": 22,
}
|
|
|
|
|
|
|
|
|
|
# Cache of trigram models, keyed by lower-cased language tag. It is filled
# lazily by check_with_models(); None marks a model that failed to import.
models = {}
|
|
|
|
|
|
|
|
|
|
# importlib is part of every Python 3 standard library, and this module
# already requires Python 3 (it uses function annotations), so the former
# hand-written fallback built on __import__() could never run.
from importlib import import_module
|
|
|
|
|
|
|
|
|
|
# collections.namedtuple exists on every Python 3 release, so the former
# tuple-subclass fallback for a missing namedtuple could never run.
from collections import namedtuple

# Record returned by guess_language_info(): (tag, id, name).
LanguageInfo = namedtuple("LanguageInfo", ["tag", "id", "name"])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class UNKNOWN(str):
    """String subclass for an unknown language; always evaluates as false.
    """

    def __bool__(self):
        return False


# Rebind the name to the only instance of the class, so that callers can
# return it, compare against it with ``is`` and test it like a constant.
UNKNOWN = UNKNOWN("UNKNOWN")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def guess_language(text: str, hints=None):
    """Guess the language of *text* and return its language tag.

    Only the first MAX_LENGTH characters are examined. Typographic
    apostrophes are replaced by plain ones before the text is split into
    words with WORD_RE.

    Args:
        text: the text to analyse.
        hints: optional iterable of language tags that restricts the
            candidate languages.

    Returns:
        The tag chosen by identify(), or UNKNOWN.
    """
    sample = text[:MAX_LENGTH].replace("’", "'")
    words = WORD_RE.findall(sample)
    scripts = find_runs(words)
    return identify(words, scripts, hints)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def guess_language_info(text: str, hints=None):
    """Guess the language of *text* and return LanguageInfo(tag, id, name).

    All three fields are UNKNOWN when the language cannot be guessed. The
    id and the name are also UNKNOWN when the tag has no entry in IANA_MAP
    or NAME_MAP respectively.
    """
    tag = guess_language(text, hints)
    if tag is not UNKNOWN:
        return LanguageInfo(tag, _get_id(tag), _get_name(tag))
    return LanguageInfo(UNKNOWN, UNKNOWN, UNKNOWN)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# guess_language_tag() is another name for guess_language().
guess_language_tag = guess_language
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def guess_language_id(text: str, hints=None):
    """Guess the language of *text* and return its numeric ID.

    Returns UNKNOWN when the guessed tag has no entry in IANA_MAP.
    """
    tag = guess_language(text, hints)
    return _get_id(tag)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def guess_language_name(text: str, hints=None):
    """Guess the language of *text* and return its name (in English).

    Returns UNKNOWN when the guessed tag has no entry in NAME_MAP.
    """
    tag = guess_language(text, hints)
    return _get_name(tag)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _get_id(tag):
    """Return the IANA_MAP entry for *tag*, or UNKNOWN if there is none.
    """
    return IANA_MAP[tag] if tag in IANA_MAP else UNKNOWN
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _get_name(tag):
    """Return the NAME_MAP entry for *tag*, or UNKNOWN if there is none.
    """
    return NAME_MAP[tag] if tag in NAME_MAP else UNKNOWN
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def find_runs(words):
    """Return the character blocks that dominate *words*.

    Every character of every word is looked up in BLOCKS and counted per
    block. A block is kept when it accounts for at least 40% of all
    characters; "Basic Latin" is already kept from 15%. Percentages are
    rounded down.

    Args:
        words: iterable of strings.

    Returns:
        List of block names, in the order the blocks were first met.
    """
    block_counts = defaultdict(int)
    total = 0

    for word in words:
        for char in word:
            block_counts[BLOCKS[ord(char) >> BLOCK_RSHIFT]] += 1
            total += 1

    # A lower threshold for "Latin Extended Additional" (10%, meant for
    # Vietnamese) is currently disabled.
    relevant = []
    for block, count in block_counts.items():
        share = count * 100 // total
        threshold = 15 if block == "Basic Latin" else 40
        if share >= threshold:
            relevant.append(block)

    return relevant
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def identify(words, scripts, hints=None):
    """Identify the language from the words and their dominant scripts.

    Scripts tied to one language are decided at once. Scripts shared by
    several languages are passed to the active checker together with the
    candidate languages of that script.

    Args:
        words: list of words taken from the text.
        scripts: names of the relevant character blocks, as returned by
            find_runs().
        hints: optional iterable of language tags that restricts the
            candidate languages.

    Returns:
        A language tag, or UNKNOWN when no language could be identified.
    """
    def has_any(*blocks):
        return any(block in scripts for block in blocks)

    if has_any("Hangul Syllables", "Hangul Jamo",
               "Hangul Compatibility Jamo", "Hangul"):
        return "ko"

    if "Greek and Coptic" in scripts:
        return "el"

    if "Kana" in scripts:
        return "ja"

    # Ceglowski's and Rideout's versions also tested
    # "Arabic Presentation Forms-A" here; that test is left out on purpose.
    if has_any("CJK Unified Ideographs", "Bopomofo",
               "Bopomofo Extended", "KangXi Radicals"):
        return "zh"

    if "Cyrillic" in scripts:
        return check(words, filter_languages(CYRILLIC, hints))

    if has_any("Arabic", "Arabic Presentation Forms-A",
               "Arabic Presentation Forms-B"):
        return check(words, filter_languages(ARABIC, hints))

    if "Devanagari" in scripts:
        return check(words, filter_languages(DEVANAGARI, hints))

    # Try languages with unique scripts
    for block_name, lang_name in SINGLETONS:
        if block_name in scripts:
            return lang_name

    # Detecting Vietnamese through "Latin Extended Additional" is disabled.

    if "Extended Latin" in scripts:
        latin_lang = check(words, filter_languages(EXTENDED_LATIN, hints))
        if latin_lang == "pt":
            # Try to tell the regional variant apart. When that check is
            # inconclusive (UNKNOWN is false), keep the plain "pt" result
            # instead of throwing a positive identification away.
            return check(words, filter_languages(PT)) or latin_lang
        return latin_lang

    if "Basic Latin" in scripts:
        return check(words, filter_languages(ALL_LATIN, hints))

    return UNKNOWN
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def filter_languages(languages, hints=None):
    """Restrict *languages* to the tags that are also listed in *hints*.

    Args:
        languages: set of candidate language tags.
        hints: optional iterable of language tags.

    Returns:
        *languages* itself when *hints* is empty or None, otherwise a new
        set holding the tags found in both.
    """
    if not hints:
        return languages
    return languages.intersection(hints)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def check_with_all(words, languages):
    """Find the best match, trying spelling dictionaries before models.

    The result of check_with_enchant() is returned when it is true;
    otherwise the result of check_with_models() is returned.
    """
    match = check_with_enchant(words, languages)
    if match:
        return match
    return check_with_models(words, languages)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Checker currently used by identify(); use_enchant() rebinds this name.
check = check_with_all
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def use_enchant(use_enchant=True):
    """Enable or disable checking with PyEnchant.

    Rebinds the module-level ``check`` to check_with_all() when
    *use_enchant* is true, and to check_with_models() otherwise.
    """
    global check
    if use_enchant:
        check = check_with_all
    else:
        check = check_with_models
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def check_with_models(words, languages):
    """Pick the language whose trigram model is closest to *words*.

    Args:
        words: list of words taken from the text.
        languages: iterable of candidate language tags.

    Returns:
        The tag with the smallest distance() to the sample, or UNKNOWN
        when the sample is shorter than MIN_LENGTH characters or when no
        candidate has a usable model.
    """
    sample = " ".join(words)
    if len(sample) < MIN_LENGTH:
        return UNKNOWN

    sample_model = create_ordered_model(sample)
    scores = []

    for tag in languages:
        module_key = tag.lower()

        # Import each model module only once; a failed import is cached
        # as None so that it is not attempted again.
        if module_key not in models:
            try:
                models[module_key] = import_module(
                    MODEL_ROOT + module_key).model
            except ImportError:
                models[module_key] = None
        known_model = models[module_key]

        if known_model:
            scores.append((distance(sample_model, known_model), tag))

    if not scores:
        return UNKNOWN

    # The smallest distance is the most likely match.
    return min(scores)[1]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def create_ordered_model(content):
    """Create the list of trigrams in *content*, most frequent first.

    The text is lower-cased and every run of three consecutive characters
    is counted. Trigrams with equal counts are ordered alphabetically.

    Args:
        content: the sample text.

    Returns:
        List of distinct trigrams; empty when *content* is shorter than
        three characters.
    """
    content = content.lower()
    trigrams = Counter(content[i:i + 3] for i in range(len(content) - 2))
    return sorted(trigrams, key=lambda trigram: (-trigrams[trigram], trigram))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def distance(model, known_model):
    """Calculate the distance between *model* and *known_model*.

    Only the first MAX_GRAMS trigrams of *model* are compared. A trigram
    present in *known_model* adds the absolute difference between its
    position in *model* and the value stored in *known_model*; a missing
    trigram adds MAX_GRAMS.

    Args:
        model: list of trigrams, as built by create_ordered_model().
        known_model: mapping from trigram to a number.

    Returns:
        The summed distance; 0 for an empty *model*.
    """
    return sum(
        abs(rank - known_model[trigram]) if trigram in known_model
        else MAX_GRAMS
        for rank, trigram in enumerate(model[:MAX_GRAMS])
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
try:
    import enchant
except ImportError:
    warnings.warn("PyEnchant is unavailable", ImportWarning)
    enchant = None

    def check_with_enchant(*args, **kwargs):
        """Stand-in used when PyEnchant is unavailable: always UNKNOWN.
        """
        return UNKNOWN
else:
    import locale

    # Built on first use by get_enchant_base_languages_dict().
    enchant_base_languages_dict = None

    def check_with_enchant(words, languages,
                           threshold=0.7, min_words=1, dictionaries={}):
        """Check against installed spelling dictionaries.

        Args:
            words: list of words taken from the text.
            languages: collection of candidate language tags.
            threshold: minimal share of *words* the best dictionary has
                to accept.
            min_words: minimal number of words needed for a check.
            dictionaries: cache of enchant.Dict objects keyed by language
                tag. The mutable default is intentional: it is shared by
                all calls, so each dictionary is created only once.

        Returns:
            The tag whose dictionary accepts the most words, or UNKNOWN
            when there are too few words or the best share is below
            *threshold*.
        """
        # An empty word list can never match; the explicit test also
        # avoids a division by zero when min_words is 0.
        if not words or len(words) < min_words:
            return UNKNOWN

        best_score = 0
        best_tag = UNKNOWN

        for tag, enchant_tag in get_enchant_base_languages_dict().items():
            if tag not in languages:
                continue
            try:
                checker = dictionaries[tag]
            except KeyError:
                checker = dictionaries[tag] = enchant.Dict(enchant_tag)
            score = sum(1 for word in words if checker.check(word))
            if score > best_score:
                best_score = score
                best_tag = tag

        if best_score / len(words) < threshold:
            return UNKNOWN

        return best_tag

    def get_enchant_base_languages_dict():
        """Get ordered dictionary of enchant base languages.

        Maps each base language ("en") to one installed enchant tag
        ("en_US"). The order is: the locale language, then the language
        of FALLBACK_LANGUAGE, then the rest in sorted order. The mapping
        is built once and cached.
        """
        global enchant_base_languages_dict
        if enchant_base_languages_dict is None:
            # Build into a local and publish it only when complete, so an
            # exception raised half-way does not leave an empty cache
            # behind for every later call.
            base_languages = OrderedDict()
            available = sorted(enchant.list_languages())

            for full_tag in [get_locale_language(), FALLBACK_LANGUAGE]:
                sub_tag = full_tag.split("_")[0]
                if sub_tag in base_languages:
                    continue
                # Prefer the exact tag, then the bare language.
                for tag in [full_tag, sub_tag]:
                    if tag in available:
                        base_languages[sub_tag] = tag
                        available.remove(tag)
                        break

            for tag in available:
                base_languages.setdefault(tag.split("_")[0], tag)

            enchant_base_languages_dict = base_languages
        return enchant_base_languages_dict

    def get_locale_language():
        """Get the language code for the current locale setting.

        Falls back to FALLBACK_LANGUAGE when no language is configured.
        """
        language = locale.getlocale()[0]
        if not language:
            # locale.getdefaultlocale() is deprecated since Python 3.11;
            # look it up defensively so that its removal is not fatal.
            getdefaultlocale = getattr(locale, "getdefaultlocale", None)
            if getdefaultlocale is not None:
                language = getdefaultlocale()[0]
        return language or FALLBACK_LANGUAGE
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def deprecated(func):
    """Decorator that marks *func* as deprecated.

    Each call of the decorated function emits a DeprecationWarning that
    points at the caller, then forwards to *func*.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        message = "call to deprecated function %s()" % func.__name__
        warnings.warn(message, category=DeprecationWarning, stacklevel=2)
        return func(*args, **kwargs)

    return wrapper
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@deprecated
def guessLanguage(text):
    """Deprecated function - use guess_language() instead.

    *text* is passed through decode_text() first.
    """
    decoded = decode_text(text)
    return guess_language(decoded)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@deprecated
def guessLanguageTag(text):
    """Deprecated function - use guess_language_tag() instead.

    *text* is passed through decode_text() first.
    """
    decoded = decode_text(text)
    return guess_language_tag(decoded)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@deprecated
def guessLanguageId(text):
    """Deprecated function - use guess_language_id() instead.

    *text* is passed through decode_text() first.
    """
    decoded = decode_text(text)
    return guess_language_id(decoded)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@deprecated
def guessLanguageName(text):
    """Deprecated function - use guess_language_name() instead.

    *text* is passed through decode_text() first.
    """
    decoded = decode_text(text)
    return guess_language_name(decoded)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@deprecated
def guessLanguageInfo(text):
    """Deprecated function - use guess_language_info() instead.

    *text* is passed through decode_text() first.
    """
    decoded = decode_text(text)
    return guess_language_info(decoded)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def decode_text(text, encoding="utf-8"):
    """Decode text if needed (for deprecated functions).

    Args:
        text: a str, or an encoded string offering ``decode()``.
        encoding: encoding used to decode a non-str *text*.

    Returns:
        *text* unchanged when it is already a str; otherwise the decoded
        text, after a DeprecationWarning has been emitted.
    """
    if isinstance(text, str):
        return text

    warnings.warn("passing an encoded string is deprecated",
                  DeprecationWarning, 4)
    return text.decode(encoding)
|