|
|
|
|
#!/usr/bin/env python
|
|
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
"""
|
|
|
|
|
Metadata about languages used by our model training code for our
|
|
|
|
|
SingleByteCharSetProbers. Could be used for other things in the future.
|
|
|
|
|
|
|
|
|
|
This code is based on the language metadata from the uchardet project.
|
|
|
|
|
"""
|
|
|
|
|
from __future__ import absolute_import, print_function
|
|
|
|
|
|
|
|
|
|
from string import ascii_letters
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# TODO: Add Ukranian (KOI8-U)
|
|
|
|
|
|
|
|
|
|
class Language(object):
|
|
|
|
|
"""Metadata about a language useful for training models
|
|
|
|
|
|
|
|
|
|
:ivar name: The human name for the language, in English.
|
|
|
|
|
:type name: str
|
|
|
|
|
:ivar iso_code: 2-letter ISO 639-1 if possible, 3-letter ISO code otherwise,
|
|
|
|
|
or use another catalog as a last resort.
|
|
|
|
|
:type iso_code: str
|
|
|
|
|
:ivar use_ascii: Whether or not ASCII letters should be included in trained
|
|
|
|
|
models.
|
|
|
|
|
:type use_ascii: bool
|
|
|
|
|
:ivar charsets: The charsets we want to support and create data for.
|
|
|
|
|
:type charsets: list of str
|
|
|
|
|
:ivar alphabet: The characters in the language's alphabet. If `use_ascii` is
|
|
|
|
|
`True`, you only need to add those not in the ASCII set.
|
|
|
|
|
:type alphabet: str
|
|
|
|
|
:ivar wiki_start_pages: The Wikipedia pages to start from if we're crawling
|
|
|
|
|
Wikipedia for training data.
|
|
|
|
|
:type wiki_start_pages: list of str
|
|
|
|
|
"""
|
|
|
|
|
def __init__(self, name=None, iso_code=None, use_ascii=True, charsets=None,
|
|
|
|
|
alphabet=None, wiki_start_pages=None):
|
|
|
|
|
super(Language, self).__init__()
|
|
|
|
|
self.name = name
|
|
|
|
|
self.iso_code = iso_code
|
|
|
|
|
self.use_ascii = use_ascii
|
|
|
|
|
self.charsets = charsets
|
|
|
|
|
if self.use_ascii:
|
|
|
|
|
if alphabet:
|
|
|
|
|
alphabet += ascii_letters
|
|
|
|
|
else:
|
|
|
|
|
alphabet = ascii_letters
|
|
|
|
|
elif not alphabet:
|
|
|
|
|
raise ValueError('Must supply alphabet if use_ascii is False')
|
|
|
|
|
self.alphabet = ''.join(sorted(set(alphabet))) if alphabet else None
|
|
|
|
|
self.wiki_start_pages = wiki_start_pages
|
|
|
|
|
|
|
|
|
|
def __repr__(self):
|
|
|
|
|
return '{}({})'.format(self.__class__.__name__,
|
|
|
|
|
', '.join('{}={!r}'.format(k, v)
|
|
|
|
|
for k, v in self.__dict__.items()
|
|
|
|
|
if not k.startswith('_')))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
LANGUAGES = {'Arabic': Language(name='Arabic',
|
|
|
|
|
iso_code='ar',
|
|
|
|
|
use_ascii=False,
|
|
|
|
|
# We only support encodings that use isolated
|
|
|
|
|
# forms, because the current recommendation is
|
|
|
|
|
# that the rendering system handles presentation
|
|
|
|
|
# forms. This means we purposefully skip IBM864.
|
|
|
|
|
charsets=['ISO-8859-6', 'WINDOWS-1256',
|
|
|
|
|
'CP720', 'CP864'],
|
|
|
|
|
alphabet=u'ءآأؤإئابةتثجحخدذرزسشصضطظعغػؼؽؾؿـفقكلمنهوىيًٌٍَُِّ',
|
|
|
|
|
wiki_start_pages=[u'الصفحة_الرئيسية']),
|
|
|
|
|
'Belarusian': Language(name='Belarusian',
|
|
|
|
|
iso_code='be',
|
|
|
|
|
use_ascii=False,
|
|
|
|
|
charsets=['ISO-8859-5', 'WINDOWS-1251',
|
|
|
|
|
'IBM866', 'MacCyrillic'],
|
|
|
|
|
alphabet=(u'АБВГДЕЁЖЗІЙКЛМНОПРСТУЎФХЦЧШЫЬЭЮЯ'
|
|
|
|
|
u'абвгдеёжзійклмнопрстуўфхцчшыьэюяʼ'),
|
|
|
|
|
wiki_start_pages=[u'Галоўная_старонка']),
|
|
|
|
|
'Bulgarian': Language(name='Bulgarian',
|
|
|
|
|
iso_code='bg',
|
|
|
|
|
use_ascii=False,
|
|
|
|
|
charsets=['ISO-8859-5', 'WINDOWS-1251',
|
|
|
|
|
'IBM855'],
|
|
|
|
|
alphabet=(u'АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯ'
|
|
|
|
|
u'абвгдежзийклмнопрстуфхцчшщъьюя'),
|
|
|
|
|
wiki_start_pages=[u'Начална_страница']),
|
|
|
|
|
'Czech': Language(name='Czech',
|
|
|
|
|
iso_code='cz',
|
|
|
|
|
use_ascii=True,
|
|
|
|
|
charsets=['ISO-8859-2', 'WINDOWS-1250'],
|
|
|
|
|
alphabet=u'áčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ',
|
|
|
|
|
wiki_start_pages=[u'Hlavní_strana']),
|
|
|
|
|
'Danish': Language(name='Danish',
|
|
|
|
|
iso_code='da',
|
|
|
|
|
use_ascii=True,
|
|
|
|
|
charsets=['ISO-8859-1', 'ISO-8859-15',
|
|
|
|
|
'WINDOWS-1252'],
|
|
|
|
|
alphabet=u'æøåÆØÅ',
|
|
|
|
|
wiki_start_pages=[u'Forside']),
|
|
|
|
|
'German': Language(name='German',
|
|
|
|
|
iso_code='de',
|
|
|
|
|
use_ascii=True,
|
|
|
|
|
charsets=['ISO-8859-1', 'WINDOWS-1252'],
|
|
|
|
|
alphabet=u'äöüßÄÖÜ',
|
|
|
|
|
wiki_start_pages=[u'Wikipedia:Hauptseite']),
|
|
|
|
|
'Greek': Language(name='Greek',
|
|
|
|
|
iso_code='el',
|
|
|
|
|
use_ascii=False,
|
|
|
|
|
charsets=['ISO-8859-7', 'WINDOWS-1253'],
|
|
|
|
|
alphabet=(u'αβγδεζηθικλμνξοπρσςτυφχψωάέήίόύώ'
|
|
|
|
|
u'ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΣΤΥΦΧΨΩΆΈΉΊΌΎΏ'),
|
|
|
|
|
wiki_start_pages=[u'Πύλη:Κύρια']),
|
|
|
|
|
'English': Language(name='English',
|
|
|
|
|
iso_code='en',
|
|
|
|
|
use_ascii=True,
|
|
|
|
|
charsets=['ISO-8859-1', 'WINDOWS-1252'],
|
|
|
|
|
wiki_start_pages=[u'Main_Page']),
|
|
|
|
|
'Esperanto': Language(name='Esperanto',
|
|
|
|
|
iso_code='eo',
|
|
|
|
|
# Q, W, X, and Y not used at all
|
|
|
|
|
use_ascii=False,
|
|
|
|
|
charsets=['ISO-8859-3'],
|
|
|
|
|
alphabet=(u'abcĉdefgĝhĥijĵklmnoprsŝtuŭvz'
|
|
|
|
|
u'ABCĈDEFGĜHĤIJĴKLMNOPRSŜTUŬVZ'),
|
|
|
|
|
wiki_start_pages=[u'Vikipedio:Ĉefpaĝo']),
|
|
|
|
|
'Spanish': Language(name='Spanish',
|
|
|
|
|
iso_code='es',
|
|
|
|
|
use_ascii=True,
|
|
|
|
|
charsets=['ISO-8859-1', 'ISO-8859-15',
|
|
|
|
|
'WINDOWS-1252'],
|
|
|
|
|
alphabet=u'ñáéíóúüÑÁÉÍÓÚÜ',
|
|
|
|
|
wiki_start_pages=[u'Wikipedia:Portada']),
|
|
|
|
|
'Estonian': Language(name='Estonian',
|
|
|
|
|
iso_code='et',
|
|
|
|
|
use_ascii=False,
|
|
|
|
|
charsets=['ISO-8859-4', 'ISO-8859-13',
|
|
|
|
|
'WINDOWS-1257'],
|
|
|
|
|
# C, F, Š, Q, W, X, Y, Z, Ž are only for
|
|
|
|
|
# loanwords
|
|
|
|
|
alphabet=(u'ABDEGHIJKLMNOPRSTUVÕÄÖÜ'
|
|
|
|
|
u'abdeghijklmnoprstuvõäöü'),
|
|
|
|
|
wiki_start_pages=[u'Esileht']),
|
|
|
|
|
'Finnish': Language(name='Finnish',
|
|
|
|
|
iso_code='fi',
|
|
|
|
|
use_ascii=True,
|
|
|
|
|
charsets=['ISO-8859-1', 'ISO-8859-15',
|
|
|
|
|
'WINDOWS-1252'],
|
|
|
|
|
alphabet=u'ÅÄÖŠŽåäöšž',
|
|
|
|
|
wiki_start_pages=[u'Wikipedia:Etusivu']),
|
|
|
|
|
'French': Language(name='French',
|
|
|
|
|
iso_code='fr',
|
|
|
|
|
use_ascii=True,
|
|
|
|
|
charsets=['ISO-8859-1', 'ISO-8859-15',
|
|
|
|
|
'WINDOWS-1252'],
|
|
|
|
|
alphabet=u'œàâçèéîïùûêŒÀÂÇÈÉÎÏÙÛÊ',
|
|
|
|
|
wiki_start_pages=[u'Wikipédia:Accueil_principal',
|
|
|
|
|
u'Bœuf (animal)']),
|
|
|
|
|
'Hebrew': Language(name='Hebrew',
|
|
|
|
|
iso_code='he',
|
|
|
|
|
use_ascii=False,
|
|
|
|
|
charsets=['ISO-8859-8', 'WINDOWS-1255'],
|
|
|
|
|
alphabet=u'אבגדהוזחטיךכלםמןנסעףפץצקרשתװױײ',
|
|
|
|
|
wiki_start_pages=[u'עמוד_ראשי']),
|
|
|
|
|
'Croatian': Language(name='Croatian',
|
|
|
|
|
iso_code='hr',
|
|
|
|
|
# Q, W, X, Y are only used for foreign words.
|
|
|
|
|
use_ascii=False,
|
|
|
|
|
charsets=['ISO-8859-2', 'WINDOWS-1250'],
|
|
|
|
|
alphabet=(u'abcčćdđefghijklmnoprsštuvzž'
|
|
|
|
|
u'ABCČĆDĐEFGHIJKLMNOPRSŠTUVZŽ'),
|
|
|
|
|
wiki_start_pages=[u'Glavna_stranica']),
|
|
|
|
|
'Hungarian': Language(name='Hungarian',
|
|
|
|
|
iso_code='hu',
|
|
|
|
|
# Q, W, X, Y are only used for foreign words.
|
|
|
|
|
use_ascii=False,
|
|
|
|
|
charsets=['ISO-8859-2', 'WINDOWS-1250'],
|
|
|
|
|
alphabet=(u'abcdefghijklmnoprstuvzáéíóöőúüű'
|
|
|
|
|
u'ABCDEFGHIJKLMNOPRSTUVZÁÉÍÓÖŐÚÜŰ'),
|
|
|
|
|
wiki_start_pages=[u'Kezdőlap']),
|
|
|
|
|
'Italian': Language(name='Italian',
|
|
|
|
|
iso_code='it',
|
|
|
|
|
use_ascii=True,
|
|
|
|
|
charsets=['ISO-8859-1', 'ISO-8859-15',
|
|
|
|
|
'WINDOWS-1252'],
|
|
|
|
|
alphabet=u'ÀÈÉÌÒÓÙàèéìòóù',
|
|
|
|
|
wiki_start_pages=[u'Pagina_principale']),
|
|
|
|
|
'Lithuanian': Language(name='Lithuanian',
|
|
|
|
|
iso_code='lt',
|
|
|
|
|
use_ascii=False,
|
|
|
|
|
charsets=['ISO-8859-13', 'WINDOWS-1257',
|
|
|
|
|
'ISO-8859-4'],
|
|
|
|
|
# Q, W, and X not used at all
|
|
|
|
|
alphabet=(u'AĄBCČDEĘĖFGHIĮYJKLMNOPRSŠTUŲŪVZŽ'
|
|
|
|
|
u'aąbcčdeęėfghiįyjklmnoprsštuųūvzž'),
|
|
|
|
|
wiki_start_pages=[u'Pagrindinis_puslapis']),
|
|
|
|
|
'Latvian': Language(name='Latvian',
|
|
|
|
|
iso_code='lv',
|
|
|
|
|
use_ascii=False,
|
|
|
|
|
charsets=['ISO-8859-13', 'WINDOWS-1257',
|
|
|
|
|
'ISO-8859-4'],
|
|
|
|
|
# Q, W, X, Y are only for loanwords
|
|
|
|
|
alphabet=(u'AĀBCČDEĒFGĢHIĪJKĶLĻMNŅOPRSŠTUŪVZŽ'
|
|
|
|
|
u'aābcčdeēfgģhiījkķlļmnņoprsštuūvzž'),
|
|
|
|
|
wiki_start_pages=[u'Sākumlapa']),
|
|
|
|
|
'Macedonian': Language(name='Macedonian',
|
|
|
|
|
iso_code='mk',
|
|
|
|
|
use_ascii=False,
|
|
|
|
|
charsets=['ISO-8859-5', 'WINDOWS-1251',
|
|
|
|
|
'MacCyrillic', 'IBM855'],
|
|
|
|
|
alphabet=(u'АБВГДЃЕЖЗЅИЈКЛЉМНЊОПРСТЌУФХЦЧЏШ'
|
|
|
|
|
u'абвгдѓежзѕијклљмнњопрстќуфхцчџш'),
|
|
|
|
|
wiki_start_pages=[u'Главна_страница']),
|
|
|
|
|
'Dutch': Language(name='Dutch',
|
|
|
|
|
iso_code='nl',
|
|
|
|
|
use_ascii=True,
|
|
|
|
|
charsets=['ISO-8859-1', 'WINDOWS-1252'],
|
|
|
|
|
wiki_start_pages=[u'Hoofdpagina']),
|
|
|
|
|
'Polish': Language(name='Polish',
|
|
|
|
|
iso_code='pl',
|
|
|
|
|
# Q and X are only used for foreign words.
|
|
|
|
|
use_ascii=False,
|
|
|
|
|
charsets=['ISO-8859-2', 'WINDOWS-1250'],
|
|
|
|
|
alphabet=(u'AĄBCĆDEĘFGHIJKLŁMNŃOÓPRSŚTUWYZŹŻ'
|
|
|
|
|
u'aąbcćdeęfghijklłmnńoóprsśtuwyzźż'),
|
|
|
|
|
wiki_start_pages=[u'Wikipedia:Strona_główna']),
|
|
|
|
|
'Portuguese': Language(name='Portuguese',
|
|
|
|
|
iso_code='pt',
|
|
|
|
|
use_ascii=True,
|
|
|
|
|
charsets=['ISO-8859-1', 'ISO-8859-15',
|
|
|
|
|
'WINDOWS-1252'],
|
|
|
|
|
alphabet=u'ÁÂÃÀÇÉÊÍÓÔÕÚáâãàçéêíóôõú',
|
|
|
|
|
wiki_start_pages=[u'Wikipédia:Página_principal']),
|
|
|
|
|
'Romanian': Language(name='Romanian',
|
|
|
|
|
iso_code='ro',
|
|
|
|
|
use_ascii=True,
|
|
|
|
|
charsets=['ISO-8859-2', 'WINDOWS-1250'],
|
|
|
|
|
alphabet=u'ăâîșțĂÂÎȘȚ',
|
|
|
|
|
wiki_start_pages=[u'Pagina_principală']),
|
|
|
|
|
'Russian': Language(name='Russian',
|
|
|
|
|
iso_code='ru',
|
|
|
|
|
use_ascii=False,
|
|
|
|
|
charsets=['ISO-8859-5', 'WINDOWS-1251',
|
|
|
|
|
'KOI8-R', 'MacCyrillic', 'IBM866',
|
|
|
|
|
'IBM855'],
|
|
|
|
|
alphabet=(u'абвгдеёжзийклмнопрстуфхцчшщъыьэюя'
|
|
|
|
|
u'АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ'),
|
|
|
|
|
wiki_start_pages=[u'Заглавная_страница']),
|
|
|
|
|
'Slovak': Language(name='Slovak',
|
|
|
|
|
iso_code='sk',
|
|
|
|
|
use_ascii=True,
|
|
|
|
|
charsets=['ISO-8859-2', 'WINDOWS-1250'],
|
|
|
|
|
alphabet=u'áäčďéíĺľňóôŕšťúýžÁÄČĎÉÍĹĽŇÓÔŔŠŤÚÝŽ',
|
|
|
|
|
wiki_start_pages=[u'Hlavná_stránka']),
|
|
|
|
|
'Slovene': Language(name='Slovene',
|
|
|
|
|
iso_code='sl',
|
|
|
|
|
# Q, W, X, Y are only used for foreign words.
|
|
|
|
|
use_ascii=False,
|
|
|
|
|
charsets=['ISO-8859-2', 'WINDOWS-1250'],
|
|
|
|
|
alphabet=(u'abcčdefghijklmnoprsštuvzž'
|
|
|
|
|
u'ABCČDEFGHIJKLMNOPRSŠTUVZŽ'),
|
|
|
|
|
wiki_start_pages=[u'Glavna_stran']),
|
|
|
|
|
# Serbian can be written in both Latin and Cyrillic, but there's no
|
|
|
|
|
# simple way to get the Latin alphabet pages from Wikipedia through
|
|
|
|
|
# the API, so for now we just support Cyrillic.
|
|
|
|
|
'Serbian': Language(name='Serbian',
|
|
|
|
|
iso_code='sr',
|
|
|
|
|
alphabet=(u'АБВГДЂЕЖЗИЈКЛЉМНЊОПРСТЋУФХЦЧЏШ'
|
|
|
|
|
u'абвгдђежзијклљмнњопрстћуфхцчџш'),
|
|
|
|
|
charsets=['ISO-8859-5', 'WINDOWS-1251',
|
|
|
|
|
'MacCyrillic', 'IBM855'],
|
|
|
|
|
wiki_start_pages=[u'Главна_страна']),
|
|
|
|
|
'Thai': Language(name='Thai',
|
|
|
|
|
iso_code='th',
|
|
|
|
|
use_ascii=False,
|
|
|
|
|
charsets=['ISO-8859-11', 'TIS-620', 'CP874'],
|
|
|
|
|
alphabet=u'กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศษสหฬอฮฯะัาำิีึืฺุู฿เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙๚๛',
|
|
|
|
|
wiki_start_pages=[u'หน้าหลัก']),
|
|
|
|
|
'Turkish': Language(name='Turkish',
|
|
|
|
|
iso_code='tr',
|
|
|
|
|
# Q, W, and X are not used by Turkish
|
|
|
|
|
use_ascii=False,
|
|
|
|
|
charsets=['ISO-8859-3', 'ISO-8859-9',
|
|
|
|
|
'WINDOWS-1254'],
|
|
|
|
|
alphabet=(u'abcçdefgğhıijklmnoöprsştuüvyzâîû'
|
|
|
|
|
u'ABCÇDEFGĞHIİJKLMNOÖPRSŞTUÜVYZÂÎÛ'),
|
|
|
|
|
wiki_start_pages=[u'Ana_Sayfa']),
|
|
|
|
|
'Vietnamese': Language(name='Vietnamese',
|
|
|
|
|
iso_code='vi',
|
|
|
|
|
use_ascii=False,
|
|
|
|
|
# Windows-1258 is the only common 8-bit
|
|
|
|
|
# Vietnamese encoding supported by Python.
|
|
|
|
|
# From Wikipedia:
|
|
|
|
|
# For systems that lack support for Unicode,
|
|
|
|
|
# dozens of 8-bit Vietnamese code pages are
|
|
|
|
|
# available.[1] The most common are VISCII
|
|
|
|
|
# (TCVN 5712:1993), VPS, and Windows-1258.[3]
|
|
|
|
|
# Where ASCII is required, such as when
|
|
|
|
|
# ensuring readability in plain text e-mail,
|
|
|
|
|
# Vietnamese letters are often encoded
|
|
|
|
|
# according to Vietnamese Quoted-Readable
|
|
|
|
|
# (VIQR) or VSCII Mnemonic (VSCII-MNEM),[4]
|
|
|
|
|
# though usage of either variable-width
|
|
|
|
|
# scheme has declined dramatically following
|
|
|
|
|
# the adoption of Unicode on the World Wide
|
|
|
|
|
# Web.
|
|
|
|
|
charsets=['WINDOWS-1258'],
|
|
|
|
|
alphabet=(u'aăâbcdđeêghiklmnoôơpqrstuưvxy'
|
|
|
|
|
u'AĂÂBCDĐEÊGHIKLMNOÔƠPQRSTUƯVXY'),
|
|
|
|
|
wiki_start_pages=[u'Chữ_Quốc_ngữ']),
|
|
|
|
|
}
|