# -*- coding: utf-8 -*-
"""Beautiful Soup bonus library: Unicode, Dammit

This library converts a bytestream to Unicode through any means
necessary. It is heavily based on code from Mark Pilgrim's Universal
Feed Parser. It works best on XML and HTML, but it does not rewrite the
XML or HTML to reflect a new encoding; that's the tree builder's job.
"""
|
|
|
# Use of this source code is governed by the MIT license.
|
|
|
__license__ = "MIT"
|
|
|
|
|
|
from html.entities import codepoint2name
|
|
|
from collections import defaultdict
|
|
|
import codecs
|
|
|
import re
|
|
|
import logging
|
|
|
import string
|
|
|
|
|
|
# Import a library to autodetect character encodings.
chardet_type = None
try:
    # First try the fast C implementation.
    # PyPI package: cchardet
    import cchardet
    _chardet_module = cchardet
except ImportError:
    try:
        # Fall back to the pure Python implementation
        # Debian package: python-chardet
        # PyPI package: chardet
        import chardet
        _chardet_module = chardet
        #import chardet.constants
        #chardet.constants._debug = 1
    except ImportError:
        # No chardet available.
        _chardet_module = None


def chardet_dammit(s):
    """Guess the encoding of `s` with whichever detector was imported.

    The detector is chosen once, at import time: cchardet if it can be
    imported, otherwise chardet, otherwise none at all.

    :param s: The data whose encoding should be guessed.
    :return: The value stored under the 'encoding' key of the detector's
        ``detect(s)`` result, or None when `s` is already a `str` or when
        neither detector library could be imported.
    """
    # A str is already decoded, so there is nothing to detect; without a
    # detector library there is nothing to detect with.
    if _chardet_module is None or isinstance(s, str):
        return None
    return _chardet_module.detect(s)['encoding']
|
|
|
|
|
|
# Optional import; the package is noted as available from
# http://cjkpython.i18n.org/.
#
# TODO: This doesn't work anymore and the closest thing, iconv_codecs,
# is GPL-licensed. Check whether this is still necessary.
try:
    import iconv_codec
except ImportError:
    # The package is optional, so a failed import is deliberately ignored.
    pass
|
|
|
|
|
|
# Build bytestring and Unicode versions of regular expressions for finding
# a declared encoding inside an XML or HTML document.
#
# The patterns are raw, triple-quoted literals so that the regex
# backslashes and both kinds of quote character need no extra escaping.
xml_encoding = r'''^\s*<\?.*encoding=['"](.*?)['"].*\?>'''
html_meta = r'''<\s*meta[^>]+charset\s*=\s*["']?([^>]*?)[ /;'">]'''

# encoding_res[<bytes or str>][<'html' or 'xml'>] is a compiled,
# case-insensitive pattern; group 1 captures the declared encoding name.
# The bytes variants are compiled from the ASCII-encoded pattern text so
# they can be applied to undecoded input.
encoding_res = {
    bytes: {
        'html': re.compile(html_meta.encode("ascii"), re.I),
        'xml': re.compile(xml_encoding.encode("ascii"), re.I),
    },
    str: {
        'html': re.compile(html_meta, re.I),
        'xml': re.compile(xml_encoding, re.I),
    },
}
|
|
|
|
|
|
try:
|
|
|
from html.entities import html5
|
|
|
except ImportError:
|
|
|
# This is a copy of html.entities.html5 from Python 3.9. There's
|
|
|
# no equivalent table in Python 2, so we'll just provide a copy
|
|
|
# here.
|
|
|
html5 = {
|
|
|
'Aacute': '\xc1',
|
|
|
'aacute': '\xe1',
|
|
|
'Aacute;': '\xc1',
|
|
|
'aacute;': '\xe1',
|
|
|
'Abreve;': '\u0102',
|
|
|
'abreve;': '\u0103',
|
|
|
'ac;': '\u223e',
|
|
|
'acd;': '\u223f',
|
|
|
'acE;': '\u223e\u0333',
|
|
|
'Acirc': '\xc2',
|
|
|
'acirc': '\xe2',
|
|
|
'Acirc;': '\xc2',
|
|
|
'acirc;': '\xe2',
|
|
|
'acute': '\xb4',
|
|
|
'acute;': '\xb4',
|
|
|
'Acy;': '\u0410',
|
|
|
'acy;': '\u0430',
|
|
|
'AElig': '\xc6',
|
|
|
'aelig': '\xe6',
|
|
|
'AElig;': '\xc6',
|
|
|
'aelig;': '\xe6',
|
|
|
'af;': '\u2061',
|
|
|
'Afr;': '\U0001d504',
|
|
|
'afr;': '\U0001d51e',
|
|
|
'Agrave': '\xc0',
|
|
|
'agrave': '\xe0',
|
|
|
'Agrave;': '\xc0',
|
|
|
'agrave;': '\xe0',
|
|
|
'alefsym;': '\u2135',
|
|
|
'aleph;': '\u2135',
|
|
|
'Alpha;': '\u0391',
|
|
|
'alpha;': '\u03b1',
|
|
|
'Amacr;': '\u0100',
|
|
|
'amacr;': '\u0101',
|
|
|
'amalg;': '\u2a3f',
|
|
|
'AMP': '&',
|
|
|
'amp': '&',
|
|
|
'AMP;': '&',
|
|
|
'amp;': '&',
|
|
|
'And;': '\u2a53',
|
|
|
'and;': '\u2227',
|
|
|
'andand;': '\u2a55',
|
|
|
'andd;': '\u2a5c',
|
|
|
'andslope;': '\u2a58',
|
|
|
'andv;': '\u2a5a',
|
|
|
'ang;': '\u2220',
|
|
|
'ange;': '\u29a4',
|
|
|
'angle;': '\u2220',
|
|
|
'angmsd;': '\u2221',
|
|
|
'angmsdaa;': '\u29a8',
|
|
|
'angmsdab;': '\u29a9',
|
|
|
'angmsdac;': '\u29aa',
|
|
|
'angmsdad;': '\u29ab',
|
|
|
'angmsdae;': '\u29ac',
|
|
|
'angmsdaf;': '\u29ad',
|
|
|
'angmsdag;': '\u29ae',
|
|
|
'angmsdah;': '\u29af',
|
|
|
'angrt;': '\u221f',
|
|
|
'angrtvb;': '\u22be',
|
|
|
'angrtvbd;': '\u299d',
|
|
|
'angsph;': '\u2222',
|
|
|
'angst;': '\xc5',
|
|
|
'angzarr;': '\u237c',
|
|
|
'Aogon;': '\u0104',
|
|
|
'aogon;': '\u0105',
|
|
|
'Aopf;': '\U0001d538',
|
|
|
'aopf;': '\U0001d552',
|
|
|
'ap;': '\u2248',
|
|
|
'apacir;': '\u2a6f',
|
|
|
'apE;': '\u2a70',
|
|
|
'ape;': '\u224a',
|
|
|
'apid;': '\u224b',
|
|
|
'apos;': "'",
|
|
|
'ApplyFunction;': '\u2061',
|
|
|
'approx;': '\u2248',
|
|
|
'approxeq;': '\u224a',
|
|
|
'Aring': '\xc5',
|
|
|
'aring': '\xe5',
|
|
|
'Aring;': '\xc5',
|
|
|
'aring;': '\xe5',
|
|
|
'Ascr;': '\U0001d49c',
|
|
|
'ascr;': '\U0001d4b6',
|
|
|
'Assign;': '\u2254',
|
|
|
'ast;': '*',
|
|
|
'asymp;': '\u2248',
|
|
|
'asympeq;': '\u224d',
|
|
|
'Atilde': '\xc3',
|
|
|
'atilde': '\xe3',
|
|
|
'Atilde;': '\xc3',
|
|
|
'atilde;': '\xe3',
|
|
|
'Auml': '\xc4',
|
|
|
'auml': '\xe4',
|
|
|
'Auml;': '\xc4',
|
|
|
'auml;': '\xe4',
|
|
|
'awconint;': '\u2233',
|
|
|
'awint;': '\u2a11',
|
|
|
'backcong;': '\u224c',
|
|
|
'backepsilon;': '\u03f6',
|
|
|
'backprime;': '\u2035',
|
|
|
'backsim;': '\u223d',
|
|
|
'backsimeq;': '\u22cd',
|
|
|
'Backslash;': '\u2216',
|
|
|
'Barv;': '\u2ae7',
|
|
|
'barvee;': '\u22bd',
|
|
|
'Barwed;': '\u2306',
|
|
|
'barwed;': '\u2305',
|
|
|
'barwedge;': '\u2305',
|
|
|
'bbrk;': '\u23b5',
|
|
|
'bbrktbrk;': '\u23b6',
|
|
|
'bcong;': '\u224c',
|
|
|
'Bcy;': '\u0411',
|
|
|
'bcy;': '\u0431',
|
|
|
'bdquo;': '\u201e',
|
|
|
'becaus;': '\u2235',
|
|
|
'Because;': '\u2235',
|
|
|
'because;': '\u2235',
|
|
|
'bemptyv;': '\u29b0',
|
|
|
'bepsi;': '\u03f6',
|
|
|
'bernou;': '\u212c',
|
|
|
'Bernoullis;': '\u212c',
|
|
|
'Beta;': '\u0392',
|
|
|
'beta;': '\u03b2',
|
|
|
'beth;': '\u2136',
|
|
|
'between;': '\u226c',
|
|
|
'Bfr;': '\U0001d505',
|
|
|
'bfr;': '\U0001d51f',
|
|
|
'bigcap;': '\u22c2',
|
|
|
'bigcirc;': '\u25ef',
|
|
|
'bigcup;': '\u22c3',
|
|
|
'bigodot;': '\u2a00',
|
|
|
'bigoplus;': '\u2a01',
|
|
|
'bigotimes;': '\u2a02',
|
|
|
'bigsqcup;': '\u2a06',
|
|
|
'bigstar;': '\u2605',
|
|
|
'bigtriangledown;': '\u25bd',
|
|
|
'bigtriangleup;': '\u25b3',
|
|
|
'biguplus;': '\u2a04',
|
|
|
'bigvee;': '\u22c1',
|
|
|
'bigwedge;': '\u22c0',
|
|
|
'bkarow;': '\u290d',
|
|
|
'blacklozenge;': '\u29eb',
|
|
|
'blacksquare;': '\u25aa',
|
|
|
'blacktriangle;': '\u25b4',
|
|
|
'blacktriangledown;': '\u25be',
|
|
|
'blacktriangleleft;': '\u25c2',
|
|
|
'blacktriangleright;': '\u25b8',
|
|
|
'blank;': '\u2423',
|
|
|
'blk12;': '\u2592',
|
|
|
'blk14;': '\u2591',
|
|
|
'blk34;': '\u2593',
|
|
|
'block;': '\u2588',
|
|
|
'bne;': '=\u20e5',
|
|
|
'bnequiv;': '\u2261\u20e5',
|
|
|
'bNot;': '\u2aed',
|
|
|
'bnot;': '\u2310',
|
|
|
'Bopf;': '\U0001d539',
|
|
|
'bopf;': '\U0001d553',
|
|
|
'bot;': '\u22a5',
|
|
|
'bottom;': '\u22a5',
|
|
|
'bowtie;': '\u22c8',
|
|
|
'boxbox;': '\u29c9',
|
|
|
'boxDL;': '\u2557',
|
|
|
'boxDl;': '\u2556',
|
|
|
'boxdL;': '\u2555',
|
|
|
'boxdl;': '\u2510',
|
|
|
'boxDR;': '\u2554',
|
|
|
'boxDr;': '\u2553',
|
|
|
'boxdR;': '\u2552',
|
|
|
'boxdr;': '\u250c',
|
|
|
'boxH;': '\u2550',
|
|
|
'boxh;': '\u2500',
|
|
|
'boxHD;': '\u2566',
|
|
|
'boxHd;': '\u2564',
|
|
|
'boxhD;': '\u2565',
|
|
|
'boxhd;': '\u252c',
|
|
|
'boxHU;': '\u2569',
|
|
|
'boxHu;': '\u2567',
|
|
|
'boxhU;': '\u2568',
|
|
|
'boxhu;': '\u2534',
|
|
|
'boxminus;': '\u229f',
|
|
|
'boxplus;': '\u229e',
|
|
|
'boxtimes;': '\u22a0',
|
|
|
'boxUL;': '\u255d',
|
|
|
'boxUl;': '\u255c',
|
|
|
'boxuL;': '\u255b',
|
|
|
'boxul;': '\u2518',
|
|
|
'boxUR;': '\u255a',
|
|
|
'boxUr;': '\u2559',
|
|
|
'boxuR;': '\u2558',
|
|
|
'boxur;': '\u2514',
|
|
|
'boxV;': '\u2551',
|
|
|
'boxv;': '\u2502',
|
|
|
'boxVH;': '\u256c',
|
|
|
'boxVh;': '\u256b',
|
|
|
'boxvH;': '\u256a',
|
|
|
'boxvh;': '\u253c',
|
|
|
'boxVL;': '\u2563',
|
|
|
'boxVl;': '\u2562',
|
|
|
'boxvL;': '\u2561',
|
|
|
'boxvl;': '\u2524',
|
|
|
'boxVR;': '\u2560',
|
|
|
'boxVr;': '\u255f',
|
|
|
'boxvR;': '\u255e',
|
|
|
'boxvr;': '\u251c',
|
|
|
'bprime;': '\u2035',
|
|
|
'Breve;': '\u02d8',
|
|
|
'breve;': '\u02d8',
|
|
|
'brvbar': '\xa6',
|
|
|
'brvbar;': '\xa6',
|
|
|
'Bscr;': '\u212c',
|
|
|
'bscr;': '\U0001d4b7',
|
|
|
'bsemi;': '\u204f',
|
|
|
'bsim;': '\u223d',
|
|
|
'bsime;': '\u22cd',
|
|
|
'bsol;': '\\',
|
|
|
'bsolb;': '\u29c5',
|
|
|
'bsolhsub;': '\u27c8',
|
|
|
'bull;': '\u2022',
|
|
|
'bullet;': '\u2022',
|
|
|
'bump;': '\u224e',
|
|
|
'bumpE;': '\u2aae',
|
|
|
'bumpe;': '\u224f',
|
|
|
'Bumpeq;': '\u224e',
|
|
|
'bumpeq;': '\u224f',
|
|
|
'Cacute;': '\u0106',
|
|
|
'cacute;': '\u0107',
|
|
|
'Cap;': '\u22d2',
|
|
|
'cap;': '\u2229',
|
|
|
'capand;': '\u2a44',
|
|
|
'capbrcup;': '\u2a49',
|
|
|
'capcap;': '\u2a4b',
|
|
|
'capcup;': '\u2a47',
|
|
|
'capdot;': '\u2a40',
|
|
|
'CapitalDifferentialD;': '\u2145',
|
|
|
'caps;': '\u2229\ufe00',
|
|
|
'caret;': '\u2041',
|
|
|
'caron;': '\u02c7',
|
|
|
'Cayleys;': '\u212d',
|
|
|
'ccaps;': '\u2a4d',
|
|
|
'Ccaron;': '\u010c',
|
|
|
'ccaron;': '\u010d',
|
|
|
'Ccedil': '\xc7',
|
|
|
'ccedil': '\xe7',
|
|
|
'Ccedil;': '\xc7',
|
|
|
'ccedil;': '\xe7',
|
|
|
'Ccirc;': '\u0108',
|
|
|
'ccirc;': '\u0109',
|
|
|
'Cconint;': '\u2230',
|
|
|
'ccups;': '\u2a4c',
|
|
|
'ccupssm;': '\u2a50',
|
|
|
'Cdot;': '\u010a',
|
|
|
'cdot;': '\u010b',
|
|
|
'cedil': '\xb8',
|
|
|
'cedil;': '\xb8',
|
|
|
'Cedilla;': '\xb8',
|
|
|
'cemptyv;': '\u29b2',
|
|
|
'cent': '\xa2',
|
|
|
'cent;': '\xa2',
|
|
|
'CenterDot;': '\xb7',
|
|
|
'centerdot;': '\xb7',
|
|
|
'Cfr;': '\u212d',
|
|
|
'cfr;': '\U0001d520',
|
|
|
'CHcy;': '\u0427',
|
|
|
'chcy;': '\u0447',
|
|
|
'check;': '\u2713',
|
|
|
'checkmark;': '\u2713',
|
|
|
'Chi;': '\u03a7',
|
|
|
'chi;': '\u03c7',
|
|
|
'cir;': '\u25cb',
|
|
|
'circ;': '\u02c6',
|
|
|
'circeq;': '\u2257',
|
|
|
'circlearrowleft;': '\u21ba',
|
|
|
'circlearrowright;': '\u21bb',
|
|
|
'circledast;': '\u229b',
|
|
|
'circledcirc;': '\u229a',
|
|
|
'circleddash;': '\u229d',
|
|
|
'CircleDot;': '\u2299',
|
|
|
'circledR;': '\xae',
|
|
|
'circledS;': '\u24c8',
|
|
|
'CircleMinus;': '\u2296',
|
|
|
'CirclePlus;': '\u2295',
|
|
|
'CircleTimes;': '\u2297',
|
|
|
'cirE;': '\u29c3',
|
|
|
'cire;': '\u2257',
|
|
|
'cirfnint;': '\u2a10',
|
|
|
'cirmid;': '\u2aef',
|
|
|
'cirscir;': '\u29c2',
|
|
|
'ClockwiseContourIntegral;': '\u2232',
|
|
|
'CloseCurlyDoubleQuote;': '\u201d',
|
|
|
'CloseCurlyQuote;': '\u2019',
|
|
|
'clubs;': '\u2663',
|
|
|
'clubsuit;': '\u2663',
|
|
|
'Colon;': '\u2237',
|
|
|
'colon;': ':',
|
|
|
'Colone;': '\u2a74',
|
|
|
'colone;': '\u2254',
|
|
|
'coloneq;': '\u2254',
|
|
|
'comma;': ',',
|
|
|
'commat;': '@',
|
|
|
'comp;': '\u2201',
|
|
|
'compfn;': '\u2218',
|
|
|
'complement;': '\u2201',
|
|
|
'complexes;': '\u2102',
|
|
|
'cong;': '\u2245',
|
|
|
'congdot;': '\u2a6d',
|
|
|
'Congruent;': '\u2261',
|
|
|
'Conint;': '\u222f',
|
|
|
'conint;': '\u222e',
|
|
|
'ContourIntegral;': '\u222e',
|
|
|
'Copf;': '\u2102',
|
|
|
'copf;': '\U0001d554',
|
|
|
'coprod;': '\u2210',
|
|
|
'Coproduct;': '\u2210',
|
|
|
'COPY': '\xa9',
|
|
|
'copy': '\xa9',
|
|
|
'COPY;': '\xa9',
|
|
|
'copy;': '\xa9',
|
|
|
'copysr;': '\u2117',
|
|
|
'CounterClockwiseContourIntegral;': '\u2233',
|
|
|
'crarr;': '\u21b5',
|
|
|
'Cross;': '\u2a2f',
|
|
|
'cross;': '\u2717',
|
|
|
'Cscr;': '\U0001d49e',
|
|
|
'cscr;': '\U0001d4b8',
|
|
|
'csub;': '\u2acf',
|
|
|
'csube;': '\u2ad1',
|
|
|
'csup;': '\u2ad0',
|
|
|
'csupe;': '\u2ad2',
|
|
|
'ctdot;': '\u22ef',
|
|
|
'cudarrl;': '\u2938',
|
|
|
'cudarrr;': '\u2935',
|
|
|
'cuepr;': '\u22de',
|
|
|
'cuesc;': '\u22df',
|
|
|
'cularr;': '\u21b6',
|
|
|
'cularrp;': '\u293d',
|
|
|
'Cup;': '\u22d3',
|
|
|
'cup;': '\u222a',
|
|
|
'cupbrcap;': '\u2a48',
|
|
|
'CupCap;': '\u224d',
|
|
|
'cupcap;': '\u2a46',
|
|
|
'cupcup;': '\u2a4a',
|
|
|
'cupdot;': '\u228d',
|
|
|
'cupor;': '\u2a45',
|
|
|
'cups;': '\u222a\ufe00',
|
|
|
'curarr;': '\u21b7',
|
|
|
'curarrm;': '\u293c',
|
|
|
'curlyeqprec;': '\u22de',
|
|
|
'curlyeqsucc;': '\u22df',
|
|
|
'curlyvee;': '\u22ce',
|
|
|
'curlywedge;': '\u22cf',
|
|
|
'curren': '\xa4',
|
|
|
'curren;': '\xa4',
|
|
|
'curvearrowleft;': '\u21b6',
|
|
|
'curvearrowright;': '\u21b7',
|
|
|
'cuvee;': '\u22ce',
|
|
|
'cuwed;': '\u22cf',
|
|
|
'cwconint;': '\u2232',
|
|
|
'cwint;': '\u2231',
|
|
|
'cylcty;': '\u232d',
|
|
|
'Dagger;': '\u2021',
|
|
|
'dagger;': '\u2020',
|
|
|
'daleth;': '\u2138',
|
|
|
'Darr;': '\u21a1',
|
|
|
'dArr;': '\u21d3',
|
|
|
'darr;': '\u2193',
|
|
|
'dash;': '\u2010',
|
|
|
'Dashv;': '\u2ae4',
|
|
|
'dashv;': '\u22a3',
|
|
|
'dbkarow;': '\u290f',
|
|
|
'dblac;': '\u02dd',
|
|
|
'Dcaron;': '\u010e',
|
|
|
'dcaron;': '\u010f',
|
|
|
'Dcy;': '\u0414',
|
|
|
'dcy;': '\u0434',
|
|
|
'DD;': '\u2145',
|
|
|
'dd;': '\u2146',
|
|
|
'ddagger;': '\u2021',
|
|
|
'ddarr;': '\u21ca',
|
|
|
'DDotrahd;': '\u2911',
|
|
|
'ddotseq;': '\u2a77',
|
|
|
'deg': '\xb0',
|
|
|
'deg;': '\xb0',
|
|
|
'Del;': '\u2207',
|
|
|
'Delta;': '\u0394',
|
|
|
'delta;': '\u03b4',
|
|
|
'demptyv;': '\u29b1',
|
|
|
'dfisht;': '\u297f',
|
|
|
'Dfr;': '\U0001d507',
|
|
|
'dfr;': '\U0001d521',
|
|
|
'dHar;': '\u2965',
|
|
|
'dharl;': '\u21c3',
|
|
|
'dharr;': '\u21c2',
|
|
|
'DiacriticalAcute;': '\xb4',
|
|
|
'DiacriticalDot;': '\u02d9',
|
|
|
'DiacriticalDoubleAcute;': '\u02dd',
|
|
|
'DiacriticalGrave;': '`',
|
|
|
'DiacriticalTilde;': '\u02dc',
|
|
|
'diam;': '\u22c4',
|
|
|
'Diamond;': '\u22c4',
|
|
|
'diamond;': '\u22c4',
|
|
|
'diamondsuit;': '\u2666',
|
|
|
'diams;': '\u2666',
|
|
|
'die;': '\xa8',
|
|
|
'DifferentialD;': '\u2146',
|
|
|
'digamma;': '\u03dd',
|
|
|
'disin;': '\u22f2',
|
|
|
'div;': '\xf7',
|
|
|
'divide': '\xf7',
|
|
|
'divide;': '\xf7',
|
|
|
'divideontimes;': '\u22c7',
|
|
|
'divonx;': '\u22c7',
|
|
|
'DJcy;': '\u0402',
|
|
|
'djcy;': '\u0452',
|
|
|
'dlcorn;': '\u231e',
|
|
|
'dlcrop;': '\u230d',
|
|
|
'dollar;': '$',
|
|
|
'Dopf;': '\U0001d53b',
|
|
|
'dopf;': '\U0001d555',
|
|
|
'Dot;': '\xa8',
|
|
|
'dot;': '\u02d9',
|
|
|
'DotDot;': '\u20dc',
|
|
|
'doteq;': '\u2250',
|
|
|
'doteqdot;': '\u2251',
|
|
|
'DotEqual;': '\u2250',
|
|
|
'dotminus;': '\u2238',
|
|
|
'dotplus;': '\u2214',
|
|
|
'dotsquare;': '\u22a1',
|
|
|
'doublebarwedge;': '\u2306',
|
|
|
'DoubleContourIntegral;': '\u222f',
|
|
|
'DoubleDot;': '\xa8',
|
|
|
'DoubleDownArrow;': '\u21d3',
|
|
|
'DoubleLeftArrow;': '\u21d0',
|
|
|
'DoubleLeftRightArrow;': '\u21d4',
|
|
|
'DoubleLeftTee;': '\u2ae4',
|
|
|
'DoubleLongLeftArrow;': '\u27f8',
|
|
|
'DoubleLongLeftRightArrow;': '\u27fa',
|
|
|
'DoubleLongRightArrow;': '\u27f9',
|
|
|
'DoubleRightArrow;': '\u21d2',
|
|
|
'DoubleRightTee;': '\u22a8',
|
|
|
'DoubleUpArrow;': '\u21d1',
|
|
|
'DoubleUpDownArrow;': '\u21d5',
|
|
|
'DoubleVerticalBar;': '\u2225',
|
|
|
'DownArrow;': '\u2193',
|
|
|
'Downarrow;': '\u21d3',
|
|
|
'downarrow;': '\u2193',
|
|
|
'DownArrowBar;': '\u2913',
|
|
|
'DownArrowUpArrow;': '\u21f5',
|
|
|
'DownBreve;': '\u0311',
|
|
|
'downdownarrows;': '\u21ca',
|
|
|
'downharpoonleft;': '\u21c3',
|
|
|
'downharpoonright;': '\u21c2',
|
|
|
'DownLeftRightVector;': '\u2950',
|
|
|
'DownLeftTeeVector;': '\u295e',
|
|
|
'DownLeftVector;': '\u21bd',
|
|
|
'DownLeftVectorBar;': '\u2956',
|
|
|
'DownRightTeeVector;': '\u295f',
|
|
|
'DownRightVector;': '\u21c1',
|
|
|
'DownRightVectorBar;': '\u2957',
|
|
|
'DownTee;': '\u22a4',
|
|
|
'DownTeeArrow;': '\u21a7',
|
|
|
'drbkarow;': '\u2910',
|
|
|
'drcorn;': '\u231f',
|
|
|
'drcrop;': '\u230c',
|
|
|
'Dscr;': '\U0001d49f',
|
|
|
'dscr;': '\U0001d4b9',
|
|
|
'DScy;': '\u0405',
|
|
|
'dscy;': '\u0455',
|
|
|
'dsol;': '\u29f6',
|
|
|
'Dstrok;': '\u0110',
|
|
|
'dstrok;': '\u0111',
|
|
|
'dtdot;': '\u22f1',
|
|
|
'dtri;': '\u25bf',
|
|
|
'dtrif;': '\u25be',
|
|
|
'duarr;': '\u21f5',
|
|
|
'duhar;': '\u296f',
|
|
|
'dwangle;': '\u29a6',
|
|
|
'DZcy;': '\u040f',
|
|
|
'dzcy;': '\u045f',
|
|
|
'dzigrarr;': '\u27ff',
|
|
|
'Eacute': '\xc9',
|
|
|
'eacute': '\xe9',
|
|
|
'Eacute;': '\xc9',
|
|
|
'eacute;': '\xe9',
|
|
|
'easter;': '\u2a6e',
|
|
|
'Ecaron;': '\u011a',
|
|
|
'ecaron;': '\u011b',
|
|
|
'ecir;': '\u2256',
|
|
|
'Ecirc': '\xca',
|
|
|
'ecirc': '\xea',
|
|
|
'Ecirc;': '\xca',
|
|
|
'ecirc;': '\xea',
|
|
|
'ecolon;': '\u2255',
|
|
|
'Ecy;': '\u042d',
|
|
|
'ecy;': '\u044d',
|
|
|
'eDDot;': '\u2a77',
|
|
|
'Edot;': '\u0116',
|
|
|
'eDot;': '\u2251',
|
|
|
'edot;': '\u0117',
|
|
|
'ee;': '\u2147',
|
|
|
'efDot;': '\u2252',
|
|
|
'Efr;': '\U0001d508',
|
|
|
'efr;': '\U0001d522',
|
|
|
'eg;': '\u2a9a',
|
|
|
'Egrave': '\xc8',
|
|
|
'egrave': '\xe8',
|
|
|
'Egrave;': '\xc8',
|
|
|
'egrave;': '\xe8',
|
|
|
'egs;': '\u2a96',
|
|
|
'egsdot;': '\u2a98',
|
|
|
'el;': '\u2a99',
|
|
|
'Element;': '\u2208',
|
|
|
'elinters;': '\u23e7',
|
|
|
'ell;': '\u2113',
|
|
|
'els;': '\u2a95',
|
|
|
'elsdot;': '\u2a97',
|
|
|
'Emacr;': '\u0112',
|
|
|
'emacr;': '\u0113',
|
|
|
'empty;': '\u2205',
|
|
|
'emptyset;': '\u2205',
|
|
|
'EmptySmallSquare;': '\u25fb',
|
|
|
'emptyv;': '\u2205',
|
|
|
'EmptyVerySmallSquare;': '\u25ab',
|
|
|
'emsp13;': '\u2004',
|
|
|
'emsp14;': '\u2005',
|
|
|
'emsp;': '\u2003',
|
|
|
'ENG;': '\u014a',
|
|
|
'eng;': '\u014b',
|
|
|
'ensp;': '\u2002',
|
|
|
'Eogon;': '\u0118',
|
|
|
'eogon;': '\u0119',
|
|
|
'Eopf;': '\U0001d53c',
|
|
|
'eopf;': '\U0001d556',
|
|
|
'epar;': '\u22d5',
|
|
|
'eparsl;': '\u29e3',
|
|
|
'eplus;': '\u2a71',
|
|
|
'epsi;': '\u03b5',
|
|
|
'Epsilon;': '\u0395',
|
|
|
'epsilon;': '\u03b5',
|
|
|
'epsiv;': '\u03f5',
|
|
|
'eqcirc;': '\u2256',
|
|
|
'eqcolon;': '\u2255',
|
|
|
'eqsim;': '\u2242',
|
|
|
'eqslantgtr;': '\u2a96',
|
|
|
'eqslantless;': '\u2a95',
|
|
|
'Equal;': '\u2a75',
|
|
|
'equals;': '=',
|
|
|
'EqualTilde;': '\u2242',
|
|
|
'equest;': '\u225f',
|
|
|
'Equilibrium;': '\u21cc',
|
|
|
'equiv;': '\u2261',
|
|
|
'equivDD;': '\u2a78',
|
|
|
'eqvparsl;': '\u29e5',
|
|
|
'erarr;': '\u2971',
|
|
|
'erDot;': '\u2253',
|
|
|
'Escr;': '\u2130',
|
|
|
'escr;': '\u212f',
|
|
|
'esdot;': '\u2250',
|
|
|
'Esim;': '\u2a73',
|
|
|
'esim;': '\u2242',
|
|
|
'Eta;': '\u0397',
|
|
|
'eta;': '\u03b7',
|
|
|
'ETH': '\xd0',
|
|
|
'eth': '\xf0',
|
|
|
'ETH;': '\xd0',
|
|
|
'eth;': '\xf0',
|
|
|
'Euml': '\xcb',
|
|
|
'euml': '\xeb',
|
|
|
'Euml;': '\xcb',
|
|
|
'euml;': '\xeb',
|
|
|
'euro;': '\u20ac',
|
|
|
'excl;': '!',
|
|
|
'exist;': '\u2203',
|
|
|
'Exists;': '\u2203',
|
|
|
'expectation;': '\u2130',
|
|
|
'ExponentialE;': '\u2147',
|
|
|
'exponentiale;': '\u2147',
|
|
|
'fallingdotseq;': '\u2252',
|
|
|
'Fcy;': '\u0424',
|
|
|
'fcy;': '\u0444',
|
|
|
'female;': '\u2640',
|
|
|
'ffilig;': '\ufb03',
|
|
|
'fflig;': '\ufb00',
|
|
|
'ffllig;': '\ufb04',
|
|
|
'Ffr;': '\U0001d509',
|
|
|
'ffr;': '\U0001d523',
|
|
|
'filig;': '\ufb01',
|
|
|
'FilledSmallSquare;': '\u25fc',
|
|
|
'FilledVerySmallSquare;': '\u25aa',
|
|
|
'fjlig;': 'fj',
|
|
|
'flat;': '\u266d',
|
|
|
'fllig;': '\ufb02',
|
|
|
'fltns;': '\u25b1',
|
|
|
'fnof;': '\u0192',
|
|
|
'Fopf;': '\U0001d53d',
|
|
|
'fopf;': '\U0001d557',
|
|
|
'ForAll;': '\u2200',
|
|
|
'forall;': '\u2200',
|
|
|
'fork;': '\u22d4',
|
|
|
'forkv;': '\u2ad9',
|
|
|
'Fouriertrf;': '\u2131',
|
|
|
'fpartint;': '\u2a0d',
|
|
|
'frac12': '\xbd',
|
|
|
'frac12;': '\xbd',
|
|
|
'frac13;': '\u2153',
|
|
|
'frac14': '\xbc',
|
|
|
'frac14;': '\xbc',
|
|
|
'frac15;': '\u2155',
|
|
|
'frac16;': '\u2159',
|
|
|
'frac18;': '\u215b',
|
|
|
'frac23;': '\u2154',
|
|
|
'frac25;': '\u2156',
|
|
|
'frac34': '\xbe',
|
|
|
'frac34;': '\xbe',
|
|
|
'frac35;': '\u2157',
|
|
|
'frac38;': '\u215c',
|
|
|
'frac45;': '\u2158',
|
|
|
'frac56;': '\u215a',
|
|
|
'frac58;': '\u215d',
|
|
|
'frac78;': '\u215e',
|
|
|
'frasl;': '\u2044',
|
|
|
'frown;': '\u2322',
|
|
|
'Fscr;': '\u2131',
|
|
|
'fscr;': '\U0001d4bb',
|
|
|
'gacute;': '\u01f5',
|
|
|
'Gamma;': '\u0393',
|
|
|
'gamma;': '\u03b3',
|
|
|
'Gammad;': '\u03dc',
|
|
|
'gammad;': '\u03dd',
|
|
|
'gap;': '\u2a86',
|
|
|
'Gbreve;': '\u011e',
|
|
|
'gbreve;': '\u011f',
|
|
|
'Gcedil;': '\u0122',
|
|
|
'Gcirc;': '\u011c',
|
|
|
'gcirc;': '\u011d',
|
|
|
'Gcy;': '\u0413',
|
|
|
'gcy;': '\u0433',
|
|
|
'Gdot;': '\u0120',
|
|
|
'gdot;': '\u0121',
|
|
|
'gE;': '\u2267',
|
|
|
'ge;': '\u2265',
|
|
|
'gEl;': '\u2a8c',
|
|
|
'gel;': '\u22db',
|
|
|
'geq;': '\u2265',
|
|
|
'geqq;': '\u2267',
|
|
|
'geqslant;': '\u2a7e',
|
|
|
'ges;': '\u2a7e',
|
|
|
'gescc;': '\u2aa9',
|
|
|
'gesdot;': '\u2a80',
|
|
|
'gesdoto;': '\u2a82',
|
|
|
'gesdotol;': '\u2a84',
|
|
|
'gesl;': '\u22db\ufe00',
|
|
|
'gesles;': '\u2a94',
|
|
|
'Gfr;': '\U0001d50a',
|
|
|
'gfr;': '\U0001d524',
|
|
|
'Gg;': '\u22d9',
|
|
|
'gg;': '\u226b',
|
|
|
'ggg;': '\u22d9',
|
|
|
'gimel;': '\u2137',
|
|
|
'GJcy;': '\u0403',
|
|
|
'gjcy;': '\u0453',
|
|
|
'gl;': '\u2277',
|
|
|
'gla;': '\u2aa5',
|
|
|
'glE;': '\u2a92',
|
|
|
'glj;': '\u2aa4',
|
|
|
'gnap;': '\u2a8a',
|
|
|
'gnapprox;': '\u2a8a',
|
|
|
'gnE;': '\u2269',
|
|
|
'gne;': '\u2a88',
|
|
|
'gneq;': '\u2a88',
|
|
|
'gneqq;': '\u2269',
|
|
|
'gnsim;': '\u22e7',
|
|
|
'Gopf;': '\U0001d53e',
|
|
|
'gopf;': '\U0001d558',
|
|
|
'grave;': '`',
|
|
|
'GreaterEqual;': '\u2265',
|
|
|
'GreaterEqualLess;': '\u22db',
|
|
|
'GreaterFullEqual;': '\u2267',
|
|
|
'GreaterGreater;': '\u2aa2',
|
|
|
'GreaterLess;': '\u2277',
|
|
|
'GreaterSlantEqual;': '\u2a7e',
|
|
|
'GreaterTilde;': '\u2273',
|
|
|
'Gscr;': '\U0001d4a2',
|
|
|
'gscr;': '\u210a',
|
|
|
'gsim;': '\u2273',
|
|
|
'gsime;': '\u2a8e',
|
|
|
'gsiml;': '\u2a90',
|
|
|
'GT': '>',
|
|
|
'gt': '>',
|
|
|
'GT;': '>',
|
|
|
'Gt;': '\u226b',
|
|
|
'gt;': '>',
|
|
|
'gtcc;': '\u2aa7',
|
|
|
'gtcir;': '\u2a7a',
|
|
|
'gtdot;': '\u22d7',
|
|
|
'gtlPar;': '\u2995',
|
|
|
'gtquest;': '\u2a7c',
|
|
|
'gtrapprox;': '\u2a86',
|
|
|
'gtrarr;': '\u2978',
|
|
|
'gtrdot;': '\u22d7',
|
|
|
'gtreqless;': '\u22db',
|
|
|
'gtreqqless;': '\u2a8c',
|
|
|
'gtrless;': '\u2277',
|
|
|
'gtrsim;': '\u2273',
|
|
|
'gvertneqq;': '\u2269\ufe00',
|
|
|
'gvnE;': '\u2269\ufe00',
|
|
|
'Hacek;': '\u02c7',
|
|
|
'hairsp;': '\u200a',
|
|
|
'half;': '\xbd',
|
|
|
'hamilt;': '\u210b',
|
|
|
'HARDcy;': '\u042a',
|
|
|
'hardcy;': '\u044a',
|
|
|
'hArr;': '\u21d4',
|
|
|
'harr;': '\u2194',
|
|
|
'harrcir;': '\u2948',
|
|
|
'harrw;': '\u21ad',
|
|
|
'Hat;': '^',
|
|
|
'hbar;': '\u210f',
|
|
|
'Hcirc;': '\u0124',
|
|
|
'hcirc;': '\u0125',
|
|
|
'hearts;': '\u2665',
|
|
|
'heartsuit;': '\u2665',
|
|
|
'hellip;': '\u2026',
|
|
|
'hercon;': '\u22b9',
|
|
|
'Hfr;': '\u210c',
|
|
|
'hfr;': '\U0001d525',
|
|
|
'HilbertSpace;': '\u210b',
|
|
|
'hksearow;': '\u2925',
|
|
|
'hkswarow;': '\u2926',
|
|
|
'hoarr;': '\u21ff',
|
|
|
'homtht;': '\u223b',
|
|
|
'hookleftarrow;': '\u21a9',
|
|
|
'hookrightarrow;': '\u21aa',
|
|
|
'Hopf;': '\u210d',
|
|
|
'hopf;': '\U0001d559',
|
|
|
'horbar;': '\u2015',
|
|
|
'HorizontalLine;': '\u2500',
|
|
|
'Hscr;': '\u210b',
|
|
|
'hscr;': '\U0001d4bd',
|
|
|
'hslash;': '\u210f',
|
|
|
'Hstrok;': '\u0126',
|
|
|
'hstrok;': '\u0127',
|
|
|
'HumpDownHump;': '\u224e',
|
|
|
'HumpEqual;': '\u224f',
|
|
|
'hybull;': '\u2043',
|
|
|
'hyphen;': '\u2010',
|
|
|
'Iacute': '\xcd',
|
|
|
'iacute': '\xed',
|
|
|
'Iacute;': '\xcd',
|
|
|
'iacute;': '\xed',
|
|
|
'ic;': '\u2063',
|
|
|
'Icirc': '\xce',
|
|
|
'icirc': '\xee',
|
|
|
'Icirc;': '\xce',
|
|
|
'icirc;': '\xee',
|
|
|
'Icy;': '\u0418',
|
|
|
'icy;': '\u0438',
|
|
|
'Idot;': '\u0130',
|
|
|
'IEcy;': '\u0415',
|
|
|
'iecy;': '\u0435',
|
|
|
'iexcl': '\xa1',
|
|
|
'iexcl;': '\xa1',
|
|
|
'iff;': '\u21d4',
|
|
|
'Ifr;': '\u2111',
|
|
|
'ifr;': '\U0001d526',
|
|
|
'Igrave': '\xcc',
|
|
|
'igrave': '\xec',
|
|
|
'Igrave;': '\xcc',
|
|
|
'igrave;': '\xec',
|
|
|
'ii;': '\u2148',
|
|
|
'iiiint;': '\u2a0c',
|
|
|
'iiint;': '\u222d',
|
|
|
'iinfin;': '\u29dc',
|
|
|
'iiota;': '\u2129',
|
|
|
'IJlig;': '\u0132',
|
|
|
'ijlig;': '\u0133',
|
|
|
'Im;': '\u2111',
|
|
|
'Imacr;': '\u012a',
|
|
|
'imacr;': '\u012b',
|
|
|
'image;': '\u2111',
|
|
|
'ImaginaryI;': '\u2148',
|
|
|
'imagline;': '\u2110',
|
|
|
'imagpart;': '\u2111',
|
|
|
'imath;': '\u0131',
|
|
|
'imof;': '\u22b7',
|
|
|
'imped;': '\u01b5',
|
|
|
'Implies;': '\u21d2',
|
|
|
'in;': '\u2208',
|
|
|
'incare;': '\u2105',
|
|
|
'infin;': '\u221e',
|
|
|
'infintie;': '\u29dd',
|
|
|
'inodot;': '\u0131',
|
|
|
'Int;': '\u222c',
|
|
|
'int;': '\u222b',
|
|
|
'intcal;': '\u22ba',
|
|
|
'integers;': '\u2124',
|
|
|
'Integral;': '\u222b',
|
|
|
'intercal;': '\u22ba',
|
|
|
'Intersection;': '\u22c2',
|
|
|
'intlarhk;': '\u2a17',
|
|
|
'intprod;': '\u2a3c',
|
|
|
'InvisibleComma;': '\u2063',
|
|
|
'InvisibleTimes;': '\u2062',
|
|
|
'IOcy;': '\u0401',
|
|
|
'iocy;': '\u0451',
|
|
|
'Iogon;': '\u012e',
|
|
|
'iogon;': '\u012f',
|
|
|
'Iopf;': '\U0001d540',
|
|
|
'iopf;': '\U0001d55a',
|
|
|
'Iota;': '\u0399',
|
|
|
'iota;': '\u03b9',
|
|
|
'iprod;': '\u2a3c',
|
|
|
'iquest': '\xbf',
|
|
|
'iquest;': '\xbf',
|
|
|
'Iscr;': '\u2110',
|
|
|
'iscr;': '\U0001d4be',
|
|
|
'isin;': '\u2208',
|
|
|
'isindot;': '\u22f5',
|
|
|
'isinE;': '\u22f9',
|
|
|
'isins;': '\u22f4',
|
|
|
'isinsv;': '\u22f3',
|
|
|
'isinv;': '\u2208',
|
|
|
'it;': '\u2062',
|
|
|
'Itilde;': '\u0128',
|
|
|
'itilde;': '\u0129',
|
|
|
'Iukcy;': '\u0406',
|
|
|
'iukcy;': '\u0456',
|
|
|
'Iuml': '\xcf',
|
|
|
'iuml': '\xef',
|
|
|
'Iuml;': '\xcf',
|
|
|
'iuml;': '\xef',
|
|
|
'Jcirc;': '\u0134',
|
|
|
'jcirc;': '\u0135',
|
|
|
'Jcy;': '\u0419',
|
|
|
'jcy;': '\u0439',
|
|
|
'Jfr;': '\U0001d50d',
|
|
|
'jfr;': '\U0001d527',
|
|
|
'jmath;': '\u0237',
|
|
|
'Jopf;': '\U0001d541',
|
|
|
'jopf;': '\U0001d55b',
|
|
|
'Jscr;': '\U0001d4a5',
|
|
|
'jscr;': '\U0001d4bf',
|
|
|
'Jsercy;': '\u0408',
|
|
|
'jsercy;': '\u0458',
|
|
|
'Jukcy;': '\u0404',
|
|
|
'jukcy;': '\u0454',
|
|
|
'Kappa;': '\u039a',
|
|
|
'kappa;': '\u03ba',
|
|
|
'kappav;': '\u03f0',
|
|
|
'Kcedil;': '\u0136',
|
|
|
'kcedil;': '\u0137',
|
|
|
'Kcy;': '\u041a',
|
|
|
'kcy;': '\u043a',
|
|
|
'Kfr;': '\U0001d50e',
|
|
|
'kfr;': '\U0001d528',
|
|
|
'kgreen;': '\u0138',
|
|
|
'KHcy;': '\u0425',
|
|
|
'khcy;': '\u0445',
|
|
|
'KJcy;': '\u040c',
|
|
|
'kjcy;': '\u045c',
|
|
|
'Kopf;': '\U0001d542',
|
|
|
'kopf;': '\U0001d55c',
|
|
|
'Kscr;': '\U0001d4a6',
|
|
|
'kscr;': '\U0001d4c0',
|
|
|
'lAarr;': '\u21da',
|
|
|
'Lacute;': '\u0139',
|
|
|
'lacute;': '\u013a',
|
|
|
'laemptyv;': '\u29b4',
|
|
|
'lagran;': '\u2112',
|
|
|
'Lambda;': '\u039b',
|
|
|
'lambda;': '\u03bb',
|
|
|
'Lang;': '\u27ea',
|
|
|
'lang;': '\u27e8',
|
|
|
'langd;': '\u2991',
|
|
|
'langle;': '\u27e8',
|
|
|
'lap;': '\u2a85',
|
|
|
'Laplacetrf;': '\u2112',
|
|
|
'laquo': '\xab',
|
|
|
'laquo;': '\xab',
|
|
|
'Larr;': '\u219e',
|
|
|
'lArr;': '\u21d0',
|
|
|
'larr;': '\u2190',
|
|
|
'larrb;': '\u21e4',
|
|
|
'larrbfs;': '\u291f',
|
|
|
'larrfs;': '\u291d',
|
|
|
'larrhk;': '\u21a9',
|
|
|
'larrlp;': '\u21ab',
|
|
|
'larrpl;': '\u2939',
|
|
|
'larrsim;': '\u2973',
|
|
|
'larrtl;': '\u21a2',
|
|
|
'lat;': '\u2aab',
|
|
|
'lAtail;': '\u291b',
|
|
|
'latail;': '\u2919',
|
|
|
'late;': '\u2aad',
|
|
|
'lates;': '\u2aad\ufe00',
|
|
|
'lBarr;': '\u290e',
|
|
|
'lbarr;': '\u290c',
|
|
|
'lbbrk;': '\u2772',
|
|
|
'lbrace;': '{',
|
|
|
'lbrack;': '[',
|
|
|
'lbrke;': '\u298b',
|
|
|
'lbrksld;': '\u298f',
|
|
|
'lbrkslu;': '\u298d',
|
|
|
'Lcaron;': '\u013d',
|
|
|
'lcaron;': '\u013e',
|
|
|
'Lcedil;': '\u013b',
|
|
|
'lcedil;': '\u013c',
|
|
|
'lceil;': '\u2308',
|
|
|
'lcub;': '{',
|
|
|
'Lcy;': '\u041b',
|
|
|
'lcy;': '\u043b',
|
|
|
'ldca;': '\u2936',
|
|
|
'ldquo;': '\u201c',
|
|
|
'ldquor;': '\u201e',
|
|
|
'ldrdhar;': '\u2967',
|
|
|
'ldrushar;': '\u294b',
|
|
|
'ldsh;': '\u21b2',
|
|
|
'lE;': '\u2266',
|
|
|
'le;': '\u2264',
|
|
|
'LeftAngleBracket;': '\u27e8',
|
|
|
'LeftArrow;': '\u2190',
|
|
|
'Leftarrow;': '\u21d0',
|
|
|
'leftarrow;': '\u2190',
|
|
|
'LeftArrowBar;': '\u21e4',
|
|
|
'LeftArrowRightArrow;': '\u21c6',
|
|
|
'leftarrowtail;': '\u21a2',
|
|
|
'LeftCeiling;': '\u2308',
|
|
|
'LeftDoubleBracket;': '\u27e6',
|
|
|
'LeftDownTeeVector;': '\u2961',
|
|
|
'LeftDownVector;': '\u21c3',
|
|
|
'LeftDownVectorBar;': '\u2959',
|
|
|
'LeftFloor;': '\u230a',
|
|
|
'leftharpoondown;': '\u21bd',
|
|
|
'leftharpoonup;': '\u21bc',
|
|
|
'leftleftarrows;': '\u21c7',
|
|
|
'LeftRightArrow;': '\u2194',
|
|
|
'Leftrightarrow;': '\u21d4',
|
|
|
'leftrightarrow;': '\u2194',
|
|
|
'leftrightarrows;': '\u21c6',
|
|
|
'leftrightharpoons;': '\u21cb',
|
|
|
'leftrightsquigarrow;': '\u21ad',
|
|
|
'LeftRightVector;': '\u294e',
|
|
|
'LeftTee;': '\u22a3',
|
|
|
'LeftTeeArrow;': '\u21a4',
|
|
|
'LeftTeeVector;': '\u295a',
|
|
|
'leftthreetimes;': '\u22cb',
|
|
|
'LeftTriangle;': '\u22b2',
|
|
|
'LeftTriangleBar;': '\u29cf',
|
|
|
'LeftTriangleEqual;': '\u22b4',
|
|
|
'LeftUpDownVector;': '\u2951',
|
|
|
'LeftUpTeeVector;': '\u2960',
|
|
|
'LeftUpVector;': '\u21bf',
|
|
|
'LeftUpVectorBar;': '\u2958',
|
|
|
'LeftVector;': '\u21bc',
|
|
|
'LeftVectorBar;': '\u2952',
|
|
|
'lEg;': '\u2a8b',
|
|
|
'leg;': '\u22da',
|
|
|
'leq;': '\u2264',
|
|
|
'leqq;': '\u2266',
|
|
|
'leqslant;': '\u2a7d',
|
|
|
'les;': '\u2a7d',
|
|
|
'lescc;': '\u2aa8',
|
|
|
'lesdot;': '\u2a7f',
|
|
|
'lesdoto;': '\u2a81',
|
|
|
'lesdotor;': '\u2a83',
|
|
|
'lesg;': '\u22da\ufe00',
|
|
|
'lesges;': '\u2a93',
|
|
|
'lessapprox;': '\u2a85',
|
|
|
'lessdot;': '\u22d6',
|
|
|
'lesseqgtr;': '\u22da',
|
|
|
'lesseqqgtr;': '\u2a8b',
|
|
|
'LessEqualGreater;': '\u22da',
|
|
|
'LessFullEqual;': '\u2266',
|
|
|
'LessGreater;': '\u2276',
|
|
|
'lessgtr;': '\u2276',
|
|
|
'LessLess;': '\u2aa1',
|
|
|
'lesssim;': '\u2272',
|
|
|
'LessSlantEqual;': '\u2a7d',
|
|
|
'LessTilde;': '\u2272',
|
|
|
'lfisht;': '\u297c',
|
|
|
'lfloor;': '\u230a',
|
|
|
'Lfr;': '\U0001d50f',
|
|
|
'lfr;': '\U0001d529',
|
|
|
'lg;': '\u2276',
|
|
|
'lgE;': '\u2a91',
|
|
|
'lHar;': '\u2962',
|
|
|
'lhard;': '\u21bd',
|
|
|
'lharu;': '\u21bc',
|
|
|
'lharul;': '\u296a',
|
|
|
'lhblk;': '\u2584',
|
|
|
'LJcy;': '\u0409',
|
|
|
'ljcy;': '\u0459',
|
|
|
'Ll;': '\u22d8',
|
|
|
'll;': '\u226a',
|
|
|
'llarr;': '\u21c7',
|
|
|
'llcorner;': '\u231e',
|
|
|
'Lleftarrow;': '\u21da',
|
|
|
'llhard;': '\u296b',
|
|
|
'lltri;': '\u25fa',
|
|
|
'Lmidot;': '\u013f',
|
|
|
'lmidot;': '\u0140',
|
|
|
'lmoust;': '\u23b0',
|
|
|
'lmoustache;': '\u23b0',
|
|
|
'lnap;': '\u2a89',
|
|
|
'lnapprox;': '\u2a89',
|
|
|
'lnE;': '\u2268',
|
|
|
'lne;': '\u2a87',
|
|
|
'lneq;': '\u2a87',
|
|
|
'lneqq;': '\u2268',
|
|
|
'lnsim;': '\u22e6',
|
|
|
'loang;': '\u27ec',
|
|
|
'loarr;': '\u21fd',
|
|
|
'lobrk;': '\u27e6',
|
|
|
'LongLeftArrow;': '\u27f5',
|
|
|
'Longleftarrow;': '\u27f8',
|
|
|
'longleftarrow;': '\u27f5',
|
|
|
'LongLeftRightArrow;': '\u27f7',
|
|
|
'Longleftrightarrow;': '\u27fa',
|
|
|
'longleftrightarrow;': '\u27f7',
|
|
|
'longmapsto;': '\u27fc',
|
|
|
'LongRightArrow;': '\u27f6',
|
|
|
'Longrightarrow;': '\u27f9',
|
|
|
'longrightarrow;': '\u27f6',
|
|
|
'looparrowleft;': '\u21ab',
|
|
|
'looparrowright;': '\u21ac',
|
|
|
'lopar;': '\u2985',
|
|
|
'Lopf;': '\U0001d543',
|
|
|
'lopf;': '\U0001d55d',
|
|
|
'loplus;': '\u2a2d',
|
|
|
'lotimes;': '\u2a34',
|
|
|
'lowast;': '\u2217',
|
|
|
'lowbar;': '_',
|
|
|
'LowerLeftArrow;': '\u2199',
|
|
|
'LowerRightArrow;': '\u2198',
|
|
|
'loz;': '\u25ca',
|
|
|
'lozenge;': '\u25ca',
|
|
|
'lozf;': '\u29eb',
|
|
|
'lpar;': '(',
|
|
|
'lparlt;': '\u2993',
|
|
|
'lrarr;': '\u21c6',
|
|
|
'lrcorner;': '\u231f',
|
|
|
'lrhar;': '\u21cb',
|
|
|
'lrhard;': '\u296d',
|
|
|
'lrm;': '\u200e',
|
|
|
'lrtri;': '\u22bf',
|
|
|
'lsaquo;': '\u2039',
|
|
|
'Lscr;': '\u2112',
|
|
|
'lscr;': '\U0001d4c1',
|
|
|
'Lsh;': '\u21b0',
|
|
|
'lsh;': '\u21b0',
|
|
|
'lsim;': '\u2272',
|
|
|
'lsime;': '\u2a8d',
|
|
|
'lsimg;': '\u2a8f',
|
|
|
'lsqb;': '[',
|
|
|
'lsquo;': '\u2018',
|
|
|
'lsquor;': '\u201a',
|
|
|
'Lstrok;': '\u0141',
|
|
|
'lstrok;': '\u0142',
|
|
|
'LT': '<',
|
|
|
'lt': '<',
|
|
|
'LT;': '<',
|
|
|
'Lt;': '\u226a',
|
|
|
'lt;': '<',
|
|
|
'ltcc;': '\u2aa6',
|
|
|
'ltcir;': '\u2a79',
|
|
|
'ltdot;': '\u22d6',
|
|
|
'lthree;': '\u22cb',
|
|
|
'ltimes;': '\u22c9',
|
|
|
'ltlarr;': '\u2976',
|
|
|
'ltquest;': '\u2a7b',
|
|
|
'ltri;': '\u25c3',
|
|
|
'ltrie;': '\u22b4',
|
|
|
'ltrif;': '\u25c2',
|
|
|
'ltrPar;': '\u2996',
|
|
|
'lurdshar;': '\u294a',
|
|
|
'luruhar;': '\u2966',
|
|
|
'lvertneqq;': '\u2268\ufe00',
|
|
|
'lvnE;': '\u2268\ufe00',
|
|
|
'macr': '\xaf',
|
|
|
'macr;': '\xaf',
|
|
|
'male;': '\u2642',
|
|
|
'malt;': '\u2720',
|
|
|
'maltese;': '\u2720',
|
|
|
'Map;': '\u2905',
|
|
|
'map;': '\u21a6',
|
|
|
'mapsto;': '\u21a6',
|
|
|
'mapstodown;': '\u21a7',
|
|
|
'mapstoleft;': '\u21a4',
|
|
|
'mapstoup;': '\u21a5',
|
|
|
'marker;': '\u25ae',
|
|
|
'mcomma;': '\u2a29',
|
|
|
'Mcy;': '\u041c',
|
|
|
'mcy;': '\u043c',
|
|
|
'mdash;': '\u2014',
|
|
|
'mDDot;': '\u223a',
|
|
|
'measuredangle;': '\u2221',
|
|
|
'MediumSpace;': '\u205f',
|
|
|
'Mellintrf;': '\u2133',
|
|
|
'Mfr;': '\U0001d510',
|
|
|
'mfr;': '\U0001d52a',
|
|
|
'mho;': '\u2127',
|
|
|
'micro': '\xb5',
|
|
|
'micro;': '\xb5',
|
|
|
'mid;': '\u2223',
|
|
|
'midast;': '*',
|
|
|
'midcir;': '\u2af0',
|
|
|
'middot': '\xb7',
|
|
|
'middot;': '\xb7',
|
|
|
'minus;': '\u2212',
|
|
|
'minusb;': '\u229f',
|
|
|
'minusd;': '\u2238',
|
|
|
'minusdu;': '\u2a2a',
|
|
|
'MinusPlus;': '\u2213',
|
|
|
'mlcp;': '\u2adb',
|
|
|
'mldr;': '\u2026',
|
|
|
'mnplus;': '\u2213',
|
|
|
'models;': '\u22a7',
|
|
|
'Mopf;': '\U0001d544',
|
|
|
'mopf;': '\U0001d55e',
|
|
|
'mp;': '\u2213',
|
|
|
'Mscr;': '\u2133',
|
|
|
'mscr;': '\U0001d4c2',
|
|
|
'mstpos;': '\u223e',
|
|
|
'Mu;': '\u039c',
|
|
|
'mu;': '\u03bc',
|
|
|
'multimap;': '\u22b8',
|
|
|
'mumap;': '\u22b8',
|
|
|
'nabla;': '\u2207',
|
|
|
'Nacute;': '\u0143',
|
|
|
'nacute;': '\u0144',
|
|
|
'nang;': '\u2220\u20d2',
|
|
|
'nap;': '\u2249',
|
|
|
'napE;': '\u2a70\u0338',
|
|
|
'napid;': '\u224b\u0338',
|
|
|
'napos;': '\u0149',
|
|
|
'napprox;': '\u2249',
|
|
|
'natur;': '\u266e',
|
|
|
'natural;': '\u266e',
|
|
|
'naturals;': '\u2115',
|
|
|
'nbsp': '\xa0',
|
|
|
'nbsp;': '\xa0',
|
|
|
'nbump;': '\u224e\u0338',
|
|
|
'nbumpe;': '\u224f\u0338',
|
|
|
'ncap;': '\u2a43',
|
|
|
'Ncaron;': '\u0147',
|
|
|
'ncaron;': '\u0148',
|
|
|
'Ncedil;': '\u0145',
|
|
|
'ncedil;': '\u0146',
|
|
|
'ncong;': '\u2247',
|
|
|
'ncongdot;': '\u2a6d\u0338',
|
|
|
'ncup;': '\u2a42',
|
|
|
'Ncy;': '\u041d',
|
|
|
'ncy;': '\u043d',
|
|
|
'ndash;': '\u2013',
|
|
|
'ne;': '\u2260',
|
|
|
'nearhk;': '\u2924',
|
|
|
'neArr;': '\u21d7',
|
|
|
'nearr;': '\u2197',
|
|
|
'nearrow;': '\u2197',
|
|
|
'nedot;': '\u2250\u0338',
|
|
|
'NegativeMediumSpace;': '\u200b',
|
|
|
'NegativeThickSpace;': '\u200b',
|
|
|
'NegativeThinSpace;': '\u200b',
|
|
|
'NegativeVeryThinSpace;': '\u200b',
|
|
|
'nequiv;': '\u2262',
|
|
|
'nesear;': '\u2928',
|
|
|
'nesim;': '\u2242\u0338',
|
|
|
'NestedGreaterGreater;': '\u226b',
|
|
|
'NestedLessLess;': '\u226a',
|
|
|
'NewLine;': '\n',
|
|
|
'nexist;': '\u2204',
|
|
|
'nexists;': '\u2204',
|
|
|
'Nfr;': '\U0001d511',
|
|
|
'nfr;': '\U0001d52b',
|
|
|
'ngE;': '\u2267\u0338',
|
|
|
'nge;': '\u2271',
|
|
|
'ngeq;': '\u2271',
|
|
|
'ngeqq;': '\u2267\u0338',
|
|
|
'ngeqslant;': '\u2a7e\u0338',
|
|
|
'nges;': '\u2a7e\u0338',
|
|
|
'nGg;': '\u22d9\u0338',
|
|
|
'ngsim;': '\u2275',
|
|
|
'nGt;': '\u226b\u20d2',
|
|
|
'ngt;': '\u226f',
|
|
|
'ngtr;': '\u226f',
|
|
|
'nGtv;': '\u226b\u0338',
|
|
|
'nhArr;': '\u21ce',
|
|
|
'nharr;': '\u21ae',
|
|
|
'nhpar;': '\u2af2',
|
|
|
'ni;': '\u220b',
|
|
|
'nis;': '\u22fc',
|
|
|
'nisd;': '\u22fa',
|
|
|
'niv;': '\u220b',
|
|
|
'NJcy;': '\u040a',
|
|
|
'njcy;': '\u045a',
|
|
|
'nlArr;': '\u21cd',
|
|
|
'nlarr;': '\u219a',
|
|
|
'nldr;': '\u2025',
|
|
|
'nlE;': '\u2266\u0338',
|
|
|
'nle;': '\u2270',
|
|
|
'nLeftarrow;': '\u21cd',
|
|
|
'nleftarrow;': '\u219a',
|
|
|
'nLeftrightarrow;': '\u21ce',
|
|
|
'nleftrightarrow;': '\u21ae',
|
|
|
'nleq;': '\u2270',
|
|
|
'nleqq;': '\u2266\u0338',
|
|
|
'nleqslant;': '\u2a7d\u0338',
|
|
|
'nles;': '\u2a7d\u0338',
|
|
|
'nless;': '\u226e',
|
|
|
'nLl;': '\u22d8\u0338',
|
|
|
'nlsim;': '\u2274',
|
|
|
'nLt;': '\u226a\u20d2',
|
|
|
'nlt;': '\u226e',
|
|
|
'nltri;': '\u22ea',
|
|
|
'nltrie;': '\u22ec',
|
|
|
'nLtv;': '\u226a\u0338',
|
|
|
'nmid;': '\u2224',
|
|
|
'NoBreak;': '\u2060',
|
|
|
'NonBreakingSpace;': '\xa0',
|
|
|
'Nopf;': '\u2115',
|
|
|
'nopf;': '\U0001d55f',
|
|
|
'not': '\xac',
|
|
|
'Not;': '\u2aec',
|
|
|
'not;': '\xac',
|
|
|
'NotCongruent;': '\u2262',
|
|
|
'NotCupCap;': '\u226d',
|
|
|
'NotDoubleVerticalBar;': '\u2226',
|
|
|
'NotElement;': '\u2209',
|
|
|
'NotEqual;': '\u2260',
|
|
|
'NotEqualTilde;': '\u2242\u0338',
|
|
|
'NotExists;': '\u2204',
|
|
|
'NotGreater;': '\u226f',
|
|
|
'NotGreaterEqual;': '\u2271',
|
|
|
'NotGreaterFullEqual;': '\u2267\u0338',
|
|
|
'NotGreaterGreater;': '\u226b\u0338',
|
|
|
'NotGreaterLess;': '\u2279',
|
|
|
'NotGreaterSlantEqual;': '\u2a7e\u0338',
|
|
|
'NotGreaterTilde;': '\u2275',
|
|
|
'NotHumpDownHump;': '\u224e\u0338',
|
|
|
'NotHumpEqual;': '\u224f\u0338',
|
|
|
'notin;': '\u2209',
|
|
|
'notindot;': '\u22f5\u0338',
|
|
|
'notinE;': '\u22f9\u0338',
|
|
|
'notinva;': '\u2209',
|
|
|
'notinvb;': '\u22f7',
|
|
|
'notinvc;': '\u22f6',
|
|
|
'NotLeftTriangle;': '\u22ea',
|
|
|
'NotLeftTriangleBar;': '\u29cf\u0338',
|
|
|
'NotLeftTriangleEqual;': '\u22ec',
|
|
|
'NotLess;': '\u226e',
|
|
|
'NotLessEqual;': '\u2270',
|
|
|
'NotLessGreater;': '\u2278',
|
|
|
'NotLessLess;': '\u226a\u0338',
|
|
|
'NotLessSlantEqual;': '\u2a7d\u0338',
|
|
|
'NotLessTilde;': '\u2274',
|
|
|
'NotNestedGreaterGreater;': '\u2aa2\u0338',
|
|
|
'NotNestedLessLess;': '\u2aa1\u0338',
|
|
|
'notni;': '\u220c',
|
|
|
'notniva;': '\u220c',
|
|
|
'notnivb;': '\u22fe',
|
|
|
'notnivc;': '\u22fd',
|
|
|
'NotPrecedes;': '\u2280',
|
|
|
'NotPrecedesEqual;': '\u2aaf\u0338',
|
|
|
'NotPrecedesSlantEqual;': '\u22e0',
|
|
|
'NotReverseElement;': '\u220c',
|
|
|
'NotRightTriangle;': '\u22eb',
|
|
|
'NotRightTriangleBar;': '\u29d0\u0338',
|
|
|
'NotRightTriangleEqual;': '\u22ed',
|
|
|
'NotSquareSubset;': '\u228f\u0338',
|
|
|
'NotSquareSubsetEqual;': '\u22e2',
|
|
|
'NotSquareSuperset;': '\u2290\u0338',
|
|
|
'NotSquareSupersetEqual;': '\u22e3',
|
|
|
'NotSubset;': '\u2282\u20d2',
|
|
|
'NotSubsetEqual;': '\u2288',
|
|
|
'NotSucceeds;': '\u2281',
|
|
|
'NotSucceedsEqual;': '\u2ab0\u0338',
|
|
|
'NotSucceedsSlantEqual;': '\u22e1',
|
|
|
'NotSucceedsTilde;': '\u227f\u0338',
|
|
|
'NotSuperset;': '\u2283\u20d2',
|
|
|
'NotSupersetEqual;': '\u2289',
|
|
|
'NotTilde;': '\u2241',
|
|
|
'NotTildeEqual;': '\u2244',
|
|
|
'NotTildeFullEqual;': '\u2247',
|
|
|
'NotTildeTilde;': '\u2249',
|
|
|
'NotVerticalBar;': '\u2224',
|
|
|
'npar;': '\u2226',
|
|
|
'nparallel;': '\u2226',
|
|
|
'nparsl;': '\u2afd\u20e5',
|
|
|
'npart;': '\u2202\u0338',
|
|
|
'npolint;': '\u2a14',
|
|
|
'npr;': '\u2280',
|
|
|
'nprcue;': '\u22e0',
|
|
|
'npre;': '\u2aaf\u0338',
|
|
|
'nprec;': '\u2280',
|
|
|
'npreceq;': '\u2aaf\u0338',
|
|
|
'nrArr;': '\u21cf',
|
|
|
'nrarr;': '\u219b',
|
|
|
'nrarrc;': '\u2933\u0338',
|
|
|
'nrarrw;': '\u219d\u0338',
|
|
|
'nRightarrow;': '\u21cf',
|
|
|
'nrightarrow;': '\u219b',
|
|
|
'nrtri;': '\u22eb',
|
|
|
'nrtrie;': '\u22ed',
|
|
|
'nsc;': '\u2281',
|
|
|
'nsccue;': '\u22e1',
|
|
|
'nsce;': '\u2ab0\u0338',
|
|
|
'Nscr;': '\U0001d4a9',
|
|
|
'nscr;': '\U0001d4c3',
|
|
|
'nshortmid;': '\u2224',
|
|
|
'nshortparallel;': '\u2226',
|
|
|
'nsim;': '\u2241',
|
|
|
'nsime;': '\u2244',
|
|
|
'nsimeq;': '\u2244',
|
|
|
'nsmid;': '\u2224',
|
|
|
'nspar;': '\u2226',
|
|
|
'nsqsube;': '\u22e2',
|
|
|
'nsqsupe;': '\u22e3',
|
|
|
'nsub;': '\u2284',
|
|
|
'nsubE;': '\u2ac5\u0338',
|
|
|
'nsube;': '\u2288',
|
|
|
'nsubset;': '\u2282\u20d2',
|
|
|
'nsubseteq;': '\u2288',
|
|
|
'nsubseteqq;': '\u2ac5\u0338',
|
|
|
'nsucc;': '\u2281',
|
|
|
'nsucceq;': '\u2ab0\u0338',
|
|
|
'nsup;': '\u2285',
|
|
|
'nsupE;': '\u2ac6\u0338',
|
|
|
'nsupe;': '\u2289',
|
|
|
'nsupset;': '\u2283\u20d2',
|
|
|
'nsupseteq;': '\u2289',
|
|
|
'nsupseteqq;': '\u2ac6\u0338',
|
|
|
'ntgl;': '\u2279',
|
|
|
'Ntilde': '\xd1',
|
|
|
'ntilde': '\xf1',
|
|
|
'Ntilde;': '\xd1',
|
|
|
'ntilde;': '\xf1',
|
|
|
'ntlg;': '\u2278',
|
|
|
'ntriangleleft;': '\u22ea',
|
|
|
'ntrianglelefteq;': '\u22ec',
|
|
|
'ntriangleright;': '\u22eb',
|
|
|
'ntrianglerighteq;': '\u22ed',
|
|
|
'Nu;': '\u039d',
|
|
|
'nu;': '\u03bd',
|
|
|
'num;': '#',
|
|
|
'numero;': '\u2116',
|
|
|
'numsp;': '\u2007',
|
|
|
'nvap;': '\u224d\u20d2',
|
|
|
'nVDash;': '\u22af',
|
|
|
'nVdash;': '\u22ae',
|
|
|
'nvDash;': '\u22ad',
|
|
|
'nvdash;': '\u22ac',
|
|
|
'nvge;': '\u2265\u20d2',
|
|
|
'nvgt;': '>\u20d2',
|
|
|
'nvHarr;': '\u2904',
|
|
|
'nvinfin;': '\u29de',
|
|
|
'nvlArr;': '\u2902',
|
|
|
'nvle;': '\u2264\u20d2',
|
|
|
'nvlt;': '<\u20d2',
|
|
|
'nvltrie;': '\u22b4\u20d2',
|
|
|
'nvrArr;': '\u2903',
|
|
|
'nvrtrie;': '\u22b5\u20d2',
|
|
|
'nvsim;': '\u223c\u20d2',
|
|
|
'nwarhk;': '\u2923',
|
|
|
'nwArr;': '\u21d6',
|
|
|
'nwarr;': '\u2196',
|
|
|
'nwarrow;': '\u2196',
|
|
|
'nwnear;': '\u2927',
|
|
|
'Oacute': '\xd3',
|
|
|
'oacute': '\xf3',
|
|
|
'Oacute;': '\xd3',
|
|
|
'oacute;': '\xf3',
|
|
|
'oast;': '\u229b',
|
|
|
'ocir;': '\u229a',
|
|
|
'Ocirc': '\xd4',
|
|
|
'ocirc': '\xf4',
|
|
|
'Ocirc;': '\xd4',
|
|
|
'ocirc;': '\xf4',
|
|
|
'Ocy;': '\u041e',
|
|
|
'ocy;': '\u043e',
|
|
|
'odash;': '\u229d',
|
|
|
'Odblac;': '\u0150',
|
|
|
'odblac;': '\u0151',
|
|
|
'odiv;': '\u2a38',
|
|
|
'odot;': '\u2299',
|
|
|
'odsold;': '\u29bc',
|
|
|
'OElig;': '\u0152',
|
|
|
'oelig;': '\u0153',
|
|
|
'ofcir;': '\u29bf',
|
|
|
'Ofr;': '\U0001d512',
|
|
|
'ofr;': '\U0001d52c',
|
|
|
'ogon;': '\u02db',
|
|
|
'Ograve': '\xd2',
|
|
|
'ograve': '\xf2',
|
|
|
'Ograve;': '\xd2',
|
|
|
'ograve;': '\xf2',
|
|
|
'ogt;': '\u29c1',
|
|
|
'ohbar;': '\u29b5',
|
|
|
'ohm;': '\u03a9',
|
|
|
'oint;': '\u222e',
|
|
|
'olarr;': '\u21ba',
|
|
|
'olcir;': '\u29be',
|
|
|
'olcross;': '\u29bb',
|
|
|
'oline;': '\u203e',
|
|
|
'olt;': '\u29c0',
|
|
|
'Omacr;': '\u014c',
|
|
|
'omacr;': '\u014d',
|
|
|
'Omega;': '\u03a9',
|
|
|
'omega;': '\u03c9',
|
|
|
'Omicron;': '\u039f',
|
|
|
'omicron;': '\u03bf',
|
|
|
'omid;': '\u29b6',
|
|
|
'ominus;': '\u2296',
|
|
|
'Oopf;': '\U0001d546',
|
|
|
'oopf;': '\U0001d560',
|
|
|
'opar;': '\u29b7',
|
|
|
'OpenCurlyDoubleQuote;': '\u201c',
|
|
|
'OpenCurlyQuote;': '\u2018',
|
|
|
'operp;': '\u29b9',
|
|
|
'oplus;': '\u2295',
|
|
|
'Or;': '\u2a54',
|
|
|
'or;': '\u2228',
|
|
|
'orarr;': '\u21bb',
|
|
|
'ord;': '\u2a5d',
|
|
|
'order;': '\u2134',
|
|
|
'orderof;': '\u2134',
|
|
|
'ordf': '\xaa',
|
|
|
'ordf;': '\xaa',
|
|
|
'ordm': '\xba',
|
|
|
'ordm;': '\xba',
|
|
|
'origof;': '\u22b6',
|
|
|
'oror;': '\u2a56',
|
|
|
'orslope;': '\u2a57',
|
|
|
'orv;': '\u2a5b',
|
|
|
'oS;': '\u24c8',
|
|
|
'Oscr;': '\U0001d4aa',
|
|
|
'oscr;': '\u2134',
|
|
|
'Oslash': '\xd8',
|
|
|
'oslash': '\xf8',
|
|
|
'Oslash;': '\xd8',
|
|
|
'oslash;': '\xf8',
|
|
|
'osol;': '\u2298',
|
|
|
'Otilde': '\xd5',
|
|
|
'otilde': '\xf5',
|
|
|
'Otilde;': '\xd5',
|
|
|
'otilde;': '\xf5',
|
|
|
'Otimes;': '\u2a37',
|
|
|
'otimes;': '\u2297',
|
|
|
'otimesas;': '\u2a36',
|
|
|
'Ouml': '\xd6',
|
|
|
'ouml': '\xf6',
|
|
|
'Ouml;': '\xd6',
|
|
|
'ouml;': '\xf6',
|
|
|
'ovbar;': '\u233d',
|
|
|
'OverBar;': '\u203e',
|
|
|
'OverBrace;': '\u23de',
|
|
|
'OverBracket;': '\u23b4',
|
|
|
'OverParenthesis;': '\u23dc',
|
|
|
'par;': '\u2225',
|
|
|
'para': '\xb6',
|
|
|
'para;': '\xb6',
|
|
|
'parallel;': '\u2225',
|
|
|
'parsim;': '\u2af3',
|
|
|
'parsl;': '\u2afd',
|
|
|
'part;': '\u2202',
|
|
|
'PartialD;': '\u2202',
|
|
|
'Pcy;': '\u041f',
|
|
|
'pcy;': '\u043f',
|
|
|
'percnt;': '%',
|
|
|
'period;': '.',
|
|
|
'permil;': '\u2030',
|
|
|
'perp;': '\u22a5',
|
|
|
'pertenk;': '\u2031',
|
|
|
'Pfr;': '\U0001d513',
|
|
|
'pfr;': '\U0001d52d',
|
|
|
'Phi;': '\u03a6',
|
|
|
'phi;': '\u03c6',
|
|
|
'phiv;': '\u03d5',
|
|
|
'phmmat;': '\u2133',
|
|
|
'phone;': '\u260e',
|
|
|
'Pi;': '\u03a0',
|
|
|
'pi;': '\u03c0',
|
|
|
'pitchfork;': '\u22d4',
|
|
|
'piv;': '\u03d6',
|
|
|
'planck;': '\u210f',
|
|
|
'planckh;': '\u210e',
|
|
|
'plankv;': '\u210f',
|
|
|
'plus;': '+',
|
|
|
'plusacir;': '\u2a23',
|
|
|
'plusb;': '\u229e',
|
|
|
'pluscir;': '\u2a22',
|
|
|
'plusdo;': '\u2214',
|
|
|
'plusdu;': '\u2a25',
|
|
|
'pluse;': '\u2a72',
|
|
|
'PlusMinus;': '\xb1',
|
|
|
'plusmn': '\xb1',
|
|
|
'plusmn;': '\xb1',
|
|
|
'plussim;': '\u2a26',
|
|
|
'plustwo;': '\u2a27',
|
|
|
'pm;': '\xb1',
|
|
|
'Poincareplane;': '\u210c',
|
|
|
'pointint;': '\u2a15',
|
|
|
'Popf;': '\u2119',
|
|
|
'popf;': '\U0001d561',
|
|
|
'pound': '\xa3',
|
|
|
'pound;': '\xa3',
|
|
|
'Pr;': '\u2abb',
|
|
|
'pr;': '\u227a',
|
|
|
'prap;': '\u2ab7',
|
|
|
'prcue;': '\u227c',
|
|
|
'prE;': '\u2ab3',
|
|
|
'pre;': '\u2aaf',
|
|
|
'prec;': '\u227a',
|
|
|
'precapprox;': '\u2ab7',
|
|
|
'preccurlyeq;': '\u227c',
|
|
|
'Precedes;': '\u227a',
|
|
|
'PrecedesEqual;': '\u2aaf',
|
|
|
'PrecedesSlantEqual;': '\u227c',
|
|
|
'PrecedesTilde;': '\u227e',
|
|
|
'preceq;': '\u2aaf',
|
|
|
'precnapprox;': '\u2ab9',
|
|
|
'precneqq;': '\u2ab5',
|
|
|
'precnsim;': '\u22e8',
|
|
|
'precsim;': '\u227e',
|
|
|
'Prime;': '\u2033',
|
|
|
'prime;': '\u2032',
|
|
|
'primes;': '\u2119',
|
|
|
'prnap;': '\u2ab9',
|
|
|
'prnE;': '\u2ab5',
|
|
|
'prnsim;': '\u22e8',
|
|
|
'prod;': '\u220f',
|
|
|
'Product;': '\u220f',
|
|
|
'profalar;': '\u232e',
|
|
|
'profline;': '\u2312',
|
|
|
'profsurf;': '\u2313',
|
|
|
'prop;': '\u221d',
|
|
|
'Proportion;': '\u2237',
|
|
|
'Proportional;': '\u221d',
|
|
|
'propto;': '\u221d',
|
|
|
'prsim;': '\u227e',
|
|
|
'prurel;': '\u22b0',
|
|
|
'Pscr;': '\U0001d4ab',
|
|
|
'pscr;': '\U0001d4c5',
|
|
|
'Psi;': '\u03a8',
|
|
|
'psi;': '\u03c8',
|
|
|
'puncsp;': '\u2008',
|
|
|
'Qfr;': '\U0001d514',
|
|
|
'qfr;': '\U0001d52e',
|
|
|
'qint;': '\u2a0c',
|
|
|
'Qopf;': '\u211a',
|
|
|
'qopf;': '\U0001d562',
|
|
|
'qprime;': '\u2057',
|
|
|
'Qscr;': '\U0001d4ac',
|
|
|
'qscr;': '\U0001d4c6',
|
|
|
'quaternions;': '\u210d',
|
|
|
'quatint;': '\u2a16',
|
|
|
'quest;': '?',
|
|
|
'questeq;': '\u225f',
|
|
|
'QUOT': '"',
|
|
|
'quot': '"',
|
|
|
'QUOT;': '"',
|
|
|
'quot;': '"',
|
|
|
'rAarr;': '\u21db',
|
|
|
'race;': '\u223d\u0331',
|
|
|
'Racute;': '\u0154',
|
|
|
'racute;': '\u0155',
|
|
|
'radic;': '\u221a',
|
|
|
'raemptyv;': '\u29b3',
|
|
|
'Rang;': '\u27eb',
|
|
|
'rang;': '\u27e9',
|
|
|
'rangd;': '\u2992',
|
|
|
'range;': '\u29a5',
|
|
|
'rangle;': '\u27e9',
|
|
|
'raquo': '\xbb',
|
|
|
'raquo;': '\xbb',
|
|
|
'Rarr;': '\u21a0',
|
|
|
'rArr;': '\u21d2',
|
|
|
'rarr;': '\u2192',
|
|
|
'rarrap;': '\u2975',
|
|
|
'rarrb;': '\u21e5',
|
|
|
'rarrbfs;': '\u2920',
|
|
|
'rarrc;': '\u2933',
|
|
|
'rarrfs;': '\u291e',
|
|
|
'rarrhk;': '\u21aa',
|
|
|
'rarrlp;': '\u21ac',
|
|
|
'rarrpl;': '\u2945',
|
|
|
'rarrsim;': '\u2974',
|
|
|
'Rarrtl;': '\u2916',
|
|
|
'rarrtl;': '\u21a3',
|
|
|
'rarrw;': '\u219d',
|
|
|
'rAtail;': '\u291c',
|
|
|
'ratail;': '\u291a',
|
|
|
'ratio;': '\u2236',
|
|
|
'rationals;': '\u211a',
|
|
|
'RBarr;': '\u2910',
|
|
|
'rBarr;': '\u290f',
|
|
|
'rbarr;': '\u290d',
|
|
|
'rbbrk;': '\u2773',
|
|
|
'rbrace;': '}',
|
|
|
'rbrack;': ']',
|
|
|
'rbrke;': '\u298c',
|
|
|
'rbrksld;': '\u298e',
|
|
|
'rbrkslu;': '\u2990',
|
|
|
'Rcaron;': '\u0158',
|
|
|
'rcaron;': '\u0159',
|
|
|
'Rcedil;': '\u0156',
|
|
|
'rcedil;': '\u0157',
|
|
|
'rceil;': '\u2309',
|
|
|
'rcub;': '}',
|
|
|
'Rcy;': '\u0420',
|
|
|
'rcy;': '\u0440',
|
|
|
'rdca;': '\u2937',
|
|
|
'rdldhar;': '\u2969',
|
|
|
'rdquo;': '\u201d',
|
|
|
'rdquor;': '\u201d',
|
|
|
'rdsh;': '\u21b3',
|
|
|
'Re;': '\u211c',
|
|
|
'real;': '\u211c',
|
|
|
'realine;': '\u211b',
|
|
|
'realpart;': '\u211c',
|
|
|
'reals;': '\u211d',
|
|
|
'rect;': '\u25ad',
|
|
|
'REG': '\xae',
|
|
|
'reg': '\xae',
|
|
|
'REG;': '\xae',
|
|
|
'reg;': '\xae',
|
|
|
'ReverseElement;': '\u220b',
|
|
|
'ReverseEquilibrium;': '\u21cb',
|
|
|
'ReverseUpEquilibrium;': '\u296f',
|
|
|
'rfisht;': '\u297d',
|
|
|
'rfloor;': '\u230b',
|
|
|
'Rfr;': '\u211c',
|
|
|
'rfr;': '\U0001d52f',
|
|
|
'rHar;': '\u2964',
|
|
|
'rhard;': '\u21c1',
|
|
|
'rharu;': '\u21c0',
|
|
|
'rharul;': '\u296c',
|
|
|
'Rho;': '\u03a1',
|
|
|
'rho;': '\u03c1',
|
|
|
'rhov;': '\u03f1',
|
|
|
'RightAngleBracket;': '\u27e9',
|
|
|
'RightArrow;': '\u2192',
|
|
|
'Rightarrow;': '\u21d2',
|
|
|
'rightarrow;': '\u2192',
|
|
|
'RightArrowBar;': '\u21e5',
|
|
|
'RightArrowLeftArrow;': '\u21c4',
|
|
|
'rightarrowtail;': '\u21a3',
|
|
|
'RightCeiling;': '\u2309',
|
|
|
'RightDoubleBracket;': '\u27e7',
|
|
|
'RightDownTeeVector;': '\u295d',
|
|
|
'RightDownVector;': '\u21c2',
|
|
|
'RightDownVectorBar;': '\u2955',
|
|
|
'RightFloor;': '\u230b',
|
|
|
'rightharpoondown;': '\u21c1',
|
|
|
'rightharpoonup;': '\u21c0',
|
|
|
'rightleftarrows;': '\u21c4',
|
|
|
'rightleftharpoons;': '\u21cc',
|
|
|
'rightrightarrows;': '\u21c9',
|
|
|
'rightsquigarrow;': '\u219d',
|
|
|
'RightTee;': '\u22a2',
|
|
|
'RightTeeArrow;': '\u21a6',
|
|
|
'RightTeeVector;': '\u295b',
|
|
|
'rightthreetimes;': '\u22cc',
|
|
|
'RightTriangle;': '\u22b3',
|
|
|
'RightTriangleBar;': '\u29d0',
|
|
|
'RightTriangleEqual;': '\u22b5',
|
|
|
'RightUpDownVector;': '\u294f',
|
|
|
'RightUpTeeVector;': '\u295c',
|
|
|
'RightUpVector;': '\u21be',
|
|
|
'RightUpVectorBar;': '\u2954',
|
|
|
'RightVector;': '\u21c0',
|
|
|
'RightVectorBar;': '\u2953',
|
|
|
'ring;': '\u02da',
|
|
|
'risingdotseq;': '\u2253',
|
|
|
'rlarr;': '\u21c4',
|
|
|
'rlhar;': '\u21cc',
|
|
|
'rlm;': '\u200f',
|
|
|
'rmoust;': '\u23b1',
|
|
|
'rmoustache;': '\u23b1',
|
|
|
'rnmid;': '\u2aee',
|
|
|
'roang;': '\u27ed',
|
|
|
'roarr;': '\u21fe',
|
|
|
'robrk;': '\u27e7',
|
|
|
'ropar;': '\u2986',
|
|
|
'Ropf;': '\u211d',
|
|
|
'ropf;': '\U0001d563',
|
|
|
'roplus;': '\u2a2e',
|
|
|
'rotimes;': '\u2a35',
|
|
|
'RoundImplies;': '\u2970',
|
|
|
'rpar;': ')',
|
|
|
'rpargt;': '\u2994',
|
|
|
'rppolint;': '\u2a12',
|
|
|
'rrarr;': '\u21c9',
|
|
|
'Rrightarrow;': '\u21db',
|
|
|
'rsaquo;': '\u203a',
|
|
|
'Rscr;': '\u211b',
|
|
|
'rscr;': '\U0001d4c7',
|
|
|
'Rsh;': '\u21b1',
|
|
|
'rsh;': '\u21b1',
|
|
|
'rsqb;': ']',
|
|
|
'rsquo;': '\u2019',
|
|
|
'rsquor;': '\u2019',
|
|
|
'rthree;': '\u22cc',
|
|
|
'rtimes;': '\u22ca',
|
|
|
'rtri;': '\u25b9',
|
|
|
'rtrie;': '\u22b5',
|
|
|
'rtrif;': '\u25b8',
|
|
|
'rtriltri;': '\u29ce',
|
|
|
'RuleDelayed;': '\u29f4',
|
|
|
'ruluhar;': '\u2968',
|
|
|
'rx;': '\u211e',
|
|
|
'Sacute;': '\u015a',
|
|
|
'sacute;': '\u015b',
|
|
|
'sbquo;': '\u201a',
|
|
|
'Sc;': '\u2abc',
|
|
|
'sc;': '\u227b',
|
|
|
'scap;': '\u2ab8',
|
|
|
'Scaron;': '\u0160',
|
|
|
'scaron;': '\u0161',
|
|
|
'sccue;': '\u227d',
|
|
|
'scE;': '\u2ab4',
|
|
|
'sce;': '\u2ab0',
|
|
|
'Scedil;': '\u015e',
|
|
|
'scedil;': '\u015f',
|
|
|
'Scirc;': '\u015c',
|
|
|
'scirc;': '\u015d',
|
|
|
'scnap;': '\u2aba',
|
|
|
'scnE;': '\u2ab6',
|
|
|
'scnsim;': '\u22e9',
|
|
|
'scpolint;': '\u2a13',
|
|
|
'scsim;': '\u227f',
|
|
|
'Scy;': '\u0421',
|
|
|
'scy;': '\u0441',
|
|
|
'sdot;': '\u22c5',
|
|
|
'sdotb;': '\u22a1',
|
|
|
'sdote;': '\u2a66',
|
|
|
'searhk;': '\u2925',
|
|
|
'seArr;': '\u21d8',
|
|
|
'searr;': '\u2198',
|
|
|
'searrow;': '\u2198',
|
|
|
'sect': '\xa7',
|
|
|
'sect;': '\xa7',
|
|
|
'semi;': ';',
|
|
|
'seswar;': '\u2929',
|
|
|
'setminus;': '\u2216',
|
|
|
'setmn;': '\u2216',
|
|
|
'sext;': '\u2736',
|
|
|
'Sfr;': '\U0001d516',
|
|
|
'sfr;': '\U0001d530',
|
|
|
'sfrown;': '\u2322',
|
|
|
'sharp;': '\u266f',
|
|
|
'SHCHcy;': '\u0429',
|
|
|
'shchcy;': '\u0449',
|
|
|
'SHcy;': '\u0428',
|
|
|
'shcy;': '\u0448',
|
|
|
'ShortDownArrow;': '\u2193',
|
|
|
'ShortLeftArrow;': '\u2190',
|
|
|
'shortmid;': '\u2223',
|
|
|
'shortparallel;': '\u2225',
|
|
|
'ShortRightArrow;': '\u2192',
|
|
|
'ShortUpArrow;': '\u2191',
|
|
|
'shy': '\xad',
|
|
|
'shy;': '\xad',
|
|
|
'Sigma;': '\u03a3',
|
|
|
'sigma;': '\u03c3',
|
|
|
'sigmaf;': '\u03c2',
|
|
|
'sigmav;': '\u03c2',
|
|
|
'sim;': '\u223c',
|
|
|
'simdot;': '\u2a6a',
|
|
|
'sime;': '\u2243',
|
|
|
'simeq;': '\u2243',
|
|
|
'simg;': '\u2a9e',
|
|
|
'simgE;': '\u2aa0',
|
|
|
'siml;': '\u2a9d',
|
|
|
'simlE;': '\u2a9f',
|
|
|
'simne;': '\u2246',
|
|
|
'simplus;': '\u2a24',
|
|
|
'simrarr;': '\u2972',
|
|
|
'slarr;': '\u2190',
|
|
|
'SmallCircle;': '\u2218',
|
|
|
'smallsetminus;': '\u2216',
|
|
|
'smashp;': '\u2a33',
|
|
|
'smeparsl;': '\u29e4',
|
|
|
'smid;': '\u2223',
|
|
|
'smile;': '\u2323',
|
|
|
'smt;': '\u2aaa',
|
|
|
'smte;': '\u2aac',
|
|
|
'smtes;': '\u2aac\ufe00',
|
|
|
'SOFTcy;': '\u042c',
|
|
|
'softcy;': '\u044c',
|
|
|
'sol;': '/',
|
|
|
'solb;': '\u29c4',
|
|
|
'solbar;': '\u233f',
|
|
|
'Sopf;': '\U0001d54a',
|
|
|
'sopf;': '\U0001d564',
|
|
|
'spades;': '\u2660',
|
|
|
'spadesuit;': '\u2660',
|
|
|
'spar;': '\u2225',
|
|
|
'sqcap;': '\u2293',
|
|
|
'sqcaps;': '\u2293\ufe00',
|
|
|
'sqcup;': '\u2294',
|
|
|
'sqcups;': '\u2294\ufe00',
|
|
|
'Sqrt;': '\u221a',
|
|
|
'sqsub;': '\u228f',
|
|
|
'sqsube;': '\u2291',
|
|
|
'sqsubset;': '\u228f',
|
|
|
'sqsubseteq;': '\u2291',
|
|
|
'sqsup;': '\u2290',
|
|
|
'sqsupe;': '\u2292',
|
|
|
'sqsupset;': '\u2290',
|
|
|
'sqsupseteq;': '\u2292',
|
|
|
'squ;': '\u25a1',
|
|
|
'Square;': '\u25a1',
|
|
|
'square;': '\u25a1',
|
|
|
'SquareIntersection;': '\u2293',
|
|
|
'SquareSubset;': '\u228f',
|
|
|
'SquareSubsetEqual;': '\u2291',
|
|
|
'SquareSuperset;': '\u2290',
|
|
|
'SquareSupersetEqual;': '\u2292',
|
|
|
'SquareUnion;': '\u2294',
|
|
|
'squarf;': '\u25aa',
|
|
|
'squf;': '\u25aa',
|
|
|
'srarr;': '\u2192',
|
|
|
'Sscr;': '\U0001d4ae',
|
|
|
'sscr;': '\U0001d4c8',
|
|
|
'ssetmn;': '\u2216',
|
|
|
'ssmile;': '\u2323',
|
|
|
'sstarf;': '\u22c6',
|
|
|
'Star;': '\u22c6',
|
|
|
'star;': '\u2606',
|
|
|
'starf;': '\u2605',
|
|
|
'straightepsilon;': '\u03f5',
|
|
|
'straightphi;': '\u03d5',
|
|
|
'strns;': '\xaf',
|
|
|
'Sub;': '\u22d0',
|
|
|
'sub;': '\u2282',
|
|
|
'subdot;': '\u2abd',
|
|
|
'subE;': '\u2ac5',
|
|
|
'sube;': '\u2286',
|
|
|
'subedot;': '\u2ac3',
|
|
|
'submult;': '\u2ac1',
|
|
|
'subnE;': '\u2acb',
|
|
|
'subne;': '\u228a',
|
|
|
'subplus;': '\u2abf',
|
|
|
'subrarr;': '\u2979',
|
|
|
'Subset;': '\u22d0',
|
|
|
'subset;': '\u2282',
|
|
|
'subseteq;': '\u2286',
|
|
|
'subseteqq;': '\u2ac5',
|
|
|
'SubsetEqual;': '\u2286',
|
|
|
'subsetneq;': '\u228a',
|
|
|
'subsetneqq;': '\u2acb',
|
|
|
'subsim;': '\u2ac7',
|
|
|
'subsub;': '\u2ad5',
|
|
|
'subsup;': '\u2ad3',
|
|
|
'succ;': '\u227b',
|
|
|
'succapprox;': '\u2ab8',
|
|
|
'succcurlyeq;': '\u227d',
|
|
|
'Succeeds;': '\u227b',
|
|
|
'SucceedsEqual;': '\u2ab0',
|
|
|
'SucceedsSlantEqual;': '\u227d',
|
|
|
'SucceedsTilde;': '\u227f',
|
|
|
'succeq;': '\u2ab0',
|
|
|
'succnapprox;': '\u2aba',
|
|
|
'succneqq;': '\u2ab6',
|
|
|
'succnsim;': '\u22e9',
|
|
|
'succsim;': '\u227f',
|
|
|
'SuchThat;': '\u220b',
|
|
|
'Sum;': '\u2211',
|
|
|
'sum;': '\u2211',
|
|
|
'sung;': '\u266a',
|
|
|
'sup1': '\xb9',
|
|
|
'sup1;': '\xb9',
|
|
|
'sup2': '\xb2',
|
|
|
'sup2;': '\xb2',
|
|
|
'sup3': '\xb3',
|
|
|
'sup3;': '\xb3',
|
|
|
'Sup;': '\u22d1',
|
|
|
'sup;': '\u2283',
|
|
|
'supdot;': '\u2abe',
|
|
|
'supdsub;': '\u2ad8',
|
|
|
'supE;': '\u2ac6',
|
|
|
'supe;': '\u2287',
|
|
|
'supedot;': '\u2ac4',
|
|
|
'Superset;': '\u2283',
|
|
|
'SupersetEqual;': '\u2287',
|
|
|
'suphsol;': '\u27c9',
|
|
|
'suphsub;': '\u2ad7',
|
|
|
'suplarr;': '\u297b',
|
|
|
'supmult;': '\u2ac2',
|
|
|
'supnE;': '\u2acc',
|
|
|
'supne;': '\u228b',
|
|
|
'supplus;': '\u2ac0',
|
|
|
'Supset;': '\u22d1',
|
|
|
'supset;': '\u2283',
|
|
|
'supseteq;': '\u2287',
|
|
|
'supseteqq;': '\u2ac6',
|
|
|
'supsetneq;': '\u228b',
|
|
|
'supsetneqq;': '\u2acc',
|
|
|
'supsim;': '\u2ac8',
|
|
|
'supsub;': '\u2ad4',
|
|
|
'supsup;': '\u2ad6',
|
|
|
'swarhk;': '\u2926',
|
|
|
'swArr;': '\u21d9',
|
|
|
'swarr;': '\u2199',
|
|
|
'swarrow;': '\u2199',
|
|
|
'swnwar;': '\u292a',
|
|
|
'szlig': '\xdf',
|
|
|
'szlig;': '\xdf',
|
|
|
'Tab;': '\t',
|
|
|
'target;': '\u2316',
|
|
|
'Tau;': '\u03a4',
|
|
|
'tau;': '\u03c4',
|
|
|
'tbrk;': '\u23b4',
|
|
|
'Tcaron;': '\u0164',
|
|
|
'tcaron;': '\u0165',
|
|
|
'Tcedil;': '\u0162',
|
|
|
'tcedil;': '\u0163',
|
|
|
'Tcy;': '\u0422',
|
|
|
'tcy;': '\u0442',
|
|
|
'tdot;': '\u20db',
|
|
|
'telrec;': '\u2315',
|
|
|
'Tfr;': '\U0001d517',
|
|
|
'tfr;': '\U0001d531',
|
|
|
'there4;': '\u2234',
|
|
|
'Therefore;': '\u2234',
|
|
|
'therefore;': '\u2234',
|
|
|
'Theta;': '\u0398',
|
|
|
'theta;': '\u03b8',
|
|
|
'thetasym;': '\u03d1',
|
|
|
'thetav;': '\u03d1',
|
|
|
'thickapprox;': '\u2248',
|
|
|
'thicksim;': '\u223c',
|
|
|
'ThickSpace;': '\u205f\u200a',
|
|
|
'thinsp;': '\u2009',
|
|
|
'ThinSpace;': '\u2009',
|
|
|
'thkap;': '\u2248',
|
|
|
'thksim;': '\u223c',
|
|
|
'THORN': '\xde',
|
|
|
'thorn': '\xfe',
|
|
|
'THORN;': '\xde',
|
|
|
'thorn;': '\xfe',
|
|
|
'Tilde;': '\u223c',
|
|
|
'tilde;': '\u02dc',
|
|
|
'TildeEqual;': '\u2243',
|
|
|
'TildeFullEqual;': '\u2245',
|
|
|
'TildeTilde;': '\u2248',
|
|
|
'times': '\xd7',
|
|
|
'times;': '\xd7',
|
|
|
'timesb;': '\u22a0',
|
|
|
'timesbar;': '\u2a31',
|
|
|
'timesd;': '\u2a30',
|
|
|
'tint;': '\u222d',
|
|
|
'toea;': '\u2928',
|
|
|
'top;': '\u22a4',
|
|
|
'topbot;': '\u2336',
|
|
|
'topcir;': '\u2af1',
|
|
|
'Topf;': '\U0001d54b',
|
|
|
'topf;': '\U0001d565',
|
|
|
'topfork;': '\u2ada',
|
|
|
'tosa;': '\u2929',
|
|
|
'tprime;': '\u2034',
|
|
|
'TRADE;': '\u2122',
|
|
|
'trade;': '\u2122',
|
|
|
'triangle;': '\u25b5',
|
|
|
'triangledown;': '\u25bf',
|
|
|
'triangleleft;': '\u25c3',
|
|
|
'trianglelefteq;': '\u22b4',
|
|
|
'triangleq;': '\u225c',
|
|
|
'triangleright;': '\u25b9',
|
|
|
'trianglerighteq;': '\u22b5',
|
|
|
'tridot;': '\u25ec',
|
|
|
'trie;': '\u225c',
|
|
|
'triminus;': '\u2a3a',
|
|
|
'TripleDot;': '\u20db',
|
|
|
'triplus;': '\u2a39',
|
|
|
'trisb;': '\u29cd',
|
|
|
'tritime;': '\u2a3b',
|
|
|
'trpezium;': '\u23e2',
|
|
|
'Tscr;': '\U0001d4af',
|
|
|
'tscr;': '\U0001d4c9',
|
|
|
'TScy;': '\u0426',
|
|
|
'tscy;': '\u0446',
|
|
|
'TSHcy;': '\u040b',
|
|
|
'tshcy;': '\u045b',
|
|
|
'Tstrok;': '\u0166',
|
|
|
'tstrok;': '\u0167',
|
|
|
'twixt;': '\u226c',
|
|
|
'twoheadleftarrow;': '\u219e',
|
|
|
'twoheadrightarrow;': '\u21a0',
|
|
|
'Uacute': '\xda',
|
|
|
'uacute': '\xfa',
|
|
|
'Uacute;': '\xda',
|
|
|
'uacute;': '\xfa',
|
|
|
'Uarr;': '\u219f',
|
|
|
'uArr;': '\u21d1',
|
|
|
'uarr;': '\u2191',
|
|
|
'Uarrocir;': '\u2949',
|
|
|
'Ubrcy;': '\u040e',
|
|
|
'ubrcy;': '\u045e',
|
|
|
'Ubreve;': '\u016c',
|
|
|
'ubreve;': '\u016d',
|
|
|
'Ucirc': '\xdb',
|
|
|
'ucirc': '\xfb',
|
|
|
'Ucirc;': '\xdb',
|
|
|
'ucirc;': '\xfb',
|
|
|
'Ucy;': '\u0423',
|
|
|
'ucy;': '\u0443',
|
|
|
'udarr;': '\u21c5',
|
|
|
'Udblac;': '\u0170',
|
|
|
'udblac;': '\u0171',
|
|
|
'udhar;': '\u296e',
|
|
|
'ufisht;': '\u297e',
|
|
|
'Ufr;': '\U0001d518',
|
|
|
'ufr;': '\U0001d532',
|
|
|
'Ugrave': '\xd9',
|
|
|
'ugrave': '\xf9',
|
|
|
'Ugrave;': '\xd9',
|
|
|
'ugrave;': '\xf9',
|
|
|
'uHar;': '\u2963',
|
|
|
'uharl;': '\u21bf',
|
|
|
'uharr;': '\u21be',
|
|
|
'uhblk;': '\u2580',
|
|
|
'ulcorn;': '\u231c',
|
|
|
'ulcorner;': '\u231c',
|
|
|
'ulcrop;': '\u230f',
|
|
|
'ultri;': '\u25f8',
|
|
|
'Umacr;': '\u016a',
|
|
|
'umacr;': '\u016b',
|
|
|
'uml': '\xa8',
|
|
|
'uml;': '\xa8',
|
|
|
'UnderBar;': '_',
|
|
|
'UnderBrace;': '\u23df',
|
|
|
'UnderBracket;': '\u23b5',
|
|
|
'UnderParenthesis;': '\u23dd',
|
|
|
'Union;': '\u22c3',
|
|
|
'UnionPlus;': '\u228e',
|
|
|
'Uogon;': '\u0172',
|
|
|
'uogon;': '\u0173',
|
|
|
'Uopf;': '\U0001d54c',
|
|
|
'uopf;': '\U0001d566',
|
|
|
'UpArrow;': '\u2191',
|
|
|
'Uparrow;': '\u21d1',
|
|
|
'uparrow;': '\u2191',
|
|
|
'UpArrowBar;': '\u2912',
|
|
|
'UpArrowDownArrow;': '\u21c5',
|
|
|
'UpDownArrow;': '\u2195',
|
|
|
'Updownarrow;': '\u21d5',
|
|
|
'updownarrow;': '\u2195',
|
|
|
'UpEquilibrium;': '\u296e',
|
|
|
'upharpoonleft;': '\u21bf',
|
|
|
'upharpoonright;': '\u21be',
|
|
|
'uplus;': '\u228e',
|
|
|
'UpperLeftArrow;': '\u2196',
|
|
|
'UpperRightArrow;': '\u2197',
|
|
|
'Upsi;': '\u03d2',
|
|
|
'upsi;': '\u03c5',
|
|
|
'upsih;': '\u03d2',
|
|
|
'Upsilon;': '\u03a5',
|
|
|
'upsilon;': '\u03c5',
|
|
|
'UpTee;': '\u22a5',
|
|
|
'UpTeeArrow;': '\u21a5',
|
|
|
'upuparrows;': '\u21c8',
|
|
|
'urcorn;': '\u231d',
|
|
|
'urcorner;': '\u231d',
|
|
|
'urcrop;': '\u230e',
|
|
|
'Uring;': '\u016e',
|
|
|
'uring;': '\u016f',
|
|
|
'urtri;': '\u25f9',
|
|
|
'Uscr;': '\U0001d4b0',
|
|
|
'uscr;': '\U0001d4ca',
|
|
|
'utdot;': '\u22f0',
|
|
|
'Utilde;': '\u0168',
|
|
|
'utilde;': '\u0169',
|
|
|
'utri;': '\u25b5',
|
|
|
'utrif;': '\u25b4',
|
|
|
'uuarr;': '\u21c8',
|
|
|
'Uuml': '\xdc',
|
|
|
'uuml': '\xfc',
|
|
|
'Uuml;': '\xdc',
|
|
|
'uuml;': '\xfc',
|
|
|
'uwangle;': '\u29a7',
|
|
|
'vangrt;': '\u299c',
|
|
|
'varepsilon;': '\u03f5',
|
|
|
'varkappa;': '\u03f0',
|
|
|
'varnothing;': '\u2205',
|
|
|
'varphi;': '\u03d5',
|
|
|
'varpi;': '\u03d6',
|
|
|
'varpropto;': '\u221d',
|
|
|
'vArr;': '\u21d5',
|
|
|
'varr;': '\u2195',
|
|
|
'varrho;': '\u03f1',
|
|
|
'varsigma;': '\u03c2',
|
|
|
'varsubsetneq;': '\u228a\ufe00',
|
|
|
'varsubsetneqq;': '\u2acb\ufe00',
|
|
|
'varsupsetneq;': '\u228b\ufe00',
|
|
|
'varsupsetneqq;': '\u2acc\ufe00',
|
|
|
'vartheta;': '\u03d1',
|
|
|
'vartriangleleft;': '\u22b2',
|
|
|
'vartriangleright;': '\u22b3',
|
|
|
'Vbar;': '\u2aeb',
|
|
|
'vBar;': '\u2ae8',
|
|
|
'vBarv;': '\u2ae9',
|
|
|
'Vcy;': '\u0412',
|
|
|
'vcy;': '\u0432',
|
|
|
'VDash;': '\u22ab',
|
|
|
'Vdash;': '\u22a9',
|
|
|
'vDash;': '\u22a8',
|
|
|
'vdash;': '\u22a2',
|
|
|
'Vdashl;': '\u2ae6',
|
|
|
'Vee;': '\u22c1',
|
|
|
'vee;': '\u2228',
|
|
|
'veebar;': '\u22bb',
|
|
|
'veeeq;': '\u225a',
|
|
|
'vellip;': '\u22ee',
|
|
|
'Verbar;': '\u2016',
|
|
|
'verbar;': '|',
|
|
|
'Vert;': '\u2016',
|
|
|
'vert;': '|',
|
|
|
'VerticalBar;': '\u2223',
|
|
|
'VerticalLine;': '|',
|
|
|
'VerticalSeparator;': '\u2758',
|
|
|
'VerticalTilde;': '\u2240',
|
|
|
'VeryThinSpace;': '\u200a',
|
|
|
'Vfr;': '\U0001d519',
|
|
|
'vfr;': '\U0001d533',
|
|
|
'vltri;': '\u22b2',
|
|
|
'vnsub;': '\u2282\u20d2',
|
|
|
'vnsup;': '\u2283\u20d2',
|
|
|
'Vopf;': '\U0001d54d',
|
|
|
'vopf;': '\U0001d567',
|
|
|
'vprop;': '\u221d',
|
|
|
'vrtri;': '\u22b3',
|
|
|
'Vscr;': '\U0001d4b1',
|
|
|
'vscr;': '\U0001d4cb',
|
|
|
'vsubnE;': '\u2acb\ufe00',
|
|
|
'vsubne;': '\u228a\ufe00',
|
|
|
'vsupnE;': '\u2acc\ufe00',
|
|
|
'vsupne;': '\u228b\ufe00',
|
|
|
'Vvdash;': '\u22aa',
|
|
|
'vzigzag;': '\u299a',
|
|
|
'Wcirc;': '\u0174',
|
|
|
'wcirc;': '\u0175',
|
|
|
'wedbar;': '\u2a5f',
|
|
|
'Wedge;': '\u22c0',
|
|
|
'wedge;': '\u2227',
|
|
|
'wedgeq;': '\u2259',
|
|
|
'weierp;': '\u2118',
|
|
|
'Wfr;': '\U0001d51a',
|
|
|
'wfr;': '\U0001d534',
|
|
|
'Wopf;': '\U0001d54e',
|
|
|
'wopf;': '\U0001d568',
|
|
|
'wp;': '\u2118',
|
|
|
'wr;': '\u2240',
|
|
|
'wreath;': '\u2240',
|
|
|
'Wscr;': '\U0001d4b2',
|
|
|
'wscr;': '\U0001d4cc',
|
|
|
'xcap;': '\u22c2',
|
|
|
'xcirc;': '\u25ef',
|
|
|
'xcup;': '\u22c3',
|
|
|
'xdtri;': '\u25bd',
|
|
|
'Xfr;': '\U0001d51b',
|
|
|
'xfr;': '\U0001d535',
|
|
|
'xhArr;': '\u27fa',
|
|
|
'xharr;': '\u27f7',
|
|
|
'Xi;': '\u039e',
|
|
|
'xi;': '\u03be',
|
|
|
'xlArr;': '\u27f8',
|
|
|
'xlarr;': '\u27f5',
|
|
|
'xmap;': '\u27fc',
|
|
|
'xnis;': '\u22fb',
|
|
|
'xodot;': '\u2a00',
|
|
|
'Xopf;': '\U0001d54f',
|
|
|
'xopf;': '\U0001d569',
|
|
|
'xoplus;': '\u2a01',
|
|
|
'xotime;': '\u2a02',
|
|
|
'xrArr;': '\u27f9',
|
|
|
'xrarr;': '\u27f6',
|
|
|
'Xscr;': '\U0001d4b3',
|
|
|
'xscr;': '\U0001d4cd',
|
|
|
'xsqcup;': '\u2a06',
|
|
|
'xuplus;': '\u2a04',
|
|
|
'xutri;': '\u25b3',
|
|
|
'xvee;': '\u22c1',
|
|
|
'xwedge;': '\u22c0',
|
|
|
'Yacute': '\xdd',
|
|
|
'yacute': '\xfd',
|
|
|
'Yacute;': '\xdd',
|
|
|
'yacute;': '\xfd',
|
|
|
'YAcy;': '\u042f',
|
|
|
'yacy;': '\u044f',
|
|
|
'Ycirc;': '\u0176',
|
|
|
'ycirc;': '\u0177',
|
|
|
'Ycy;': '\u042b',
|
|
|
'ycy;': '\u044b',
|
|
|
'yen': '\xa5',
|
|
|
'yen;': '\xa5',
|
|
|
'Yfr;': '\U0001d51c',
|
|
|
'yfr;': '\U0001d536',
|
|
|
'YIcy;': '\u0407',
|
|
|
'yicy;': '\u0457',
|
|
|
'Yopf;': '\U0001d550',
|
|
|
'yopf;': '\U0001d56a',
|
|
|
'Yscr;': '\U0001d4b4',
|
|
|
'yscr;': '\U0001d4ce',
|
|
|
'YUcy;': '\u042e',
|
|
|
'yucy;': '\u044e',
|
|
|
'yuml': '\xff',
|
|
|
'Yuml;': '\u0178',
|
|
|
'yuml;': '\xff',
|
|
|
'Zacute;': '\u0179',
|
|
|
'zacute;': '\u017a',
|
|
|
'Zcaron;': '\u017d',
|
|
|
'zcaron;': '\u017e',
|
|
|
'Zcy;': '\u0417',
|
|
|
'zcy;': '\u0437',
|
|
|
'Zdot;': '\u017b',
|
|
|
'zdot;': '\u017c',
|
|
|
'zeetrf;': '\u2128',
|
|
|
'ZeroWidthSpace;': '\u200b',
|
|
|
'Zeta;': '\u0396',
|
|
|
'zeta;': '\u03b6',
|
|
|
'Zfr;': '\u2128',
|
|
|
'zfr;': '\U0001d537',
|
|
|
'ZHcy;': '\u0416',
|
|
|
'zhcy;': '\u0436',
|
|
|
'zigrarr;': '\u21dd',
|
|
|
'Zopf;': '\u2124',
|
|
|
'zopf;': '\U0001d56b',
|
|
|
'Zscr;': '\U0001d4b5',
|
|
|
'zscr;': '\U0001d4cf',
|
|
|
'zwj;': '\u200d',
|
|
|
'zwnj;': '\u200c',
|
|
|
}
|
|
|
|
|
|
|
|
|
class EntitySubstitution(object):
    """The ability to substitute XML or HTML entities for certain characters."""

    # NOTE: this is deliberately a plain function with no `self`/`cls`.
    # It is called exactly once, below, while the class body is still
    # being executed, to compute three class-level lookup tables.
    def _populate_class_variables():
        """Initialize variables used by this class to manage the plethora of
        HTML5 named entities.

        This function returns a 3-tuple containing two dictionaries
        and a regular expression:

        unicode_to_name - A mapping of Unicode strings like "⦨" to
        entity names like "angmsdaa". When a single Unicode string has
        multiple entity names, we try to choose the most commonly-used
        name.

        name_to_unicode: A mapping of entity names like "angmsdaa" to
        Unicode strings like "⦨".

        named_entity_re: A regular expression matching (almost) any
        Unicode string that corresponds to an HTML5 named entity.
        """
        unicode_to_name = {}
        name_to_unicode = {}

        short_entities = set()
        long_entities_by_first_character = defaultdict(set)

        for name_with_semicolon, character in sorted(html5.items()):
            # "It is intentional, for legacy compatibility, that many
            # code points have multiple character reference names. For
            # example, some appear both with and without the trailing
            # semicolon, or with different capitalizations."
            # - https://html.spec.whatwg.org/multipage/named-characters.html#named-character-references
            #
            # The parsers are in charge of handling (or not) character
            # references with no trailing semicolon, so we remove the
            # semicolon whenever it appears.
            if name_with_semicolon.endswith(';'):
                name = name_with_semicolon[:-1]
            else:
                name = name_with_semicolon

            # When parsing HTML, we want to recognize any known named
            # entity and convert it to a sequence of Unicode
            # characters.
            if name not in name_to_unicode:
                name_to_unicode[name] = character

            # When _generating_ HTML, we want to recognize special
            # character sequences that _could_ be converted to named
            # entities. Because the items are visited in sorted order,
            # the last name seen for a given character wins here.
            unicode_to_name[character] = name

            # We also need to build a regular expression that lets us
            # _find_ those characters in output strings so we can
            # replace them.
            #
            # This is tricky, for two reasons.

            if (len(character) == 1 and ord(character) < 128
                    and character not in '<>&'):
                # First, it would be annoying to turn single ASCII
                # characters like | into named entities like
                # &verbar;. The exceptions are <>&, which we _must_
                # turn into named entities to produce valid HTML.
                continue

            if len(character) > 1 and all(ord(x) < 128 for x in character):
                # We also do not want to turn _combinations_ of ASCII
                # characters like 'fj' into named entities like '&fjlig;',
                # though that's more debatable.
                continue

            # Second, some named entities have a Unicode value that's
            # a subset of the Unicode value for some _other_ named
            # entity. As an example, '\u2267' is &GreaterFullEqual;,
            # but '\u2267\u0338' is &NotGreaterFullEqual;. Our regular
            # expression needs to match the first two characters of
            # "\u2267\u0338foo", but only the first character of
            # "\u2267foo".
            #
            # In this step, we build two sets of characters that
            # _eventually_ need to go into the regular expression. But
            # we won't know exactly what the regular expression needs
            # to look like until we've gone through the entire list of
            # named entities.
            if len(character) == 1:
                short_entities.add(character)
            else:
                long_entities_by_first_character[character[0]].add(character)

        # Now that we've been through the entire list of entities, we
        # can create a regular expression that matches any of them.
        particles = set()
        for short in short_entities:
            long_versions = long_entities_by_first_character[short]
            if not long_versions:
                particles.add(short)
            else:
                ignore = "".join([x[1] for x in long_versions])
                # This finds, e.g. \u2267 but only if it is _not_
                # followed by \u0338.
                particles.add("%s(?![%s])" % (short, ignore))

        for long_entities in long_entities_by_first_character.values():
            particles.update(long_entities)

        re_definition = "(%s)" % "|".join(particles)

        # If an entity shows up in both html5 and codepoint2name, it's
        # likely that HTML5 gives it several different names, such as
        # 'rsquo' and 'rsquor'. When converting Unicode characters to
        # named entities, the codepoint2name name should take
        # precedence where possible, since that's the more easily
        # recognizable one.
        for codepoint, name in codepoint2name.items():
            unicode_to_name[chr(codepoint)] = name

        return unicode_to_name, name_to_unicode, re.compile(re_definition)

    (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
     CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()

    # The five characters that have predefined entities in XML.
    CHARACTER_TO_XML_ENTITY = {
        "'": "apos",
        '"': "quot",
        "&": "amp",
        "<": "lt",
        ">": "gt",
    }

    # Matches an angle bracket, or an ampersand that does NOT begin
    # something that looks like a numeric or named entity reference.
    BARE_AMPERSAND_OR_BRACKET = re.compile(r"([<>]|"
                                           r"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
                                           r")")

    # Matches any angle bracket or ampersand, unconditionally.
    AMPERSAND_OR_BRACKET = re.compile(r"([<>&])")

    @classmethod
    def _substitute_html_entity(cls, matchobj):
        """Used with a regular expression to substitute the
        appropriate HTML entity for a special character string.

        :param matchobj: A match produced by CHARACTER_TO_HTML_ENTITY_RE.
        :return: The entity reference, e.g. "&eacute;".
        """
        entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
        return "&%s;" % entity

    @classmethod
    def _substitute_xml_entity(cls, matchobj):
        """Used with a regular expression to substitute the
        appropriate XML entity for a special character string.

        :param matchobj: A match whose text is a key of
            CHARACTER_TO_XML_ENTITY.
        :return: The entity reference, e.g. "&amp;".
        """
        entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
        return "&%s;" % entity

    @classmethod
    def quoted_attribute_value(cls, value):
        """Make a value into a quoted XML attribute, possibly escaping it.

         Most strings will be quoted using double quotes.

          Bob's Bar -> "Bob's Bar"

         If a string contains double quotes, it will be quoted using
         single quotes.

          Welcome to "my bar" -> 'Welcome to "my bar"'

         If a string contains both single and double quotes, the
         double quotes will be escaped, and the string will be quoted
         using double quotes.

          Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's bar&quot;"

        :param value: The attribute value, as a string.
        :return: The value wrapped in quote characters.
        """
        quote_with = '"'
        if '"' in value:
            if "'" in value:
                # The string contains both single and double
                # quotes.  Turn the double quotes into
                # entities. We quote the double quotes rather than
                # the single quotes because the entity name is
                # "&quot;" whether this is HTML or XML.  If we
                # quoted the single quotes, we'd have to decide
                # between &apos; and &squot;.
                replace_with = "&quot;"
                value = value.replace('"', replace_with)
            else:
                # There are double quotes but no single quotes.
                # We can use single quotes to quote the attribute.
                quote_with = "'"
        return quote_with + value + quote_with

    @classmethod
    def substitute_xml(cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign
          will become &lt;, the greater-than sign will become &gt;,
          and any ampersands will become &amp;. If you want ampersands
          that appear to be part of an entity definition to be left
          alone, use substitute_xml_containing_entities() instead.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.

        :return: The substituted (and possibly quoted) string.
        """
        # Escape angle brackets and ampersands.
        value = cls.AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_xml_containing_entities(
            cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign will
          become &lt;, the greater-than sign will become &gt;, and any
          ampersands that are not part of an entity definition will
          become &amp;.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.

        :return: The substituted (and possibly quoted) string.
        """
        # Escape angle brackets, and ampersands that aren't part of
        # entities.
        value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_html(cls, s):
        """Replace certain Unicode characters with named HTML entities.

        This differs from data.encode(encoding, 'xmlcharrefreplace')
        in that the goal is to make the result more readable (to those
        with ASCII displays) rather than to recover from
        errors. There's absolutely nothing wrong with a UTF-8 string
        containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
        character with "&eacute;" will make it more readable to some
        people.

        :param s: A Unicode string.
        :return: The string with matching characters replaced by
            named entity references.
        """
        return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
            cls._substitute_html_entity, s)
|
|
|
|
|
|
|
|
|
class EncodingDetector:
    """Suggests a number of possible encodings for a bytestring.

    Order of precedence:

    1. Encodings you specifically tell EncodingDetector to try first
    (the known_definite_encodings argument to the constructor).

    2. An encoding determined by sniffing the document's byte-order mark.

    3. Encodings you specifically tell EncodingDetector to try if
    byte-order mark sniffing fails (the user_encodings argument to the
    constructor).

    4. An encoding declared within the bytestring itself, either in an
    XML declaration (if the bytestring is to be interpreted as an XML
    document), or in a <meta> tag (if the bytestring is to be
    interpreted as an HTML document.)

    5. An encoding detected through textual analysis by chardet,
    cchardet, or a similar external library.

    6. UTF-8.

    7. Windows-1252.

    """
    def __init__(self, markup, known_definite_encodings=None,
                 is_html=False, exclude_encodings=None,
                 user_encodings=None, override_encodings=None):
        """Constructor.

        :param markup: Some markup in an unknown encoding.

        :param known_definite_encodings: When determining the encoding
            of `markup`, these encodings will be tried first, in
            order. In HTML terms, this corresponds to the "known
            definite encoding" step defined here:
            https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding

        :param user_encodings: These encodings will be tried after the
            `known_definite_encodings` have been tried and failed, and
            after an attempt to sniff the encoding by looking at a
            byte order mark has failed. In HTML terms, this
            corresponds to the step "user has explicitly instructed
            the user agent to override the document's character
            encoding", defined here:
            https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding

        :param override_encodings: A deprecated alias for
            known_definite_encodings. Any encodings here will be tried
            immediately after the encodings in
            known_definite_encodings.

        :param is_html: If True, this markup is considered to be
            HTML. Otherwise it's assumed to be XML.

        :param exclude_encodings: These encodings will not be tried,
            even if they otherwise would be.

        """
        # Copy the caller's list so appending override_encodings
        # below never mutates the argument.
        self.known_definite_encodings = list(known_definite_encodings or [])
        if override_encodings:
            self.known_definite_encodings += override_encodings
        self.user_encodings = user_encodings or []
        exclude_encodings = exclude_encodings or []
        # Stored lowercased; _usable() lowercases candidates to match.
        self.exclude_encodings = {x.lower() for x in exclude_encodings}
        self.chardet_encoding = None
        self.is_html = is_html
        self.declared_encoding = None

        # First order of business: strip a byte-order mark.
        self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)

    def _usable(self, encoding, tried):
        """Should we even bother to try this encoding?

        :param encoding: Name of an encoding.
        :param tried: Encodings that have already been tried. This will be modified
            as a side effect.
        :return: True if the encoding is not None, not excluded, and
            has not been tried before; False otherwise.
        """
        if encoding is not None:
            encoding = encoding.lower()
            if encoding in self.exclude_encodings:
                return False
            if encoding not in tried:
                tried.add(encoding)
                return True
        return False

    @property
    def encodings(self):
        """Yield a number of encodings that might work for this markup.

        :yield: A sequence of strings.
        """
        tried = set()

        # First, try the known definite encodings
        for e in self.known_definite_encodings:
            if self._usable(e, tried):
                yield e

        # Did the document originally start with a byte-order mark
        # that indicated its encoding?
        if self._usable(self.sniffed_encoding, tried):
            yield self.sniffed_encoding

        # Sniffing the byte-order mark did nothing; try the user
        # encodings.
        for e in self.user_encodings:
            if self._usable(e, tried):
                yield e

        # Look within the document for an XML or HTML encoding
        # declaration. The result is cached on the instance.
        if self.declared_encoding is None:
            self.declared_encoding = self.find_declared_encoding(
                self.markup, self.is_html)
        if self._usable(self.declared_encoding, tried):
            yield self.declared_encoding

        # Use third-party character set detection to guess at the
        # encoding. The result is cached on the instance.
        if self.chardet_encoding is None:
            self.chardet_encoding = chardet_dammit(self.markup)
        if self._usable(self.chardet_encoding, tried):
            yield self.chardet_encoding

        # As a last-ditch effort, try utf-8 and windows-1252.
        for e in ('utf-8', 'windows-1252'):
            if self._usable(e, tried):
                yield e

    @classmethod
    def strip_byte_order_mark(cls, data):
        """If a byte-order mark is present, strip it and return the encoding it implies.

        :param data: Some markup.
        :return: A 2-tuple (modified data, implied encoding)
        """
        encoding = None
        if isinstance(data, str):
            # Unicode data cannot have a byte-order mark.
            return data, encoding
        # The UTF-16 checks must reject a BOM followed by two NUL
        # bytes, because FF FE 00 00 is the UTF-32LE byte-order mark.
        # The comparison has to be bytes-to-bytes: comparing a bytes
        # slice against a str is always unequal, which would make the
        # UTF-32LE branch below unreachable.
        if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
               and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
                 and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == b'\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == b'\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == b'\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        return data, encoding

    @classmethod
    def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False):
        """Given a document, tries to find its declared encoding.

        An XML encoding is declared at the beginning of the document.

        An HTML encoding is declared in a <meta> tag, hopefully near the
        beginning of the document.

        :param markup: Some markup.
        :param is_html: If True, this markup is considered to be HTML. Otherwise
            it's assumed to be XML.
        :param search_entire_document: Since an encoding is supposed to be declared near the beginning
            of the document, most of the time it's only necessary to search a few kilobytes of data.
            Set this to True to force this method to search the entire document.
        :return: The declared encoding, lowercased, or None if no
            declaration was found.
        """
        if search_entire_document:
            xml_endpos = html_endpos = len(markup)
        else:
            xml_endpos = 1024
            html_endpos = max(2048, int(len(markup) * 0.05))

        # Pick the pre-built patterns that match the markup's type
        # (bytes patterns for bytes, str patterns for str).
        if isinstance(markup, bytes):
            res = encoding_res[bytes]
        else:
            res = encoding_res[str]

        xml_re = res['xml']
        html_re = res['html']
        declared_encoding = None
        declared_encoding_match = xml_re.search(markup, endpos=xml_endpos)
        if not declared_encoding_match and is_html:
            declared_encoding_match = html_re.search(markup, endpos=html_endpos)
        if declared_encoding_match is not None:
            declared_encoding = declared_encoding_match.groups()[0]
        if declared_encoding:
            if isinstance(declared_encoding, bytes):
                declared_encoding = declared_encoding.decode('ascii', 'replace')
            return declared_encoding.lower()
        return None
|
|
|
|
|
|
class UnicodeDammit:
|
|
|
"""A class for detecting the encoding of a *ML document and
|
|
|
converting it to a Unicode string. If the source encoding is
|
|
|
windows-1252, can replace MS smart quotes with their HTML or XML
|
|
|
equivalents."""
|
|
|
|
|
|
# "charset" values commonly seen in HTML meta tags, mapped to the
# Python codec names they stand for. Only values that are missing
# from Python's own alias table, and that the heuristics in
# find_codec cannot work out, need to be listed here.
CHARSET_ALIASES = {
    "macintosh": "mac-roman",
    "x-sjis": "shift-jis",
}

# Codec names for which _convert_from applies smart-quote
# substitution (when smart_quotes_to is set) before decoding.
ENCODINGS_WITH_SMART_QUOTES = [
    "windows-1252",
    "iso-8859-1",
    "iso-8859-2",
]
|
|
|
|
|
|
def __init__(self, markup, known_definite_encodings=None,
             smart_quotes_to=None, is_html=False, exclude_encodings=None,
             user_encodings=None, override_encodings=None
):
    """Constructor.

    :param markup: A bytestring representing markup in an unknown encoding.

    :param known_definite_encodings: When determining the encoding
        of `markup`, these encodings will be tried first, in
        order. In HTML terms, this corresponds to the "known
        definite encoding" step defined here:
        https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding

    :param user_encodings: These encodings will be tried after the
        `known_definite_encodings` have been tried and failed, and
        after an attempt to sniff the encoding by looking at a
        byte order mark has failed. In HTML terms, this
        corresponds to the step "user has explicitly instructed
        the user agent to override the document's character
        encoding", defined here:
        https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding

    :param override_encodings: A deprecated alias for
        known_definite_encodings. Any encodings here will be tried
        immediately after the encodings in
        known_definite_encodings.

    :param smart_quotes_to: By default, Microsoft smart quotes will, like all other characters, be converted
       to Unicode characters. Setting this to 'ascii' will convert them to ASCII quotes instead.
       Setting it to 'xml' will convert them to XML entity references, and setting it to 'html'
       will convert them to HTML entity references.
    :param is_html: If True, this markup is considered to be HTML. Otherwise
        it's assumed to be XML.
    :param exclude_encodings: These encodings will not be considered, even
        if the sniffing code thinks they might make sense.

    """
    self.smart_quotes_to = smart_quotes_to
    self.tried_encodings = []
    self.contains_replacement_characters = False
    self.is_html = is_html
    self.log = logging.getLogger(__name__)
    # The encoding lists default to None rather than to shared
    # mutable lists; EncodingDetector treats None as "no encodings".
    self.detector = EncodingDetector(
        markup, known_definite_encodings, is_html, exclude_encodings,
        user_encodings, override_encodings
    )

    # Short-circuit if the data is in Unicode to begin with.
    if isinstance(markup, str):
        self.markup = markup
        self.unicode_markup = str(markup)
        self.original_encoding = None
        return

    # The encoding detector may have stripped a byte-order mark.
    # Use the stripped markup from this point on.
    self.markup = self.detector.markup

    u = None
    for encoding in self.detector.encodings:
        u = self._convert_from(encoding)
        if u is not None:
            break

    # Compare against None, not truthiness: an empty document (or
    # one holding only a byte-order mark) decodes successfully to
    # the empty string, and that must not be mistaken for failure.
    if u is None:
        # None of the encodings worked. As an absolute last resort,
        # try them again with character replacement.

        for encoding in self.detector.encodings:
            if encoding != "ascii":
                u = self._convert_from(encoding, "replace")
            if u is not None:
                self.log.warning(
                        "Some characters could not be decoded, and were "
                        "replaced with REPLACEMENT CHARACTER."
                )
                self.contains_replacement_characters = True
                break

    # If none of that worked, we could at this point force it to
    # ASCII, but that would destroy so much data that I think
    # giving up is better.
    self.unicode_markup = u
    if not u:
        # Nothing was decoded (or the document is empty), so no
        # encoding is reported.
        self.original_encoding = None
|
|
|
|
|
|
def _sub_ms_char(self, match):
|
|
|
"""Changes a MS smart quote character to an XML or HTML
|
|
|
entity, or an ASCII character."""
|
|
|
orig = match.group(1)
|
|
|
if self.smart_quotes_to == 'ascii':
|
|
|
sub = self.MS_CHARS_TO_ASCII.get(orig).encode()
|
|
|
else:
|
|
|
sub = self.MS_CHARS.get(orig)
|
|
|
if type(sub) == tuple:
|
|
|
if self.smart_quotes_to == 'xml':
|
|
|
sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
|
|
|
else:
|
|
|
sub = '&'.encode() + sub[0].encode() + ';'.encode()
|
|
|
else:
|
|
|
sub = sub.encode()
|
|
|
return sub
|
|
|
|
|
|
def _convert_from(self, proposed, errors="strict"):
|
|
|
"""Attempt to convert the markup to the proposed encoding.
|
|
|
|
|
|
:param proposed: The name of a character encoding.
|
|
|
"""
|
|
|
proposed = self.find_codec(proposed)
|
|
|
if not proposed or (proposed, errors) in self.tried_encodings:
|
|
|
return None
|
|
|
self.tried_encodings.append((proposed, errors))
|
|
|
markup = self.markup
|
|
|
# Convert smart quotes to HTML if coming from an encoding
|
|
|
# that might have them.
|
|
|
if (self.smart_quotes_to is not None
|
|
|
and proposed in self.ENCODINGS_WITH_SMART_QUOTES):
|
|
|
smart_quotes_re = b"([\x80-\x9f])"
|
|
|
smart_quotes_compiled = re.compile(smart_quotes_re)
|
|
|
markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)
|
|
|
|
|
|
try:
|
|
|
#print("Trying to convert document to %s (errors=%s)" % (
|
|
|
# proposed, errors))
|
|
|
u = self._to_unicode(markup, proposed, errors)
|
|
|
self.markup = u
|
|
|
self.original_encoding = proposed
|
|
|
except Exception as e:
|
|
|
#print("That didn't work!")
|
|
|
#print(e)
|
|
|
return None
|
|
|
#print("Correct encoding: %s" % proposed)
|
|
|
return self.markup
|
|
|
|
|
|
def _to_unicode(self, data, encoding, errors="strict"):
|
|
|
"""Given a string and its encoding, decodes the string into Unicode.
|
|
|
|
|
|
:param encoding: The name of an encoding.
|
|
|
"""
|
|
|
return str(data, encoding, errors)
|
|
|
|
|
|
@property
|
|
|
def declared_html_encoding(self):
|
|
|
"""If the markup is an HTML document, returns the encoding declared _within_
|
|
|
the document.
|
|
|
"""
|
|
|
if not self.is_html:
|
|
|
return None
|
|
|
return self.detector.declared_encoding
|
|
|
|
|
|
def find_codec(self, charset):
|
|
|
"""Convert the name of a character set to a codec name.
|
|
|
|
|
|
:param charset: The name of a character set.
|
|
|
:return: The name of a codec.
|
|
|
"""
|
|
|
value = (self._codec(self.CHARSET_ALIASES.get(charset, charset))
|
|
|
or (charset and self._codec(charset.replace("-", "")))
|
|
|
or (charset and self._codec(charset.replace("-", "_")))
|
|
|
or (charset and charset.lower())
|
|
|
or charset
|
|
|
)
|
|
|
if value:
|
|
|
return value.lower()
|
|
|
return None
|
|
|
|
|
|
def _codec(self, charset):
|
|
|
if not charset:
|
|
|
return charset
|
|
|
codec = None
|
|
|
try:
|
|
|
codecs.lookup(charset)
|
|
|
codec = charset
|
|
|
except (LookupError, ValueError):
|
|
|
pass
|
|
|
return codec
|
|
|
|
|
|
|
|
|
# A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
#
# Each key is a single byte. A tuple value is (HTML entity name,
# hexadecimal code point): _sub_ms_char emits "&name;" for HTML and
# "&#xHEX;" for XML. A plain string value is substituted as-is.
MS_CHARS = {
    b'\x80': ('euro', '20AC'),
    b'\x81': ' ',
    b'\x82': ('sbquo', '201A'),
    b'\x83': ('fnof', '192'),
    b'\x84': ('bdquo', '201E'),
    b'\x85': ('hellip', '2026'),
    b'\x86': ('dagger', '2020'),
    b'\x87': ('Dagger', '2021'),
    b'\x88': ('circ', '2C6'),
    b'\x89': ('permil', '2030'),
    b'\x8A': ('Scaron', '160'),
    b'\x8B': ('lsaquo', '2039'),
    b'\x8C': ('OElig', '152'),
    b'\x8D': '?',
    b'\x8E': ('#x17D', '17D'),
    b'\x8F': '?',
    b'\x90': '?',
    b'\x91': ('lsquo', '2018'),
    b'\x92': ('rsquo', '2019'),
    b'\x93': ('ldquo', '201C'),
    b'\x94': ('rdquo', '201D'),
    b'\x95': ('bull', '2022'),
    b'\x96': ('ndash', '2013'),
    b'\x97': ('mdash', '2014'),
    b'\x98': ('tilde', '2DC'),
    b'\x99': ('trade', '2122'),
    b'\x9a': ('scaron', '161'),
    b'\x9b': ('rsaquo', '203A'),
    b'\x9c': ('oelig', '153'),
    b'\x9d': '?',
    b'\x9e': ('#x17E', '17E'),
    # U+0178 LATIN CAPITAL LETTER Y WITH DIAERESIS. The code point
    # must not be empty, or XML mode would emit the invalid "&#x;".
    b'\x9f': ('Yuml', '178'),
}
|
|
|
|
|
|
# A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
# horrors like stripping diacritical marks to turn á into a, but also
# contains non-horrors like turning “ into ".
#
# Every value is a plain string, because _sub_ms_char calls
# .encode() on whatever it finds here.
MS_CHARS_TO_ASCII = {
    b'\x80': 'EUR',
    b'\x81': ' ',
    b'\x82': ',',
    b'\x83': 'f',
    b'\x84': ',,',
    b'\x85': '...',
    b'\x86': '+',
    b'\x87': '++',
    b'\x88': '^',
    b'\x89': '%',
    b'\x8a': 'S',
    b'\x8b': '<',
    b'\x8c': 'OE',
    b'\x8d': '?',
    b'\x8e': 'Z',
    b'\x8f': '?',
    b'\x90': '?',
    b'\x91': "'",
    b'\x92': "'",
    b'\x93': '"',
    b'\x94': '"',
    b'\x95': '*',
    b'\x96': '-',
    b'\x97': '--',
    b'\x98': '~',
    b'\x99': '(TM)',
    b'\x9a': 's',
    b'\x9b': '>',
    b'\x9c': 'oe',
    b'\x9d': '?',
    b'\x9e': 'z',
    b'\x9f': 'Y',
    b'\xa0': ' ',
    b'\xa1': '!',
    b'\xa2': 'c',
    b'\xa3': 'GBP',
    # This approximation is especially parochial--this is the
    # generic currency symbol.
    b'\xa4': '$',
    b'\xa5': 'YEN',
    b'\xa6': '|',
    b'\xa7': 'S',
    b'\xa8': '..',
    b'\xa9': '',
    b'\xaa': '(th)',
    b'\xab': '<<',
    b'\xac': '!',
    b'\xad': ' ',
    b'\xae': '(R)',
    b'\xaf': '-',
    b'\xb0': 'o',
    b'\xb1': '+-',
    b'\xb2': '2',
    b'\xb3': '3',
    # ACUTE ACCENT; approximated by an apostrophe. This used to be
    # a stray ("'", 'acute') tuple, the only non-string value here.
    b'\xb4': "'",
    b'\xb5': 'u',
    b'\xb6': 'P',
    b'\xb7': '*',
    b'\xb8': ',',
    b'\xb9': '1',
    b'\xba': '(th)',
    b'\xbb': '>>',
    b'\xbc': '1/4',
    b'\xbd': '1/2',
    b'\xbe': '3/4',
    b'\xbf': '?',
    b'\xc0': 'A',
    b'\xc1': 'A',
    b'\xc2': 'A',
    b'\xc3': 'A',
    b'\xc4': 'A',
    b'\xc5': 'A',
    b'\xc6': 'AE',
    b'\xc7': 'C',
    b'\xc8': 'E',
    b'\xc9': 'E',
    b'\xca': 'E',
    b'\xcb': 'E',
    b'\xcc': 'I',
    b'\xcd': 'I',
    b'\xce': 'I',
    b'\xcf': 'I',
    b'\xd0': 'D',
    b'\xd1': 'N',
    b'\xd2': 'O',
    b'\xd3': 'O',
    b'\xd4': 'O',
    b'\xd5': 'O',
    b'\xd6': 'O',
    b'\xd7': '*',
    b'\xd8': 'O',
    b'\xd9': 'U',
    b'\xda': 'U',
    b'\xdb': 'U',
    b'\xdc': 'U',
    b'\xdd': 'Y',
    b'\xde': 'b',
    b'\xdf': 'B',
    b'\xe0': 'a',
    b'\xe1': 'a',
    b'\xe2': 'a',
    b'\xe3': 'a',
    b'\xe4': 'a',
    b'\xe5': 'a',
    b'\xe6': 'ae',
    b'\xe7': 'c',
    b'\xe8': 'e',
    b'\xe9': 'e',
    b'\xea': 'e',
    b'\xeb': 'e',
    b'\xec': 'i',
    b'\xed': 'i',
    b'\xee': 'i',
    b'\xef': 'i',
    b'\xf0': 'o',
    b'\xf1': 'n',
    b'\xf2': 'o',
    b'\xf3': 'o',
    b'\xf4': 'o',
    b'\xf5': 'o',
    b'\xf6': 'o',
    b'\xf7': '/',
    b'\xf8': 'o',
    b'\xf9': 'u',
    b'\xfa': 'u',
    b'\xfb': 'u',
    b'\xfc': 'u',
    b'\xfd': 'y',
    b'\xfe': 'b',
    b'\xff': 'y',
}
|
|
|
|
|
|
# A map used when removing rogue Windows-1252/ISO-8859-1
# characters in otherwise UTF-8 documents.
#
# Keys are Windows-1252 byte values; values are the UTF-8 encoding
# of the character that byte stands for.
#
# Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
# Windows-1252.
WINDOWS_1252_TO_UTF8 = {
    0x80: b'\xe2\x82\xac',  # €
    0x82: b'\xe2\x80\x9a',  # ‚
    0x83: b'\xc6\x92',      # ƒ
    0x84: b'\xe2\x80\x9e',  # „
    0x85: b'\xe2\x80\xa6',  # …
    0x86: b'\xe2\x80\xa0',  # †
    0x87: b'\xe2\x80\xa1',  # ‡
    0x88: b'\xcb\x86',      # ˆ
    0x89: b'\xe2\x80\xb0',  # ‰
    0x8a: b'\xc5\xa0',      # Š
    0x8b: b'\xe2\x80\xb9',  # ‹
    0x8c: b'\xc5\x92',      # Œ
    0x8e: b'\xc5\xbd',      # Ž
    0x91: b'\xe2\x80\x98',  # ‘
    0x92: b'\xe2\x80\x99',  # ’
    0x93: b'\xe2\x80\x9c',  # “
    0x94: b'\xe2\x80\x9d',  # ”
    0x95: b'\xe2\x80\xa2',  # •
    0x96: b'\xe2\x80\x93',  # –
    0x97: b'\xe2\x80\x94',  # —
    0x98: b'\xcb\x9c',      # ˜
    0x99: b'\xe2\x84\xa2',  # ™
    0x9a: b'\xc5\xa1',      # š
    0x9b: b'\xe2\x80\xba',  # ›
    0x9c: b'\xc5\x93',      # œ
    0x9e: b'\xc5\xbe',      # ž
    0x9f: b'\xc5\xb8',      # Ÿ
    0xa0: b'\xc2\xa0',      # (no-break space)
    0xa1: b'\xc2\xa1',      # ¡
    0xa2: b'\xc2\xa2',      # ¢
    0xa3: b'\xc2\xa3',      # £
    0xa4: b'\xc2\xa4',      # ¤
    0xa5: b'\xc2\xa5',      # ¥
    0xa6: b'\xc2\xa6',      # ¦
    0xa7: b'\xc2\xa7',      # §
    0xa8: b'\xc2\xa8',      # ¨
    0xa9: b'\xc2\xa9',      # ©
    0xaa: b'\xc2\xaa',      # ª
    0xab: b'\xc2\xab',      # «
    0xac: b'\xc2\xac',      # ¬
    0xad: b'\xc2\xad',      # (soft hyphen)
    0xae: b'\xc2\xae',      # ®
    0xaf: b'\xc2\xaf',      # ¯
    0xb0: b'\xc2\xb0',      # °
    0xb1: b'\xc2\xb1',      # ±
    0xb2: b'\xc2\xb2',      # ²
    0xb3: b'\xc2\xb3',      # ³
    0xb4: b'\xc2\xb4',      # ´
    0xb5: b'\xc2\xb5',      # µ
    0xb6: b'\xc2\xb6',      # ¶
    0xb7: b'\xc2\xb7',      # ·
    0xb8: b'\xc2\xb8',      # ¸
    0xb9: b'\xc2\xb9',      # ¹
    0xba: b'\xc2\xba',      # º
    0xbb: b'\xc2\xbb',      # »
    0xbc: b'\xc2\xbc',      # ¼
    0xbd: b'\xc2\xbd',      # ½
    0xbe: b'\xc2\xbe',      # ¾
    0xbf: b'\xc2\xbf',      # ¿
    0xc0: b'\xc3\x80',      # À
    0xc1: b'\xc3\x81',      # Á
    0xc2: b'\xc3\x82',      # Â
    0xc3: b'\xc3\x83',      # Ã
    0xc4: b'\xc3\x84',      # Ä
    0xc5: b'\xc3\x85',      # Å
    0xc6: b'\xc3\x86',      # Æ
    0xc7: b'\xc3\x87',      # Ç
    0xc8: b'\xc3\x88',      # È
    0xc9: b'\xc3\x89',      # É
    0xca: b'\xc3\x8a',      # Ê
    0xcb: b'\xc3\x8b',      # Ë
    0xcc: b'\xc3\x8c',      # Ì
    0xcd: b'\xc3\x8d',      # Í
    0xce: b'\xc3\x8e',      # Î
    0xcf: b'\xc3\x8f',      # Ï
    0xd0: b'\xc3\x90',      # Ð
    0xd1: b'\xc3\x91',      # Ñ
    0xd2: b'\xc3\x92',      # Ò
    0xd3: b'\xc3\x93',      # Ó
    0xd4: b'\xc3\x94',      # Ô
    0xd5: b'\xc3\x95',      # Õ
    0xd6: b'\xc3\x96',      # Ö
    0xd7: b'\xc3\x97',      # ×
    0xd8: b'\xc3\x98',      # Ø
    0xd9: b'\xc3\x99',      # Ù
    0xda: b'\xc3\x9a',      # Ú
    0xdb: b'\xc3\x9b',      # Û
    0xdc: b'\xc3\x9c',      # Ü
    0xdd: b'\xc3\x9d',      # Ý
    0xde: b'\xc3\x9e',      # Þ
    0xdf: b'\xc3\x9f',      # ß
    0xe0: b'\xc3\xa0',      # à
    0xe1: b'\xc3\xa1',      # á
    0xe2: b'\xc3\xa2',      # â
    0xe3: b'\xc3\xa3',      # ã
    0xe4: b'\xc3\xa4',      # ä
    0xe5: b'\xc3\xa5',      # å
    0xe6: b'\xc3\xa6',      # æ
    0xe7: b'\xc3\xa7',      # ç
    0xe8: b'\xc3\xa8',      # è
    0xe9: b'\xc3\xa9',      # é
    0xea: b'\xc3\xaa',      # ê
    0xeb: b'\xc3\xab',      # ë
    0xec: b'\xc3\xac',      # ì
    0xed: b'\xc3\xad',      # í
    0xee: b'\xc3\xae',      # î
    0xef: b'\xc3\xaf',      # ï
    0xf0: b'\xc3\xb0',      # ð
    0xf1: b'\xc3\xb1',      # ñ
    0xf2: b'\xc3\xb2',      # ò
    0xf3: b'\xc3\xb3',      # ó
    0xf4: b'\xc3\xb4',      # ô
    0xf5: b'\xc3\xb5',      # õ
    0xf6: b'\xc3\xb6',      # ö
    0xf7: b'\xc3\xb7',      # ÷
    0xf8: b'\xc3\xb8',      # ø
    0xf9: b'\xc3\xb9',      # ù
    0xfa: b'\xc3\xba',      # ú
    0xfb: b'\xc3\xbb',      # û
    0xfc: b'\xc3\xbc',      # ü
    0xfd: b'\xc3\xbd',      # ý
    0xfe: b'\xc3\xbe',      # þ
}
|
|
|
|
|
|
# Each entry is (lowest lead byte, highest lead byte, total length
# in bytes) for one class of UTF-8 multibyte sequence.
MULTIBYTE_MARKERS_AND_SIZES = [
    (0xc2, 0xdf, 2),  # C2-DF starts a 2-byte character
    (0xe0, 0xef, 3),  # E0-EF starts a 3-byte character
    (0xf0, 0xf4, 4),  # F0-F4 starts a 4-byte character
]

# The overall range of bytes that can start a multibyte character.
FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]
|
|
|
|
|
|
@classmethod
def detwingle(cls, in_bytes, main_encoding="utf8",
              embedded_encoding="windows-1252"):
    """Fix characters from one encoding embedded in some other encoding.

    Currently the only situation supported is Windows-1252 (or its
    subset ISO-8859-1), embedded in UTF-8.

    A byte in the UTF-8 lead-byte range is only treated as the start
    of a multibyte character when every following byte of that
    character that is present in `in_bytes` is a continuation byte
    (0x80-0xBF). Otherwise it is taken to be a lone Windows-1252
    character and converted, rather than being skipped together with
    the bytes after it. A sequence cut short by the end of `in_bytes`
    is left untouched.

    :param in_bytes: A bytestring that you suspect contains
        characters from multiple encodings. Note that this _must_
        be a bytestring. If you've already converted the document
        to Unicode, you're too late.
    :param main_encoding: The primary encoding of `in_bytes`. Only
        names for UTF-8 are accepted.
    :param embedded_encoding: The encoding that was used to embed
        characters in the main document. Only names for Windows-1252
        and ISO-8859-1 are accepted.
    :return: A bytestring in which `embedded_encoding`
        characters have been converted to their `main_encoding`
        equivalents. If nothing needed converting, `in_bytes` itself
        is returned.
    :raises NotImplementedError: If either encoding is not one of
        the supported ones.
    """
    # ISO-8859-1 is accepted alongside Windows-1252, as the docstring
    # and the error message below have always promised.
    embedded = embedded_encoding.replace('_', '-').lower()
    if embedded not in ('windows-1252', 'cp1252', 'iso-8859-1',
                        'iso8859-1', 'latin-1', 'latin1'):
        raise NotImplementedError(
            "Windows-1252 and ISO-8859-1 are the only currently supported "
            "embedded encodings.")

    if main_encoding.replace('_', '-').lower() not in ('utf8', 'utf-8'):
        raise NotImplementedError(
            "UTF-8 is the only currently supported main encoding.")

    def byte_at(index):
        # Indexing a bytestring yields an int; fall back to ord() for
        # sequences whose items are one-character strings.
        item = in_bytes[index]
        return item if isinstance(item, int) else ord(item)

    total = len(in_bytes)
    byte_chunks = []
    chunk_start = 0
    pos = 0
    while pos < total:
        byte = byte_at(pos)

        if cls.FIRST_MULTIBYTE_MARKER <= byte <= cls.LAST_MULTIBYTE_MARKER:
            # This looks like the start of a UTF-8 multibyte
            # character. Find out how long it claims to be.
            size = next(
                (width for start, end, width
                 in cls.MULTIBYTE_MARKERS_AND_SIZES
                 if start <= byte <= end),
                None)
            # Only skip it if the bytes that follow really are
            # continuation bytes. Bytes missing because the input
            # ends early are given the benefit of the doubt.
            if size is not None and all(
                    0x80 <= byte_at(i) <= 0xBF
                    for i in range(pos + 1, min(pos + size, total))):
                pos += size
                continue
            # Otherwise fall through and treat the byte as a
            # candidate Windows-1252 character.

        if byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
            # We found a Windows-1252 character!
            # Save the string up to this point as a chunk.
            byte_chunks.append(in_bytes[chunk_start:pos])

            # Now add the UTF-8 translation of the Windows-1252
            # character as a chunk of its own.
            byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
            pos += 1
            chunk_start = pos
        else:
            # Go on to the next character.
            pos += 1

    if chunk_start == 0:
        # Nothing was converted, so the string is unchanged.
        return in_bytes

    # Store the final chunk.
    byte_chunks.append(in_bytes[chunk_start:])
    return b''.join(byte_chunks)
|
|
|
|