using System;

namespace NLangDetect.Core.Extensions
{
    /// <summary>
    /// Lookup helpers that map UTF-16 code units and Unicode code points to the
    /// <see cref="UnicodeBlock"/> whose range contains them.
    /// </summary>
    /// <remarks>
    /// The block tables are taken from the JDK source:
    /// http://grepcode.com/file/repository.grepcode.com/java/root/jdk/openjdk/6-b14/java/lang/Character.java#Character.UnicodeBlock.0LATIN_EXTENDED_ADDITIONAL
    /// </remarks>
    public static class CharExtensions
    {
        private const int MIN_CODE_POINT = 0x000000;
        private const int MAX_CODE_POINT = 0x10ffff;

        // First code point of every range, in strictly ascending order (required by the
        // binary search below). Entry i pairs with _unicodeBlocks[i]; a range extends up
        // to, but not including, the next entry's start.
        private static readonly int[] _unicodeBlockStarts =
        {
            #region Unicode block starts

            0x0000, // Basic Latin
            0x0080, // Latin-1 Supplement
            0x0100, // Latin Extended-A
            0x0180, // Latin Extended-B
            0x0250, // IPA Extensions
            0x02B0, // Spacing Modifier Letters
            0x0300, // Combining Diacritical Marks
            0x0370, // Greek and Coptic
            0x0400, // Cyrillic
            0x0500, // Cyrillic Supplementary
            0x0530, // Armenian
            0x0590, // Hebrew
            0x0600, // Arabic
            0x0700, // Syriac
            0x0750, // unassigned
            0x0780, // Thaana
            0x07C0, // unassigned
            0x0900, // Devanagari
            0x0980, // Bengali
            0x0A00, // Gurmukhi
            0x0A80, // Gujarati
            0x0B00, // Oriya
            0x0B80, // Tamil
            0x0C00, // Telugu
            0x0C80, // Kannada
            0x0D00, // Malayalam
            0x0D80, // Sinhala
            0x0E00, // Thai
            0x0E80, // Lao
            0x0F00, // Tibetan
            0x1000, // Myanmar
            0x10A0, // Georgian
            0x1100, // Hangul Jamo
            0x1200, // Ethiopic
            0x1380, // unassigned
            0x13A0, // Cherokee
            0x1400, // Unified Canadian Aboriginal Syllabics
            0x1680, // Ogham
            0x16A0, // Runic
            0x1700, // Tagalog
            0x1720, // Hanunoo
            0x1740, // Buhid
            0x1760, // Tagbanwa
            0x1780, // Khmer
            0x1800, // Mongolian
            0x18B0, // unassigned
            0x1900, // Limbu
            0x1950, // Tai Le
            0x1980, // unassigned
            0x19E0, // Khmer Symbols
            0x1A00, // unassigned
            0x1D00, // Phonetic Extensions
            0x1D80, // unassigned
            0x1E00, // Latin Extended Additional
            0x1F00, // Greek Extended
            0x2000, // General Punctuation
            0x2070, // Superscripts and Subscripts
            0x20A0, // Currency Symbols
            0x20D0, // Combining Diacritical Marks for Symbols
            0x2100, // Letterlike Symbols
            0x2150, // Number Forms
            0x2190, // Arrows
            0x2200, // Mathematical Operators
            0x2300, // Miscellaneous Technical
            0x2400, // Control Pictures
            0x2440, // Optical Character Recognition
            0x2460, // Enclosed Alphanumerics
            0x2500, // Box Drawing
            0x2580, // Block Elements
            0x25A0, // Geometric Shapes
            0x2600, // Miscellaneous Symbols
            0x2700, // Dingbats
            0x27C0, // Miscellaneous Mathematical Symbols-A
            0x27F0, // Supplemental Arrows-A
            0x2800, // Braille Patterns
            0x2900, // Supplemental Arrows-B
            0x2980, // Miscellaneous Mathematical Symbols-B
            0x2A00, // Supplemental Mathematical Operators
            0x2B00, // Miscellaneous Symbols and Arrows
            0x2C00, // unassigned
            0x2E80, // CJK Radicals Supplement
            0x2F00, // Kangxi Radicals
            0x2FE0, // unassigned
            0x2FF0, // Ideographic Description Characters
            0x3000, // CJK Symbols and Punctuation
            0x3040, // Hiragana
            0x30A0, // Katakana
            0x3100, // Bopomofo
            0x3130, // Hangul Compatibility Jamo
            0x3190, // Kanbun
            0x31A0, // Bopomofo Extended
            0x31C0, // unassigned
            0x31F0, // Katakana Phonetic Extensions
            0x3200, // Enclosed CJK Letters and Months
            0x3300, // CJK Compatibility
            0x3400, // CJK Unified Ideographs Extension A
            0x4DC0, // Yijing Hexagram Symbols
            0x4E00, // CJK Unified Ideographs
            0xA000, // Yi Syllables
            0xA490, // Yi Radicals
            0xA4D0, // unassigned
            0xAC00, // Hangul Syllables
            0xD7B0, // unassigned
            0xD800, // High Surrogates
            0xDB80, // High Private Use Surrogates
            0xDC00, // Low Surrogates
            0xE000, // Private Use
            0xF900, // CJK Compatibility Ideographs
            0xFB00, // Alphabetic Presentation Forms
            0xFB50, // Arabic Presentation Forms-A
            0xFE00, // Variation Selectors
            0xFE10, // unassigned
            0xFE20, // Combining Half Marks
            0xFE30, // CJK Compatibility Forms
            0xFE50, // Small Form Variants
            0xFE70, // Arabic Presentation Forms-B
            0xFF00, // Halfwidth and Fullwidth Forms
            0xFFF0, // Specials
            0x10000, // Linear B Syllabary
            0x10080, // Linear B Ideograms
            0x10100, // Aegean Numbers
            0x10140, // unassigned
            0x10300, // Old Italic
            0x10330, // Gothic
            0x10350, // unassigned
            0x10380, // Ugaritic
            0x103A0, // unassigned
            0x10400, // Deseret
            0x10450, // Shavian
            0x10480, // Osmanya
            0x104B0, // unassigned
            0x10800, // Cypriot Syllabary
            0x10840, // unassigned
            0x1D000, // Byzantine Musical Symbols
            0x1D100, // Musical Symbols
            0x1D200, // unassigned
            0x1D300, // Tai Xuan Jing Symbols
            0x1D360, // unassigned
            0x1D400, // Mathematical Alphanumeric Symbols
            0x1D800, // unassigned
            0x20000, // CJK Unified Ideographs Extension B
            0x2A6E0, // unassigned
            0x2F800, // CJK Compatibility Ideographs Supplement
            0x2FA20, // unassigned
            0xE0000, // Tags
            0xE0080, // unassigned
            0xE0100, // Variation Selectors Supplement
            0xE01F0, // unassigned
            0xF0000, // Supplementary Private Use Area-A
            0x100000, // Supplementary Private Use Area-B

            #endregion
        };

        // Block for each range in _unicodeBlockStarts (same index); null marks a range
        // that the table lists as unassigned.
        private static readonly UnicodeBlock?[] _unicodeBlocks =
        {
            #region Unicode blocks

            UnicodeBlock.BasicLatin,
            UnicodeBlock.Latin1Supplement,
            UnicodeBlock.LatinExtendedA,
            UnicodeBlock.LatinExtendedB,
            UnicodeBlock.IpaExtensions,
            UnicodeBlock.SpacingModifierLetters,
            UnicodeBlock.CombiningDiacriticalMarks,
            UnicodeBlock.Greek,
            UnicodeBlock.Cyrillic,
            UnicodeBlock.CyrillicSupplementary,
            UnicodeBlock.Armenian,
            UnicodeBlock.Hebrew,
            UnicodeBlock.Arabic,
            UnicodeBlock.Syriac,
            null,
            UnicodeBlock.Thaana,
            null,
            UnicodeBlock.Devanagari,
            UnicodeBlock.Bengali,
            UnicodeBlock.Gurmukhi,
            UnicodeBlock.Gujarati,
            UnicodeBlock.Oriya,
            UnicodeBlock.Tamil,
            UnicodeBlock.Telugu,
            UnicodeBlock.Kannada,
            UnicodeBlock.Malayalam,
            UnicodeBlock.Sinhala,
            UnicodeBlock.Thai,
            UnicodeBlock.Lao,
            UnicodeBlock.Tibetan,
            UnicodeBlock.Myanmar,
            UnicodeBlock.Georgian,
            UnicodeBlock.HangulJamo,
            UnicodeBlock.Ethiopic,
            null,
            UnicodeBlock.Cherokee,
            UnicodeBlock.UnifiedCanadianAboriginalSyllabics,
            UnicodeBlock.Ogham,
            UnicodeBlock.Runic,
            UnicodeBlock.Tagalog,
            UnicodeBlock.Hanunoo,
            UnicodeBlock.Buhid,
            UnicodeBlock.Tagbanwa,
            UnicodeBlock.Khmer,
            UnicodeBlock.Mongolian,
            null,
            UnicodeBlock.Limbu,
            UnicodeBlock.TaiLe,
            null,
            UnicodeBlock.KhmerSymbols,
            null,
            UnicodeBlock.PhoneticExtensions,
            null,
            UnicodeBlock.LatinExtendedAdditional,
            UnicodeBlock.GreekExtended,
            UnicodeBlock.GeneralPunctuation,
            UnicodeBlock.SuperscriptsAndSubscripts,
            UnicodeBlock.CurrencySymbols,
            UnicodeBlock.CombiningMarksForSymbols,
            UnicodeBlock.LetterlikeSymbols,
            UnicodeBlock.NumberForms,
            UnicodeBlock.Arrows,
            UnicodeBlock.MathematicalOperators,
            UnicodeBlock.MiscellaneousTechnical,
            UnicodeBlock.ControlPictures,
            UnicodeBlock.OpticalCharacterRecognition,
            UnicodeBlock.EnclosedAlphanumerics,
            UnicodeBlock.BoxDrawing,
            UnicodeBlock.BlockElements,
            UnicodeBlock.GeometricShapes,
            UnicodeBlock.MiscellaneousSymbols,
            UnicodeBlock.Dingbats,
            UnicodeBlock.MiscellaneousMathematicalSymbolsA,
            UnicodeBlock.SupplementalArrowsA,
            UnicodeBlock.BraillePatterns,
            UnicodeBlock.SupplementalArrowsB,
            UnicodeBlock.MiscellaneousMathematicalSymbolsB,
            UnicodeBlock.SupplementalMathematicalOperators,
            UnicodeBlock.MiscellaneousSymbolsAndArrows,
            null,
            UnicodeBlock.CjkRadicalsSupplement,
            UnicodeBlock.KangxiRadicals,
            null,
            UnicodeBlock.IdeographicDescriptionCharacters,
            UnicodeBlock.CjkSymbolsAndPunctuation,
            UnicodeBlock.Hiragana,
            UnicodeBlock.Katakana,
            UnicodeBlock.Bopomofo,
            UnicodeBlock.HangulCompatibilityJamo,
            UnicodeBlock.Kanbun,
            UnicodeBlock.BopomofoExtended,
            null,
            UnicodeBlock.KatakanaPhoneticExtensions,
            UnicodeBlock.EnclosedCjkLettersAndMonths,
            UnicodeBlock.CjkCompatibility,
            UnicodeBlock.CjkUnifiedIdeographsExtensionA,
            UnicodeBlock.YijingHexagramSymbols,
            UnicodeBlock.CjkUnifiedIdeographs,
            UnicodeBlock.YiSyllables,
            UnicodeBlock.YiRadicals,
            null,
            UnicodeBlock.HangulSyllables,
            null,
            UnicodeBlock.HighSurrogates,
            UnicodeBlock.HighPrivateUseSurrogates,
            UnicodeBlock.LowSurrogates,
            UnicodeBlock.PrivateUseArea,
            UnicodeBlock.CjkCompatibilityIdeographs,
            UnicodeBlock.AlphabeticPresentationForms,
            UnicodeBlock.ArabicPresentationFormsA,
            UnicodeBlock.VariationSelectors,
            null,
            UnicodeBlock.CombiningHalfMarks,
            UnicodeBlock.CjkCompatibilityForms,
            UnicodeBlock.SmallFormVariants,
            UnicodeBlock.ArabicPresentationFormsB,
            UnicodeBlock.HalfwidthAndFullwidthForms,
            UnicodeBlock.Specials,
            UnicodeBlock.LinearBSyllabary,
            UnicodeBlock.LinearBIdeograms,
            UnicodeBlock.AegeanNumbers,
            null,
            UnicodeBlock.OldItalic,
            UnicodeBlock.Gothic,
            null,
            UnicodeBlock.Ugaritic,
            null,
            UnicodeBlock.Deseret,
            UnicodeBlock.Shavian,
            UnicodeBlock.Osmanya,
            null,
            UnicodeBlock.CypriotSyllabary,
            null,
            UnicodeBlock.ByzantineMusicalSymbols,
            UnicodeBlock.MusicalSymbols,
            null,
            UnicodeBlock.TaiXuanJingSymbols,
            null,
            UnicodeBlock.MathematicalAlphanumericSymbols,
            null,
            UnicodeBlock.CjkUnifiedIdeographsExtensionB,
            null,
            UnicodeBlock.CjkCompatibilityIdeographsSupplement,
            null,
            UnicodeBlock.Tags,
            null,
            UnicodeBlock.VariationSelectorsSupplement,
            null,
            UnicodeBlock.SupplementaryPrivateUseAreaA,
            UnicodeBlock.SupplementaryPrivateUseAreaB,

            #endregion
        };

        #region Public methods

        /// <summary>
        /// Returns the Unicode block containing the given UTF-16 code unit.
        /// </summary>
        /// <param name="ch">
        /// The character to classify. A <see cref="char"/> is a single UTF-16 code unit,
        /// so one half of a surrogate pair maps to a surrogate block; use
        /// <see cref="GetUnicodeBlock(int)"/> with the combined code point to classify
        /// supplementary characters.
        /// </param>
        /// <returns>
        /// The containing <see cref="UnicodeBlock"/>, or <c>null</c> when the character
        /// falls in a range the table lists as unassigned.
        /// </returns>
        public static UnicodeBlock? GetUnicodeBlock(this char ch)
        {
            // Every char value (0x0000-0xFFFF) is a valid code point, so this never throws.
            return GetUnicodeBlock((int)ch);
        }

        /// <summary>
        /// Returns the Unicode block containing the given code point.
        /// </summary>
        /// <param name="codePoint">A Unicode code point in the range 0x000000-0x10FFFF.</param>
        /// <returns>
        /// The containing <see cref="UnicodeBlock"/>, or <c>null</c> when the code point
        /// falls in a range the table lists as unassigned.
        /// </returns>
        /// <exception cref="ArgumentException">
        /// <paramref name="codePoint"/> is outside the valid code point range.
        /// </exception>
        public static UnicodeBlock? GetUnicodeBlock(int codePoint)
        {
            if (!IsValidCodePoint(codePoint))
            {
                throw new ArgumentException("Argument is not a valid code point.", nameof(codePoint));
            }

            int index = Array.BinarySearch(_unicodeBlockStarts, codePoint);

            if (index < 0)
            {
                // Not an exact block start: ~index is the position of the first start
                // greater than codePoint, so the containing range is the one before it.
                // The table begins at 0 and codePoint >= 0, hence ~index >= 1 here.
                index = ~index - 1;
            }

            return _unicodeBlocks[index];
        }

        #endregion

        #region Private helper methods

        // True when codePoint lies within the inclusive range [MIN_CODE_POINT, MAX_CODE_POINT].
        private static bool IsValidCodePoint(int codePoint)
        {
            return codePoint >= MIN_CODE_POINT && codePoint <= MAX_CODE_POINT;
        }

        #endregion
    }
}