commit
f525f5a89e
@ -0,0 +1,125 @@
|
||||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Mozilla Public License Version
|
||||
* 1.1 (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/MPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is Mozilla Universal charset detector code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is
|
||||
* Netscape Communications Corporation.
|
||||
* Portions created by the Initial Developer are Copyright (C) 2001
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the MPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
using System.IO;
|
||||
|
||||
namespace UniversalDetector
|
||||
{
|
||||
/// <summary>
/// Default implementation of the charset detection interface.
/// The detector can be fed by a <see cref="System.IO.Stream"/>:
/// <example>
/// <code>
/// using (FileStream fs = File.OpenRead(filename)) {
///     CharsetDetector cdet = new CharsetDetector();
///     cdet.Feed(fs);
///     cdet.DataEnd();
///     Console.WriteLine("{0}, {1}", cdet.Charset, cdet.Confidence);
/// }
/// </code>
/// </example>
///
/// or by a byte array:
///
/// <example>
/// <code>
/// CharsetDetector cdet = new CharsetDetector();
/// byte[] buff = new byte[1024];
/// while (!cdet.IsDone()) {
///     int read = stream.Read(buff, 0, buff.Length);
///     if (read == 0)
///         break;
///     cdet.Feed(buff, 0, read);
/// }
/// cdet.DataEnd();
/// Console.WriteLine("{0}, {1}", cdet.Charset, cdet.Confidence);
/// </code>
/// </example>
/// </summary>
public class CharsetDetector : Core.UniversalDetector, ICharsetDetector
{
    // Last result handed to Report(); null / 0 until a result is reported
    // and again after Reset().
    private string charset;

    private float confidence;

    //public event DetectorFinished Finished;

    /// <summary>
    /// Creates a detector configured with the base class' FILTER_ALL setting.
    /// </summary>
    public CharsetDetector() : base(FILTER_ALL)
    {

    }

    /// <summary>
    /// Feeds the detector from a stream in chunks of up to 1024 bytes until
    /// the stream is exhausted or the detector reports that it is done.
    /// </summary>
    /// <param name="stream">the stream to read from; it is not closed</param>
    /// <exception cref="System.ArgumentNullException">
    /// <paramref name="stream"/> is null
    /// </exception>
    public void Feed(Stream stream)
    {
        if (stream == null)
            throw new System.ArgumentNullException("stream");

        byte[] buff = new byte[1024];
        int read;
        // Test `done` before reading: once detection has finished no further
        // bytes are pulled from the caller's stream only to be thrown away.
        while (!done && (read = stream.Read(buff, 0, buff.Length)) > 0)
        {
            Feed(buff, 0, read);
        }
    }

    /// <summary>
    /// Tells whether the detector has finished.
    /// </summary>
    /// <returns>the current value of the base class' done flag</returns>
    public bool IsDone()
    {
        return done;
    }

    /// <summary>
    /// Clears the stored charset and confidence, then resets the base detector.
    /// </summary>
    public override void Reset()
    {
        this.charset = null;
        this.confidence = 0.0f;
        base.Reset();
    }

    /// <summary>
    /// The charset name last passed to <see cref="Report"/>, or null if
    /// nothing has been reported since construction or the last reset.
    /// </summary>
    public string Charset {
        get { return charset; }
    }

    /// <summary>
    /// The confidence last passed to <see cref="Report"/>, or 0 if nothing
    /// has been reported since construction or the last reset.
    /// </summary>
    public float Confidence {
        get { return confidence; }
    }

    /// <summary>
    /// Stores the reported result so that it can be read back through
    /// <see cref="Charset"/> and <see cref="Confidence"/>.
    /// </summary>
    /// <param name="charset">name of the reported charset</param>
    /// <param name="confidence">confidence of the reported charset</param>
    protected override void Report(string charset, float confidence)
    {
        this.charset = charset;
        this.confidence = confidence;
        // if (Finished != null) {
        //     Finished(charset, confidence);
        // }
    }
}
|
||||
|
||||
//public delegate void DetectorFinished(string charset, float confidence);
|
||||
|
||||
}
|
||||
|
@ -0,0 +1,106 @@
|
||||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Mozilla Public License Version
|
||||
* 1.1 (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/MPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is Mozilla Universal charset detector code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is
|
||||
* Netscape Communications Corporation.
|
||||
* Portions created by the Initial Developer are Copyright (C) 2001
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the MPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
namespace UniversalDetector.Core
|
||||
{
|
||||
/// <summary>
/// Prober that combines a coding state machine (BIG5SMModel) with a
/// character distribution analyser to decide whether the data it is fed
/// looks like Big5.
/// </summary>
public class Big5Prober : CharsetProber
{
    private readonly CodingStateMachine codingSM;
    private readonly BIG5DistributionAnalyser distributionAnalyser;

    // lastChar[0] holds the final byte of the previous chunk; lastChar[1] is
    // filled with the first byte of the current chunk so that a character
    // split across two HandleData calls can be analysed as one unit.
    private readonly byte[] lastChar = new byte[2];

    public Big5Prober()
    {
        this.codingSM = new CodingStateMachine(new BIG5SMModel());
        this.distributionAnalyser = new BIG5DistributionAnalyser();
        this.Reset();
    }

    /// <summary>
    /// Feeds a chunk of bytes to the prober.
    /// </summary>
    /// <param name="buf">a buffer</param>
    /// <param name="offset">offset of the first byte to examine</param>
    /// <param name="len">number of bytes to examine; may be 0</param>
    /// <returns>the prober state after the chunk has been processed</returns>
    public override ProbingState HandleData(byte[] buf, int offset, int len)
    {
        int max = offset + len;

        for (int i = offset; i < max; i++) {
            int codingState = codingSM.NextState(buf[i]);
            if (codingState == SMModel.ERROR) {
                state = ProbingState.NotMe;
                break;
            }
            if (codingState == SMModel.ITSME) {
                state = ProbingState.FoundIt;
                break;
            }
            if (codingState == SMModel.START) {
                // The state machine is back at START: a character just ended.
                int charLen = codingSM.CurrentCharLen;
                if (i == offset) {
                    // The character ends on the first byte of this chunk, so
                    // pair it with the byte saved from the previous chunk.
                    lastChar[1] = buf[offset];
                    distributionAnalyser.HandleOneChar(lastChar, 0, charLen);
                } else {
                    distributionAnalyser.HandleOneChar(buf, i-1, charLen);
                }
            }
        }

        // Remember the last byte for the next call. An empty chunk has no
        // last byte: buf[max-1] would then be buf[offset-1], which is either
        // out of range or a byte that does not belong to this chunk.
        if (len > 0)
            lastChar[0] = buf[max-1];

        if (state == ProbingState.Detecting)
            if (distributionAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
                state = ProbingState.FoundIt;
        return state;
    }

    /// <summary>
    /// Resets the state machine and the distribution analyser and puts the
    /// prober back into the Detecting state.
    /// </summary>
    public override void Reset()
    {
        codingSM.Reset();
        state = ProbingState.Detecting;
        distributionAnalyser.Reset();
    }

    /// <summary>
    /// Returns the name of the charset this prober recognises.
    /// </summary>
    public override string GetCharsetName()
    {
        return "Big-5";
    }

    /// <summary>
    /// Returns the confidence computed by the distribution analyser.
    /// </summary>
    public override float GetConfidence()
    {
        return distributionAnalyser.GetConfidence();
    }

}
|
||||
}
|
@ -0,0 +1,98 @@
|
||||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Mozilla Public License Version
|
||||
* 1.1 (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/MPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is Mozilla Universal charset detector code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is
|
||||
* Netscape Communications Corporation.
|
||||
* Portions created by the Initial Developer are Copyright (C) 2001
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
* Kohei TAKETA <k-tak@void.in> (Java port)
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the MPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
namespace UniversalDetector.Core
|
||||
{
|
||||
/// <summary>
/// Lookup table whose entries ("units" of 4, 8 or 16 bits) are packed
/// several to a 32-bit int. <see cref="Unpack"/> reads one unit by its
/// logical index; the static Pack* helpers build the packed ints.
/// </summary>
public class BitPackage
{
    // log2 of the number of units stored in one int: index >> INDEX_SHIFT
    // selects the int that holds the unit.
    public const int INDEX_SHIFT_4BITS  = 3;
    public const int INDEX_SHIFT_8BITS  = 2;
    public const int INDEX_SHIFT_16BITS = 1;

    // index & SHIFT_MASK gives the position of the unit inside its int.
    public const int SHIFT_MASK_4BITS  = 7;
    public const int SHIFT_MASK_8BITS  = 3;
    public const int SHIFT_MASK_16BITS = 1;

    // log2 of the unit width in bits: position << BIT_SHIFT is the bit
    // offset of the unit inside its int.
    public const int BIT_SHIFT_4BITS  = 2;
    public const int BIT_SHIFT_8BITS  = 3;
    public const int BIT_SHIFT_16BITS = 4;

    // Mask that isolates a single unit once it has been shifted down.
    public const int UNIT_MASK_4BITS  = 0x0000000F;
    public const int UNIT_MASK_8BITS  = 0x000000FF;
    public const int UNIT_MASK_16BITS = 0x0000FFFF;

    private readonly int indexShift;
    private readonly int shiftMask;
    private readonly int bitShift;
    private readonly int unitMask;
    private readonly int[] data;

    /// <summary>
    /// Wraps an array of packed ints. The four layout parameters are
    /// expected to be a matching set of the *_4BITS, *_8BITS or *_16BITS
    /// constants of this class.
    /// </summary>
    /// <param name="indexShift">right shift turning a unit index into an array index</param>
    /// <param name="shiftMask">mask extracting the unit position inside an int</param>
    /// <param name="bitShift">left shift turning a unit position into a bit offset</param>
    /// <param name="unitMask">mask isolating one unit</param>
    /// <param name="data">the packed ints; the array is referenced, not copied</param>
    public BitPackage(int indexShift, int shiftMask,
            int bitShift, int unitMask, int[] data)
    {
        this.indexShift = indexShift;
        this.shiftMask = shiftMask;
        this.bitShift = bitShift;
        this.unitMask = unitMask;
        this.data = data;
    }

    /// <summary>
    /// Packs two 16-bit units into one int, <paramref name="a"/> in the low
    /// half and <paramref name="b"/> in the high half. The arguments are not
    /// masked, so <paramref name="a"/> must already fit in 16 bits.
    /// </summary>
    public static int Pack16bits(int a, int b)
    {
        return ((b << 16) | a);
    }

    /// <summary>
    /// Packs four 8-bit units into one int, <paramref name="a"/> in the
    /// lowest byte through <paramref name="d"/> in the highest byte.
    /// The arguments are not masked.
    /// </summary>
    public static int Pack8bits(int a, int b, int c, int d)
    {
        return Pack16bits((b << 8) | a, (d << 8) | c);
    }

    /// <summary>
    /// Packs eight 4-bit units into one int, <paramref name="a"/> in the
    /// lowest nibble through <paramref name="h"/> in the highest nibble.
    /// The arguments are not masked.
    /// </summary>
    public static int Pack4bits(int a, int b, int c, int d,
                                int e, int f, int g, int h)
    {
        return Pack8bits((b << 4) | a, (d << 4) | c,
                         (f << 4) | e, (h << 4) | g);
    }

    /// <summary>
    /// Reads the unit at logical index <paramref name="i"/>.
    /// </summary>
    /// <param name="i">zero-based unit index; it is not range-checked here</param>
    /// <returns>the unit value, masked with the unit mask</returns>
    public int Unpack(int i)
    {
        // The shift is arithmetic, but the final mask removes any sign bits
        // dragged in when the containing int is negative.
        return (data[i >> indexShift] >>
                ((i & shiftMask) << bitShift)) & unitMask;
    }
}
|
||||
}
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,191 @@
|
||||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Mozilla Public License Version
|
||||
* 1.1 (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/MPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is Mozilla Universal charset detector code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is
|
||||
* Netscape Communications Corporation.
|
||||
* Portions created by the Initial Developer are Copyright (C) 2001
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the MPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
using System.IO;
|
||||
|
||||
namespace UniversalDetector.Core
|
||||
{
|
||||
/// <summary>
/// Answer a prober gives about the data it has been fed so far.
/// </summary>
public enum ProbingState
{
    /// <summary>No sure answer yet, but the caller can ask for a confidence.</summary>
    Detecting = 0,

    /// <summary>Positive answer.</summary>
    FoundIt = 1,

    /// <summary>Negative answer.</summary>
    NotMe = 2
}
|
||||
|
||||
/// <summary>
/// Base class of the charset probers. Holds the current
/// <see cref="ProbingState"/> and two byte-filtering helpers for subclasses.
/// </summary>
public abstract class CharsetProber
{
    protected const float SHORTCUT_THRESHOLD = 0.95F;

    protected ProbingState state;

    // ASCII codes
    private const byte SPACE = 0x20;
    private const byte CAPITAL_A = 0x41;
    private const byte CAPITAL_Z = 0x5A;
    private const byte SMALL_A = 0x61;
    private const byte SMALL_Z = 0x7A;
    private const byte LESS_THAN = 0x3C;
    private const byte GREATER_THAN = 0x3E;

    /// <summary>
    /// Feed data to the prober
    /// </summary>
    /// <param name="buf">a buffer</param>
    /// <param name="offset">offset into buffer</param>
    /// <param name="len">number of bytes available into buffer</param>
    /// <returns>
    /// A <see cref="ProbingState"/>
    /// </returns>
    public abstract ProbingState HandleData(byte[] buf, int offset, int len);

    /// <summary>
    /// Reset prober state
    /// </summary>
    public abstract void Reset();

    /// <summary>
    /// Name of the charset the prober stands for.
    /// </summary>
    public abstract string GetCharsetName();

    /// <summary>
    /// Confidence of the prober in its charset.
    /// </summary>
    public abstract float GetConfidence();

    /// <summary>
    /// Returns the current value of <see cref="state"/>.
    /// </summary>
    public virtual ProbingState GetState()
    {
        return state;
    }

    /// <summary>
    /// Does nothing unless overridden.
    /// </summary>
    public virtual void SetOption()
    {
    }

    /// <summary>
    /// Does nothing unless overridden.
    /// </summary>
    public virtual void DumpStatus()
    {
    }

    //
    // Helper functions used in the Latin1 and Group probers
    //

    // True for the bytes 'A'..'Z' and 'a'..'z'.
    private static bool IsAsciiLetter(byte b)
    {
        return (b >= CAPITAL_A && b <= CAPITAL_Z)
            || (b >= SMALL_A && b <= SMALL_Z);
    }

    /// <summary>
    /// Splits the input into segments delimited by ASCII bytes that are not
    /// letters and copies only the segments that contain at least one byte
    /// with the high bit set. A space is written after every copied segment
    /// that ended on a delimiter; a copied segment that runs to the end of
    /// the input gets no trailing space.
    /// </summary>
    /// <param name="buf">a buffer</param>
    /// <param name="offset">offset of the first byte to examine</param>
    /// <param name="len">number of bytes to examine</param>
    /// <returns>filtered buffer</returns>
    protected static byte[] FilterWithoutEnglishLetters(byte[] buf, int offset, int len)
    {
        using (MemoryStream output = new MemoryStream(buf.Length)) {
            int limit = offset + len;
            int segmentStart = offset;
            bool sawHighByte = false;
            int pos;

            for (pos = offset; pos < limit; pos++) {
                byte b = buf[pos];

                if ((b & 0x80) != 0) {
                    sawHighByte = true;
                    continue;
                }
                if (IsAsciiLetter(b))
                    continue;

                // b is an ASCII non-letter: it closes the current segment.
                if (sawHighByte && pos > segmentStart) {
                    output.Write(buf, segmentStart, pos - segmentStart);
                    output.WriteByte(SPACE);
                    sawHighByte = false;
                }
                segmentStart = pos + 1;
            }

            // Trailing segment that was not closed by a delimiter.
            if (sawHighByte && pos > segmentStart)
                output.Write(buf, segmentStart, pos - segmentStart);

            return output.ToArray();
        }
    }

    /// <summary>
    /// Do filtering to reduce load to probers (Remove ASCII symbols,
    /// collapse spaces). This filter applies to all scripts which contain
    /// both English characters and upper ASCII characters.
    /// </summary>
    /// <remarks>
    /// Segments are delimited by ASCII bytes that are not letters. A segment
    /// is dropped when the delimiter that closes it is reached while the
    /// in-tag flag is set. The flag is set by a less-than byte (0x3C) and
    /// cleared by a greater-than byte (0x3E); it is updated before the
    /// closing delimiter is evaluated, so the segment that ends at a
    /// greater-than byte is kept.
    /// </remarks>
    /// <param name="buf">a buffer</param>
    /// <param name="offset">offset of the first byte to examine</param>
    /// <param name="len">number of bytes to examine</param>
    /// <returns>a filtered copy of the input buffer</returns>
    protected static byte[] FilterWithEnglishLetters(byte[] buf, int offset, int len)
    {
        using (MemoryStream output = new MemoryStream(buf.Length)) {
            int limit = offset + len;
            int segmentStart = offset;
            bool insideTag = false;
            int pos;

            for (pos = offset; pos < limit; pos++) {
                byte b = buf[pos];

                if (b == GREATER_THAN)
                    insideTag = false;
                else if (b == LESS_THAN)
                    insideTag = true;

                // High-bit bytes and ASCII letters extend the current segment.
                if ((b & 0x80) != 0 || IsAsciiLetter(b))
                    continue;

                // it's ascii, but it's not a letter
                if (pos > segmentStart && !insideTag) {
                    output.Write(buf, segmentStart, pos - segmentStart);
                    output.WriteByte(SPACE);
                }
                segmentStart = pos + 1;
            }

            // If the current segment contains more than just a symbol
            // and it is not inside a tag then keep it.
            if (!insideTag && pos > segmentStart)
                output.Write(buf, segmentStart, pos - segmentStart);

            return output.ToArray();
        }
    }
}
|
||||
}
|
||||
|
@ -0,0 +1,149 @@
|
||||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Mozilla Public License Version
|
||||
* 1.1 (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/MPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is mozilla.org code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is
|
||||
* Netscape Communications Corporation.
|
||||
* Portions created by the Initial Developer are Copyright (C) 1998
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the MPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
namespace UniversalDetector.Core
|
||||
{
|
||||
/// <summary>
/// Charset name constants.
/// </summary>
public static class Charsets
{
    // ---- ASCII and Unicode encodings ----

    public const string ASCII = "ASCII";
    public const string UTF8 = "UTF-8";
    public const string UTF16_LE = "UTF-16LE";
    public const string UTF16_BE = "UTF-16BE";
    public const string UTF32_BE = "UTF-32BE";
    public const string UTF32_LE = "UTF-32LE";

    /// <summary>Unusual BOM (3412 order)</summary>
    public const string UCS4_3412 = "X-ISO-10646-UCS-4-3412";

    /// <summary>Unusual BOM (2413 order)</summary>
    public const string UCS4_2413 = "X-ISO-10646-UCS-4-2413";

    // ---- Windows code pages ----

    /// <summary>Cyrillic (based on bulgarian and russian data)</summary>
    public const string WIN1251 = "windows-1251";

    /// <summary>Latin-1, almost identical to ISO-8859-1</summary>
    public const string WIN1252 = "windows-1252";

    /// <summary>Greek</summary>
    public const string WIN1253 = "windows-1253";

    /// <summary>Logical hebrew (includes ISO-8859-8-I and most of x-mac-hebrew)</summary>
    public const string WIN1255 = "windows-1255";

    // ---- Multi-byte and escape-based charsets ----

    /// <summary>Traditional chinese</summary>
    public const string BIG5 = "Big-5";

    public const string EUCKR = "EUC-KR";
    public const string EUCJP = "EUC-JP";
    public const string EUCTW = "EUC-TW";

    /// <summary>Note: gb2312 is a subset of gb18030</summary>
    public const string GB18030 = "gb18030";

    public const string ISO2022_JP = "ISO-2022-JP";
    public const string ISO2022_CN = "ISO-2022-CN";
    public const string ISO2022_KR = "ISO-2022-KR";

    /// <summary>Simplified chinese</summary>
    public const string HZ_GB_2312 = "HZ-GB-2312";

    public const string SHIFT_JIS = "Shift-JIS";

    // ---- Single-byte charsets ----

    public const string MAC_CYRILLIC = "x-mac-cyrillic";
    public const string KOI8R = "KOI8-R";
    public const string IBM855 = "IBM855";
    public const string IBM866 = "IBM866";

    /// <summary>
    /// East-Europe. Disabled because too similar to windows-1252
    /// (latin-1). Should use tri-grams models to discriminate between
    /// these two charsets.
    /// </summary>
    public const string ISO8859_2 = "ISO-8859-2";

    /// <summary>Cyrillic</summary>
    public const string ISO8859_5 = "ISO-8859-5";

    /// <summary>Greek</summary>
    public const string ISO_8859_7 = "ISO-8859-7";

    /// <summary>Visual Hebrew</summary>
    public const string ISO8859_8 = "ISO-8859-8";

    /// <summary>Thai. This recognizer is not enabled yet.</summary>
    public const string TIS620 = "TIS620";
}
|
||||
}
|
@ -0,0 +1,90 @@
|
||||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Mozilla Public License Version
|
||||
* 1.1 (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/MPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is mozilla.org code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is
|
||||
* Netscape Communications Corporation.
|
||||
* Portions created by the Initial Developer are Copyright (C) 1998
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Kohei TAKETA <k-tak@void.in> (Java port)
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the MPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
namespace UniversalDetector.Core
|
||||
{
|
||||
/// <summary>
/// Parallel state machine for the Coding Scheme Method. It is driven one
/// byte at a time by <see cref="NextState"/> using the tables of an
/// <see cref="SMModel"/>.
/// </summary>
public class CodingStateMachine
{
    private SMModel model;
    private int currentState;
    private int currentCharLen;
    private int currentBytePos;

    /// <summary>
    /// Creates a state machine over the given model, positioned at the
    /// model's START state.
    /// </summary>
    /// <param name="model">the model supplying the class and state tables</param>
    public CodingStateMachine(SMModel model)
    {
        this.model = model;
        this.currentState = SMModel.START;
    }

    /// <summary>
    /// Advances the machine by one byte.
    /// </summary>
    /// <param name="b">the next input byte</param>
    /// <returns>the state entered after consuming the byte</returns>
    public int NextState(byte b)
    {
        // Every byte is first mapped to its class.
        int cls = model.GetClass(b);

        // At START this byte opens a new character: restart the byte
        // position and look up the character length for the byte's class.
        if (currentState == SMModel.START)
        {
            currentBytePos = 0;
            currentCharLen = model.charLenTable[cls];
        }

        // The state table is indexed by (state, class).
        currentState = model.stateTable.Unpack(
            currentState * model.ClassFactor + cls);
        currentBytePos++;
        return currentState;
    }

    /// <summary>
    /// Puts the machine back into the START state.
    /// </summary>
    public void Reset()
    {
        currentState = SMModel.START;
    }

    /// <summary>
    /// Character length looked up the last time a byte was consumed from
    /// the START state.
    /// </summary>
    public int CurrentCharLen
    {
        get { return currentCharLen; }
    }

    /// <summary>
    /// Name of the underlying model.
    /// </summary>
    public string ModelName
    {
        get { return model.Name; }
    }
}
|
||||
}
|
@ -0,0 +1,110 @@
|
||||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Mozilla Public License Version
|
||||
* 1.1 (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/MPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is Mozilla Universal charset detector code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is
|
||||
* Netscape Communications Corporation.
|
||||
* Portions created by the Initial Developer are Copyright (C) 2001
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the MPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
namespace UniversalDetector.Core
|
||||
{
|
||||
/// <summary>
/// Prober that combines a coding state machine (EUCJPSMModel) with a
/// context analyser and a character distribution analyser to decide
/// whether the data it is fed looks like EUC-JP.
/// </summary>
public class EUCJPProber : CharsetProber
{
    private readonly CodingStateMachine codingSM;
    private readonly EUCJPContextAnalyser contextAnalyser;
    private readonly EUCJPDistributionAnalyser distributionAnalyser;

    // lastChar[0] holds the final byte of the previous chunk; lastChar[1] is
    // filled with the first byte of the current chunk so that a character
    // split across two HandleData calls can be analysed as one unit.
    private readonly byte[] lastChar = new byte[2];

    public EUCJPProber()
    {
        codingSM = new CodingStateMachine(new EUCJPSMModel());
        distributionAnalyser = new EUCJPDistributionAnalyser();
        contextAnalyser = new EUCJPContextAnalyser();
        Reset();
    }

    /// <summary>
    /// Returns the name of the charset this prober recognises.
    /// </summary>
    public override string GetCharsetName()
    {
        return "EUC-JP";
    }

    /// <summary>
    /// Feeds a chunk of bytes to the prober.
    /// </summary>
    /// <param name="buf">a buffer</param>
    /// <param name="offset">offset of the first byte to examine</param>
    /// <param name="len">number of bytes to examine; may be 0</param>
    /// <returns>the prober state after the chunk has been processed</returns>
    public override ProbingState HandleData(byte[] buf, int offset, int len)
    {
        int max = offset + len;

        for (int i = offset; i < max; i++) {
            int codingState = codingSM.NextState(buf[i]);
            if (codingState == SMModel.ERROR) {
                state = ProbingState.NotMe;
                break;
            }
            if (codingState == SMModel.ITSME) {
                state = ProbingState.FoundIt;
                break;
            }
            if (codingState == SMModel.START) {
                // The state machine is back at START: a character just ended.
                int charLen = codingSM.CurrentCharLen;
                if (i == offset) {
                    // The character ends on the first byte of this chunk, so
                    // pair it with the byte saved from the previous chunk.
                    lastChar[1] = buf[offset];
                    contextAnalyser.HandleOneChar(lastChar, 0, charLen);
                    distributionAnalyser.HandleOneChar(lastChar, 0, charLen);
                } else {
                    contextAnalyser.HandleOneChar(buf, i-1, charLen);
                    distributionAnalyser.HandleOneChar(buf, i-1, charLen);
                }
            }
        }

        // Remember the last byte for the next call. An empty chunk has no
        // last byte: buf[max-1] would then be buf[offset-1], which is either
        // out of range or a byte that does not belong to this chunk.
        if (len > 0)
            lastChar[0] = buf[max-1];

        if (state == ProbingState.Detecting)
            if (contextAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
                state = ProbingState.FoundIt;
        return state;
    }

    /// <summary>
    /// Resets the state machine and both analysers and puts the prober back
    /// into the Detecting state.
    /// </summary>
    public override void Reset()
    {
        codingSM.Reset();
        state = ProbingState.Detecting;
        contextAnalyser.Reset();
        distributionAnalyser.Reset();
    }

    /// <summary>
    /// Returns the larger of the context analyser's and the distribution
    /// analyser's confidence.
    /// </summary>
    public override float GetConfidence()
    {
        float contxtCf = contextAnalyser.GetConfidence();
        float distribCf = distributionAnalyser.GetConfidence();
        return (contxtCf > distribCf ? contxtCf : distribCf);
    }

}
|
||||
}
|
@ -0,0 +1,107 @@
|
||||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Mozilla Public License Version
|
||||
* 1.1 (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/MPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is Mozilla Universal charset detector code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is
|
||||
* Netscape Communications Corporation.
|
||||
* Portions created by the Initial Developer are Copyright (C) 2001
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the MPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
namespace UniversalDetector.Core
|
||||
{
|
||||
/// <summary>
/// Prober for the EUC-KR charset. Bytes go through a coding state machine
/// built from <c>EUCKRSMModel</c>; every completed character is handed to an
/// <c>EUCKRDistributionAnalyser</c>, whose confidence is the prober's confidence.
/// </summary>
public class EUCKRProber : CharsetProber
{
    private readonly CodingStateMachine codingSM;
    private readonly EUCKRDistributionAnalyser distributionAnalyser;

    // [0] holds the last byte of the previous HandleData call,
    // [1] receives the first byte of the current one.
    private readonly byte[] lastChar = new byte[2];

    /// <summary>
    /// Creates the state machine and the analyser, then calls <see cref="Reset"/>.
    /// </summary>
    public EUCKRProber()
    {
        this.codingSM = new CodingStateMachine(new EUCKRSMModel());
        this.distributionAnalyser = new EUCKRDistributionAnalyser();
        this.Reset();
    }

    /// <summary>
    /// Returns the name of the charset this prober recognises.
    /// </summary>
    /// <returns>Always the literal "EUC-KR".</returns>
    public override string GetCharsetName()
    {
        const string charsetName = "EUC-KR";
        return charsetName;
    }

    /// <summary>
    /// Feeds <paramref name="len"/> bytes of <paramref name="buf"/>, starting
    /// at <paramref name="offset"/>, through the coding state machine and
    /// passes every completed character to the distribution analyser.
    /// </summary>
    /// <param name="buf">Buffer holding the bytes to examine.</param>
    /// <param name="offset">Index of the first byte to examine.</param>
    /// <param name="len">Number of bytes to examine.</param>
    /// <returns>The prober state after the bytes have been processed.</returns>
    public override ProbingState HandleData(byte[] buf, int offset, int len)
    {
        int end = offset + len;

        for (int pos = offset; pos < end; pos++)
        {
            int smState = codingSM.NextState(buf[pos]);

            if (smState == SMModel.ERROR)
            {
                state = ProbingState.NotMe;
                break;
            }

            if (smState == SMModel.ITSME)
            {
                state = ProbingState.FoundIt;
                break;
            }

            if (smState != SMModel.START)
            {
                continue;
            }

            // Back at START: a character has just been completed.
            int charLen = codingSM.CurrentCharLen;
            if (pos != offset)
            {
                distributionAnalyser.HandleOneChar(buf, pos - 1, charLen);
            }
            else
            {
                // First byte of this call: pair it with the byte saved
                // at the end of the previous call.
                lastChar[1] = buf[offset];
                distributionAnalyser.HandleOneChar(lastChar, 0, charLen);
            }
        }

        // Keep the final byte of the window for the next call.
        lastChar[0] = buf[end - 1];

        // Shortcut: enough data and a confidence above SHORTCUT_THRESHOLD.
        if (state == ProbingState.Detecting
            && distributionAnalyser.GotEnoughData()
            && GetConfidence() > SHORTCUT_THRESHOLD)
        {
            state = ProbingState.FoundIt;
        }

        return state;
    }

    /// <summary>
    /// Reports the distribution analyser's confidence.
    /// </summary>
    /// <returns>The value returned by the distribution analyser.</returns>
    public override float GetConfidence()
    {
        float confidence = distributionAnalyser.GetConfidence();
        return confidence;
    }

    /// <summary>
    /// Puts the prober back into the detecting state and resets the state
    /// machine and the analyser.
    /// </summary>
    public override void Reset()
    {
        state = ProbingState.Detecting;

        codingSM.Reset();
        distributionAnalyser.Reset();
    }
}
|
||||
}
|
@ -0,0 +1,106 @@
|
||||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Mozilla Public License Version
|
||||
* 1.1 (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/MPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is Mozilla Universal charset detector code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is
|
||||
* Netscape Communications Corporation.
|
||||
* Portions created by the Initial Developer are Copyright (C) 2001
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the MPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
namespace UniversalDetector.Core
|
||||
{
|
||||
/// <summary>
/// Prober for the EUC-TW charset. Bytes go through a coding state machine
/// built from <c>EUCTWSMModel</c>; every completed character is handed to an
/// <c>EUCTWDistributionAnalyser</c>, whose confidence is the prober's confidence.
/// </summary>
public class EUCTWProber : CharsetProber
{
    private CodingStateMachine codingSM;
    private EUCTWDistributionAnalyser distributionAnalyser;

    // [0] holds the last byte of the previous HandleData call,
    // [1] receives the first byte of the current one.
    private byte[] lastChar = new byte[2];

    /// <summary>
    /// Creates the state machine and the analyser, then calls <see cref="Reset"/>.
    /// </summary>
    public EUCTWProber()
    {
        this.codingSM = new CodingStateMachine(new EUCTWSMModel());
        this.distributionAnalyser = new EUCTWDistributionAnalyser();
        this.Reset();
    }

    /// <summary>
    /// Feeds <paramref name="len"/> bytes of <paramref name="buf"/>, starting
    /// at <paramref name="offset"/>, through the coding state machine and
    /// passes every completed character to the distribution analyser.
    /// </summary>
    /// <param name="buf">Buffer holding the bytes to examine.</param>
    /// <param name="offset">Index of the first byte to examine.</param>
    /// <param name="len">Number of bytes to examine.</param>
    /// <returns>The prober state after the bytes have been processed.</returns>
    public override ProbingState HandleData(byte[] buf, int offset, int len)
    {
        int codingState;
        int max = offset + len;

        // Start at offset, not 0: only the [offset, offset + len) window
        // belongs to this call. Starting at 0 would feed bytes outside the
        // window to the state machine and would make the "i == offset"
        // boundary branch fire in the middle of the loop.
        for (int i = offset; i < max; i++) {
            codingState = codingSM.NextState(buf[i]);
            if (codingState == SMModel.ERROR) {
                state = ProbingState.NotMe;
                break;
            }
            if (codingState == SMModel.ITSME) {
                state = ProbingState.FoundIt;
                break;
            }
            if (codingState == SMModel.START) {
                // Back at START: a character has just been completed.
                int charLen = codingSM.CurrentCharLen;
                if (i == offset) {
                    // First byte of this call: pair it with the byte saved
                    // at the end of the previous call.
                    lastChar[1] = buf[offset];
                    distributionAnalyser.HandleOneChar(lastChar, 0, charLen);
                } else {
                    distributionAnalyser.HandleOneChar(buf, i - 1, charLen);
                }
            }
        }

        // Keep the final byte of the window for the next call.
        lastChar[0] = buf[max - 1];

        // Shortcut: enough data and a confidence above SHORTCUT_THRESHOLD.
        if (state == ProbingState.Detecting) {
            if (distributionAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD) {
                state = ProbingState.FoundIt;
            }
        }
        return state;
    }

    /// <summary>
    /// Returns the name of the charset this prober recognises.
    /// </summary>
    /// <returns>Always the literal "x-euc-tw".</returns>
    public override string GetCharsetName()
    {
        return "x-euc-tw";
    }

    /// <summary>
    /// Resets the state machine and the analyser and puts the prober back
    /// into the detecting state.
    /// </summary>
    public override void Reset()
    {
        codingSM.Reset();
        state = ProbingState.Detecting;
        distributionAnalyser.Reset();
    }

    /// <summary>
    /// Reports the distribution analyser's confidence.
    /// </summary>
    /// <returns>The value returned by the distribution analyser.</returns>
    public override float GetConfidence()
    {
        return distributionAnalyser.GetConfidence();
    }
}
|
||||
}
|
@ -0,0 +1,105 @@
|
||||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Mozilla Public License Version
|
||||
* 1.1 (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/MPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is Mozilla Universal charset detector code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is
|
||||
* Netscape Communications Corporation.
|
||||
* Portions created by the Initial Developer are Copyright (C) 2001
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the MPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
namespace UniversalDetector.Core
|
||||
{
|
||||
/// <summary>
/// Prober that runs four coding state machines side by side (HZ,
/// ISO-2022-CN, ISO-2022-JP and ISO-2022-KR models). A machine that reports
/// ERROR is retired; the first machine to report ITSME decides the charset.
/// </summary>
public class EscCharsetProber : CharsetProber
{
    private const int CHARSETS_NUM = 4;

    // Name taken from the machine that reported ITSME; null until then.
    private string detectedCharset;

    // Machines still in play occupy indices [0, activeSM); retired ones
    // are swapped behind that boundary.
    private readonly CodingStateMachine[] codingSM;
    int activeSM;

    /// <summary>
    /// Creates the four state machines and calls <see cref="Reset"/>.
    /// </summary>
    public EscCharsetProber()
    {
        codingSM = new CodingStateMachine[CHARSETS_NUM]
        {
            new CodingStateMachine(new HZSMModel()),
            new CodingStateMachine(new ISO2022CNSMModel()),
            new CodingStateMachine(new ISO2022JPSMModel()),
            new CodingStateMachine(new ISO2022KRSMModel())
        };
        Reset();
    }

    /// <summary>
    /// Returns to the detecting state, resets every state machine, marks all
    /// of them active again and clears the detected charset name.
    /// </summary>
    public override void Reset()
    {
        state = ProbingState.Detecting;

        foreach (CodingStateMachine machine in codingSM)
        {
            machine.Reset();
        }

        activeSM = CHARSETS_NUM;
        detectedCharset = null;
    }

    /// <summary>
    /// Feeds <paramref name="len"/> bytes of <paramref name="buf"/>, starting
    /// at <paramref name="offset"/>, to every active state machine.
    /// </summary>
    /// <param name="buf">Buffer holding the bytes to examine.</param>
    /// <param name="offset">Index of the first byte to examine.</param>
    /// <param name="len">Number of bytes to examine.</param>
    /// <returns>
    /// FoundIt as soon as one machine reports ITSME, NotMe once every machine
    /// has reported ERROR, otherwise the current state.
    /// </returns>
    public override ProbingState HandleData(byte[] buf, int offset, int len)
    {
        int end = offset + len;

        for (int pos = offset; pos < end && state == ProbingState.Detecting; pos++)
        {
            // Each byte is fed to every machine that is still active,
            // walking from the highest active index down to 0.
            for (int slot = activeSM - 1; slot >= 0; slot--)
            {
                CodingStateMachine machine = codingSM[slot];
                int smState = machine.NextState(buf[pos]);

                if (smState == SMModel.ERROR)
                {
                    // This machine ruled itself out: shrink the active range.
                    activeSM--;
                    if (activeSM == 0)
                    {
                        state = ProbingState.NotMe;
                        return state;
                    }

                    if (slot != activeSM)
                    {
                        // Move the retired machine behind the boundary and
                        // pull the one that was there into its slot.
                        codingSM[slot] = codingSM[activeSM];
                        codingSM[activeSM] = machine;
                    }
                }
                else if (smState == SMModel.ITSME)
                {
                    state = ProbingState.FoundIt;
                    detectedCharset = machine.ModelName;
                    return state;
                }
            }
        }

        return state;
    }

    /// <summary>
    /// Returns the name recorded when a machine reported ITSME.
    /// </summary>
    /// <returns>The detected charset name, or null if none was recorded.</returns>
    public override string GetCharsetName()
    {
        string name = detectedCharset;
        return name;
    }

    /// <summary>
    /// Reports a fixed confidence.
    /// </summary>
    /// <returns>Always 0.99.</returns>
    public override float GetConfidence()
    {
        const float fixedConfidence = 0.99f;
        return fixedConfidence;
    }
}
|
||||
}
|
@ -0,0 +1,304 @@
|
||||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Mozilla Public License Version
|
||||
* 1.1 (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/MPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is mozilla.org code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is
|
||||
* Netscape Communications Corporation.
|
||||
* Portions created by the Initial Developer are Copyright (C) 1998
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
* Kohei TAKETA <k-tak@void.in> (Java port)
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the MPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
/// <summary>
|
||||
/// Escaped charsets state machines
|
||||
/// </summary>
|
||||
namespace UniversalDetector.Core
|
||||
{
|
||||
/// <summary>
/// State machine model registered under the name "HZ-GB-2312". It supplies
/// the base class with a packed byte-class table, a packed state table and
/// a character-length table.
/// </summary>
public class HZSMModel : SMModel
{
    // Byte classes, eight values per packed int. The trailing comment on
    // each row gives the byte range that row covers.
    private static readonly int[] ClassTable =
    {
        BitPackage.Pack4bits(1, 0, 0, 0, 0, 0, 0, 0), // 00 - 07
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 08 - 0f
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 10 - 17
        BitPackage.Pack4bits(0, 0, 0, 1, 0, 0, 0, 0), // 18 - 1f
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 20 - 27
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 28 - 2f
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 30 - 37
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 38 - 3f
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 40 - 47
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 48 - 4f
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 50 - 57
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 58 - 5f
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 60 - 67
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 68 - 6f
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 70 - 77
        BitPackage.Pack4bits(0, 0, 0, 4, 0, 5, 2, 0), // 78 - 7f
        BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // 80 - 87
        BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // 88 - 8f
        BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // 90 - 97
        BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // 98 - 9f
        BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // a0 - a7
        BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // a8 - af
        BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // b0 - b7
        BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // b8 - bf
        BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // c0 - c7
        BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // c8 - cf
        BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // d0 - d7
        BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // d8 - df
        BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // e0 - e7
        BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // e8 - ef
        BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // f0 - f7
        BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1)  // f8 - ff
    };

    // State table, eight entries per packed int. Entries are the START /
    // ERROR / ITSME markers or a numbered intermediate state.
    private static readonly int[] StateTable =
    {
        BitPackage.Pack4bits(START, ERROR,     3, START, START, START, ERROR, ERROR), // 00-07
        BitPackage.Pack4bits(ERROR, ERROR, ERROR, ERROR, ITSME, ITSME, ITSME, ITSME), // 08-0f
        BitPackage.Pack4bits(ITSME, ITSME, ERROR, ERROR, START, START,     4, ERROR), // 10-17
        BitPackage.Pack4bits(    5, ERROR,     6, ERROR,     5,     5,     4, ERROR), // 18-1f
        BitPackage.Pack4bits(    4, ERROR,     4,     4,     4, ERROR,     4, ERROR), // 20-27
        BitPackage.Pack4bits(    4, ITSME, START, START, START, START, START, START)  // 28-2f
    };

    // Character-length table: six entries, all zero.
    private static readonly int[] CharLenTable = new int[] { 0, 0, 0, 0, 0, 0 };

    /// <summary>
    /// Hands the three tables, the value 6 and the model name to the base class.
    /// </summary>
    public HZSMModel()
        : base(Wrap4Bits(ClassTable), 6, Wrap4Bits(StateTable), CharLenTable, "HZ-GB-2312")
    {
    }

    // Wraps a packed table in a BitPackage configured with the 4-bit constants.
    private static BitPackage Wrap4Bits(int[] packed)
    {
        return new BitPackage(
            BitPackage.INDEX_SHIFT_4BITS,
            BitPackage.SHIFT_MASK_4BITS,
            BitPackage.BIT_SHIFT_4BITS,
            BitPackage.UNIT_MASK_4BITS,
            packed);
    }
}
|
||||
|
||||
/// <summary>
/// State machine model registered under the name "ISO-2022-CN". It supplies
/// the base class with a packed byte-class table, a packed state table and
/// a character-length table.
/// </summary>
public class ISO2022CNSMModel : SMModel
{
    // Byte classes, eight values per packed int. The trailing comment on
    // each row gives the byte range that row covers.
    private static readonly int[] ClassTable =
    {
        BitPackage.Pack4bits(2, 0, 0, 0, 0, 0, 0, 0), // 00 - 07
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 08 - 0f
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 10 - 17
        BitPackage.Pack4bits(0, 0, 0, 1, 0, 0, 0, 0), // 18 - 1f
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 20 - 27
        BitPackage.Pack4bits(0, 3, 0, 0, 0, 0, 0, 0), // 28 - 2f
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 30 - 37
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 38 - 3f
        BitPackage.Pack4bits(0, 0, 0, 4, 0, 0, 0, 0), // 40 - 47
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 48 - 4f
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 50 - 57
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 58 - 5f
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 60 - 67
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 68 - 6f
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 70 - 77
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 78 - 7f
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 80 - 87
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 88 - 8f
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 90 - 97
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 98 - 9f
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // a0 - a7
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // a8 - af
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // b0 - b7
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // b8 - bf
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // c0 - c7
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // c8 - cf
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // d0 - d7
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // d8 - df
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // e0 - e7
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // e8 - ef
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // f0 - f7
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2)  // f8 - ff
    };

    // State table, eight entries per packed int. Entries are the START /
    // ERROR / ITSME markers or a numbered intermediate state.
    private static readonly int[] StateTable =
    {
        BitPackage.Pack4bits(START,     3, ERROR, START, START, START, START, START), // 00-07
        BitPackage.Pack4bits(START, ERROR, ERROR, ERROR, ERROR, ERROR, ERROR, ERROR), // 08-0f
        BitPackage.Pack4bits(ERROR, ERROR, ITSME, ITSME, ITSME, ITSME, ITSME, ITSME), // 10-17
        BitPackage.Pack4bits(ITSME, ITSME, ITSME, ERROR, ERROR, ERROR,     4, ERROR), // 18-1f
        BitPackage.Pack4bits(ERROR, ERROR, ERROR, ITSME, ERROR, ERROR, ERROR, ERROR), // 20-27
        BitPackage.Pack4bits(    5,     6, ERROR, ERROR, ERROR, ERROR, ERROR, ERROR), // 28-2f
        BitPackage.Pack4bits(ERROR, ERROR, ERROR, ITSME, ERROR, ERROR, ERROR, ERROR), // 30-37
        BitPackage.Pack4bits(ERROR, ERROR, ERROR, ERROR, ERROR, ITSME, ERROR, START)  // 38-3f
    };

    // Character-length table: nine entries, all zero.
    private static readonly int[] CharLenTable = new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0 };

    /// <summary>
    /// Hands the three tables, the value 9 and the model name to the base class.
    /// </summary>
    public ISO2022CNSMModel()
        : base(Wrap4Bits(ClassTable), 9, Wrap4Bits(StateTable), CharLenTable, "ISO-2022-CN")
    {
    }

    // Wraps a packed table in a BitPackage configured with the 4-bit constants.
    private static BitPackage Wrap4Bits(int[] packed)
    {
        return new BitPackage(
            BitPackage.INDEX_SHIFT_4BITS,
            BitPackage.SHIFT_MASK_4BITS,
            BitPackage.BIT_SHIFT_4BITS,
            BitPackage.UNIT_MASK_4BITS,
            packed);
    }
}
|
||||
|
||||
/// <summary>
/// State machine model registered under the name "ISO-2022-JP". It supplies
/// the base class with a packed byte-class table, a packed state table and
/// a character-length table.
/// </summary>
public class ISO2022JPSMModel : SMModel
{
    // Byte classes, eight values per packed int. The trailing comment on
    // each row gives the byte range that row covers.
    private static readonly int[] ClassTable =
    {
        BitPackage.Pack4bits(2, 0, 0, 0, 0, 0, 0, 0), // 00 - 07
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 2, 2), // 08 - 0f
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 10 - 17
        BitPackage.Pack4bits(0, 0, 0, 1, 0, 0, 0, 0), // 18 - 1f
        BitPackage.Pack4bits(0, 0, 0, 0, 7, 0, 0, 0), // 20 - 27
        BitPackage.Pack4bits(3, 0, 0, 0, 0, 0, 0, 0), // 28 - 2f
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 30 - 37
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 38 - 3f
        BitPackage.Pack4bits(6, 0, 4, 0, 8, 0, 0, 0), // 40 - 47
        BitPackage.Pack4bits(0, 9, 5, 0, 0, 0, 0, 0), // 48 - 4f
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 50 - 57
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 58 - 5f
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 60 - 67
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 68 - 6f
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 70 - 77
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 78 - 7f
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 80 - 87
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 88 - 8f
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 90 - 97
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 98 - 9f
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // a0 - a7
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // a8 - af
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // b0 - b7
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // b8 - bf
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // c0 - c7
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // c8 - cf
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // d0 - d7
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // d8 - df
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // e0 - e7
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // e8 - ef
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // f0 - f7
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2)  // f8 - ff
    };

    // State table, eight entries per packed int. Entries are the START /
    // ERROR / ITSME markers or a numbered intermediate state.
    private static readonly int[] StateTable =
    {
        BitPackage.Pack4bits(START,     3, ERROR, START, START, START, START, START), // 00-07
        BitPackage.Pack4bits(START, START, ERROR, ERROR, ERROR, ERROR, ERROR, ERROR), // 08-0f
        BitPackage.Pack4bits(ERROR, ERROR, ERROR, ERROR, ITSME, ITSME, ITSME, ITSME), // 10-17
        BitPackage.Pack4bits(ITSME, ITSME, ITSME, ITSME, ITSME, ITSME, ERROR, ERROR), // 18-1f
        BitPackage.Pack4bits(ERROR,     5, ERROR, ERROR, ERROR,     4, ERROR, ERROR), // 20-27
        BitPackage.Pack4bits(ERROR, ERROR, ERROR,     6, ITSME, ERROR, ITSME, ERROR), // 28-2f
        BitPackage.Pack4bits(ERROR, ERROR, ERROR, ERROR, ERROR, ERROR, ITSME, ITSME), // 30-37
        BitPackage.Pack4bits(ERROR, ERROR, ERROR, ITSME, ERROR, ERROR, ERROR, ERROR), // 38-3f
        BitPackage.Pack4bits(ERROR, ERROR, ERROR, ERROR, ITSME, ERROR, START, START)  // 40-47
    };

    // Character-length table: ten entries, all zero.
    private static readonly int[] CharLenTable = new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };

    /// <summary>
    /// Hands the three tables, the value 10 and the model name to the base class.
    /// </summary>
    public ISO2022JPSMModel()
        : base(Wrap4Bits(ClassTable), 10, Wrap4Bits(StateTable), CharLenTable, "ISO-2022-JP")
    {
    }

    // Wraps a packed table in a BitPackage configured with the 4-bit constants.
    private static BitPackage Wrap4Bits(int[] packed)
    {
        return new BitPackage(
            BitPackage.INDEX_SHIFT_4BITS,
            BitPackage.SHIFT_MASK_4BITS,
            BitPackage.BIT_SHIFT_4BITS,
            BitPackage.UNIT_MASK_4BITS,
            packed);
    }
}
|
||||
|
||||
/// <summary>
/// State machine model registered under the name "ISO-2022-KR". It supplies
/// the base class with a packed byte-class table, a packed state table and
/// a character-length table.
/// </summary>
public class ISO2022KRSMModel : SMModel
{
    // Byte classes, eight values per packed int. The trailing comment on
    // each row gives the byte range that row covers.
    private static readonly int[] ClassTable =
    {
        BitPackage.Pack4bits(2, 0, 0, 0, 0, 0, 0, 0), // 00 - 07
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 08 - 0f
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 10 - 17
        BitPackage.Pack4bits(0, 0, 0, 1, 0, 0, 0, 0), // 18 - 1f
        BitPackage.Pack4bits(0, 0, 0, 0, 3, 0, 0, 0), // 20 - 27
        BitPackage.Pack4bits(0, 4, 0, 0, 0, 0, 0, 0), // 28 - 2f
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 30 - 37
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 38 - 3f
        BitPackage.Pack4bits(0, 0, 0, 5, 0, 0, 0, 0), // 40 - 47
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 48 - 4f
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 50 - 57
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 58 - 5f
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 60 - 67
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 68 - 6f
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 70 - 77
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 78 - 7f
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 80 - 87
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 88 - 8f
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 90 - 97
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 98 - 9f
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // a0 - a7
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // a8 - af
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // b0 - b7
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // b8 - bf
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // c0 - c7
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // c8 - cf
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // d0 - d7
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // d8 - df
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // e0 - e7
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // e8 - ef
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // f0 - f7
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2)  // f8 - ff
    };

    // State table, eight entries per packed int. Entries are the START /
    // ERROR / ITSME markers or a numbered intermediate state.
    private static readonly int[] StateTable =
    {
        BitPackage.Pack4bits(START,     3, ERROR, START, START, START, ERROR, ERROR), // 00-07
        BitPackage.Pack4bits(ERROR, ERROR, ERROR, ERROR, ITSME, ITSME, ITSME, ITSME), // 08-0f
        BitPackage.Pack4bits(ITSME, ITSME, ERROR, ERROR, ERROR,     4, ERROR, ERROR), // 10-17
        BitPackage.Pack4bits(ERROR, ERROR, ERROR, ERROR,     5, ERROR, ERROR, ERROR), // 18-1f
        BitPackage.Pack4bits(ERROR, ERROR, ERROR, ITSME, START, START, START, START)  // 20-27
    };

    // Character-length table: six entries, all zero.
    private static readonly int[] CharLenTable = new int[] { 0, 0, 0, 0, 0, 0 };

    /// <summary>
    /// Hands the three tables, the value 6 and the model name to the base class.
    /// </summary>
    public ISO2022KRSMModel()
        : base(Wrap4Bits(ClassTable), 6, Wrap4Bits(StateTable), CharLenTable, "ISO-2022-KR")
    {
    }

    // Wraps a packed table in a BitPackage configured with the 4-bit constants.
    private static BitPackage Wrap4Bits(int[] packed)
    {
        return new BitPackage(
            BitPackage.INDEX_SHIFT_4BITS,
            BitPackage.SHIFT_MASK_4BITS,
            BitPackage.BIT_SHIFT_4BITS,
            BitPackage.UNIT_MASK_4BITS,
            packed);
    }
}
|
||||
}
|
@ -0,0 +1,111 @@
|
||||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Mozilla Public License Version
|
||||
* 1.1 (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/MPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is Mozilla Universal charset detector code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is
|
||||
* Netscape Communications Corporation.
|
||||
* Portions created by the Initial Developer are Copyright (C) 2001
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the MPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
namespace UniversalDetector.Core
|
||||
{
|
||||
// We use gb18030 to replace gb2312, because 18030 is a superset.
|
||||
/// <summary>
/// Prober that reports the charset name "gb18030". Bytes go through a coding
/// state machine built from <c>GB18030SMModel</c>; every completed character
/// is handed to a <c>GB18030DistributionAnalyser</c>, whose confidence is
/// the prober's confidence.
/// </summary>
public class GB18030Prober : CharsetProber
{
    private CodingStateMachine codingSM;
    private GB18030DistributionAnalyser analyser;

    // [0] holds the last byte of the previous HandleData call,
    // [1] receives the first byte of the current one.
    private byte[] lastChar;

    /// <summary>
    /// Allocates the two-byte carry buffer, creates the state machine and
    /// the analyser, then calls <see cref="Reset"/>.
    /// </summary>
    public GB18030Prober()
    {
        this.lastChar = new byte[2];
        this.codingSM = new CodingStateMachine(new GB18030SMModel());
        this.analyser = new GB18030DistributionAnalyser();
        this.Reset();
    }

    /// <summary>
    /// Returns the name of the charset this prober recognises.
    /// </summary>
    /// <returns>Always the literal "gb18030".</returns>
    public override string GetCharsetName()
    {
        const string charsetName = "gb18030";
        return charsetName;
    }

    /// <summary>
    /// Feeds <paramref name="len"/> bytes of <paramref name="buf"/>, starting
    /// at <paramref name="offset"/>, through the coding state machine and
    /// passes every completed character to the distribution analyser.
    /// </summary>
    /// <param name="buf">Buffer holding the bytes to examine.</param>
    /// <param name="offset">Index of the first byte to examine.</param>
    /// <param name="len">Number of bytes to examine.</param>
    /// <returns>The prober state after the bytes have been processed.</returns>
    public override ProbingState HandleData(byte[] buf, int offset, int len)
    {
        int end = offset + len;

        for (int pos = offset; pos < end; pos++)
        {
            int smState = codingSM.NextState(buf[pos]);

            if (smState == SMModel.ERROR)
            {
                state = ProbingState.NotMe;
                break;
            }

            if (smState == SMModel.ITSME)
            {
                state = ProbingState.FoundIt;
                break;
            }

            if (smState != SMModel.START)
            {
                continue;
            }

            // Back at START: a character has just been completed.
            int charLen = codingSM.CurrentCharLen;
            if (pos != offset)
            {
                analyser.HandleOneChar(buf, pos - 1, charLen);
            }
            else
            {
                // First byte of this call: pair it with the byte saved
                // at the end of the previous call.
                lastChar[1] = buf[offset];
                analyser.HandleOneChar(lastChar, 0, charLen);
            }
        }

        // Keep the final byte of the window for the next call.
        lastChar[0] = buf[end - 1];

        // Shortcut: enough data and a confidence above SHORTCUT_THRESHOLD.
        if (state == ProbingState.Detecting
            && analyser.GotEnoughData()
            && GetConfidence() > SHORTCUT_THRESHOLD)
        {
            state = ProbingState.FoundIt;
        }

        return state;
    }

    /// <summary>
    /// Reports the distribution analyser's confidence.
    /// </summary>
    /// <returns>The value returned by the distribution analyser.</returns>
    public override float GetConfidence()
    {
        float confidence = analyser.GetConfidence();
        return confidence;
    }

    /// <summary>
    /// Puts the prober back into the detecting state and resets the state
    /// machine and the analyser.
    /// </summary>
    public override void Reset()
    {
        state = ProbingState.Detecting;

        codingSM.Reset();
        analyser.Reset();
    }
}
|
||||
}
|
@ -0,0 +1,324 @@
|
||||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Mozilla Public License Version
|
||||
* 1.1 (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/MPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is Mozilla Universal charset detector code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is
|
||||
* Netscape Communications Corporation.
|
||||
* Portions created by the Initial Developer are Copyright (C) 2001
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the MPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
using System;
|
||||
|
||||
/**
|
||||
* General ideas of the Hebrew charset recognition
|
||||
*
|
||||
* Four main charsets exist in Hebrew:
|
||||
* "ISO-8859-8" - Visual Hebrew
|
||||
* "windows-1255" - Logical Hebrew
|
||||
* "ISO-8859-8-I" - Logical Hebrew
|
||||
* "x-mac-hebrew" - ?? Logical Hebrew ??
|
||||
*
|
||||
* Both "ISO" charsets use a completely identical set of code points, whereas
|
||||
* "windows-1255" and "x-mac-hebrew" are two different proper supersets of
|
||||
* these code points. windows-1255 defines additional characters in the range
|
||||
* 0x80-0x9F as some misc punctuation marks as well as some Hebrew-specific
|
||||
* diacritics and additional 'Yiddish' ligature letters in the range 0xc0-0xd6.
|
||||
* x-mac-hebrew defines similar additional code points but with a different
|
||||
* mapping.
|
||||
*
|
||||
* As far as an average Hebrew text with no diacritics is concerned, all four
|
||||
* charsets are identical with respect to code points. Meaning that for the
|
||||
* main Hebrew alphabet, all four map the same values to all 27 Hebrew letters
|
||||
* (including final letters).
|
||||
*
|
||||
* The dominant difference between these charsets is their directionality.
|
||||
* "Visual" directionality means that the text is ordered as if the renderer is
|
||||
* not aware of a BIDI rendering algorithm. The renderer sees the text and
|
||||
* draws it from left to right. The text itself when ordered naturally is read
|
||||
* backwards. A buffer of Visual Hebrew generally looks like so:
|
||||
* "[last word of first line spelled backwards] [whole line ordered backwards
|
||||
* and spelled backwards] [first word of first line spelled backwards]
|
||||
* [end of line] [last word of second line] ... etc' "
|
||||
* adding punctuation marks, numbers and English text to visual text is
|
||||
* naturally also "visual" and from left to right.
|
||||
*
|
||||
* "Logical" directionality means the text is ordered "naturally" according to
|
||||
* the order it is read. It is the responsibility of the renderer to display
|
||||
* the text from right to left. A BIDI algorithm is used to place general
|
||||
* punctuation marks, numbers and English text in the text.
|
||||
*
|
||||
* Texts in x-mac-hebrew are almost impossible to find on the Internet. From
|
||||
* what little evidence I could find, it seems that its general directionality
|
||||
* is Logical.
|
||||
*
|
||||
* To sum up all of the above, the Hebrew probing mechanism knows about two
|
||||
* charsets:
|
||||
* Visual Hebrew - "ISO-8859-8" - backwards text - Words and sentences are
|
||||
* backwards while line order is natural. For charset recognition purposes
|
||||
* the line order is unimportant (In fact, for this implementation, even
|
||||
* word order is unimportant).
|
||||
* Logical Hebrew - "windows-1255" - normal, naturally ordered text.
|
||||
*
|
||||
* "ISO-8859-8-I" is a subset of windows-1255 and doesn't need to be
|
||||
* specifically identified.
|
||||
* "x-mac-hebrew" is also identified as windows-1255. A text in x-mac-hebrew
|
||||
* that contains special punctuation marks or diacritics is displayed with
|
||||
* some unconverted characters showing as question marks. This problem might
|
||||
* be corrected using another model prober for x-mac-hebrew. Due to the fact
|
||||
* that x-mac-hebrew texts are so rare, writing another model prober isn't
|
||||
* worth the effort and performance hit.
|
||||
*
|
||||
* *** The Prober ***
|
||||
*
|
||||
* The prober is divided between two nsSBCharSetProbers and an nsHebrewProber,
|
||||
* all of which are managed, created, fed data, inquired and deleted by the
|
||||
* nsSBCSGroupProber. The two nsSBCharSetProbers identify that the text is in
|
||||
* fact some kind of Hebrew, Logical or Visual. The final decision about which
|
||||
* one it is is made by the nsHebrewProber by combining final-letter scores
|
||||
* with the scores of the two nsSBCharSetProbers to produce a final answer.
|
||||
*
|
||||
* The nsSBCSGroupProber is responsible for stripping the original text of HTML
|
||||
* tags, English characters, numbers, low-ASCII punctuation characters, spaces
|
||||
* and new lines. It reduces any sequence of such characters to a single space.
|
||||
* The buffer fed to each prober in the SBCS group prober is pure text in
|
||||
* high-ASCII.
|
||||
* The two nsSBCharSetProbers (model probers) share the same language model:
|
||||
* Win1255Model.
|
||||
* The first nsSBCharSetProber uses the model normally as any other
|
||||
* nsSBCharSetProber does, to recognize windows-1255, upon which this model was
|
||||
* built. The second nsSBCharSetProber is told to make the pair-of-letter
|
||||
* lookup in the language model backwards. This in practice exactly simulates
|
||||
* a visual Hebrew model using the windows-1255 logical Hebrew model.
|
||||
*
|
||||
* The nsHebrewProber is not using any language model. All it does is look for
|
||||
* final-letter evidence suggesting the text is either logical Hebrew or visual
|
||||
* Hebrew. Disjointed from the model probers, the results of the nsHebrewProber
|
||||
* alone are meaningless. nsHebrewProber always returns 0.00 as confidence
|
||||
* since it never identifies a charset by itself. Instead, the pointer to the
|
||||
* nsHebrewProber is passed to the model probers as a helper "Name Prober".
|
||||
* When the Group prober receives a positive identification from any prober,
|
||||
* it asks for the name of the charset identified. If the prober queried is a
|
||||
* Hebrew model prober, the model prober forwards the call to the
|
||||
* nsHebrewProber to make the final decision. In the nsHebrewProber, the
|
||||
* decision is made according to the final-letter scores maintained and both
|
||||
* model probers scores. The answer is returned in the form of the name of the
|
||||
* charset identified, either "windows-1255" or "ISO-8859-8".
|
||||
*
|
||||
*/
|
||||
namespace UniversalDetector.Core
|
||||
{
|
||||
|
||||
/// <summary>
/// Helper prober used by the Hebrew model probers. It never identifies a
/// charset by itself (its confidence is always 0); it only collects
/// final-letter evidence and, when asked for a charset name, combines that
/// evidence with the confidences of the logical and visual model probers
/// to choose between "windows-1255" and "ISO-8859-8".
/// </summary>
public class HebrewProber : CharsetProber
{
    // windows-1255 / ISO-8859-8 code points of the letters that have a
    // distinct word-final form, together with their regular forms.
    private const byte FINAL_KAF = 0xEA;
    private const byte NORMAL_KAF = 0xEB;
    private const byte FINAL_MEM = 0xED;
    private const byte NORMAL_MEM = 0xEE;
    private const byte FINAL_NUN = 0xEF;
    private const byte NORMAL_NUN = 0xF0;
    private const byte FINAL_PE = 0xF3;
    private const byte NORMAL_PE = 0xF4;
    private const byte FINAL_TSADI = 0xF5;
    private const byte NORMAL_TSADI = 0xF6;

    // Word separator in the filtered input.
    private const byte SPACE = 0x20;

    // Smallest logical-vs-visual final-letter score gap that is trusted on
    // its own; below it the model confidences are consulted as well.
    private const int MIN_FINAL_CHAR_DISTANCE = 5;

    // Smallest logical-vs-visual model confidence gap that is trusted;
    // below it the model confidences are ignored altogether.
    private const float MIN_MODEL_DISTANCE = 0.01f;

    protected const string VISUAL_HEBREW_NAME = "ISO-8859-8";
    protected const string LOGICAL_HEBREW_NAME = "windows-1255";

    // Model probers supplied through SetModelProbers; owned by the group prober.
    protected CharsetProber logicalProber, visualProber;

    // Running evidence counters for each directionality.
    protected int finalCharLogicalScore, finalCharVisualScore;

    // The last two bytes seen so far; they carry over between buffers so
    // that words spanning two HandleData calls are still analysed.
    protected byte prev, beforePrev;

    /// <summary>
    /// Creates the prober with cleared scores. The model probers still have
    /// to be supplied through <see cref="SetModelProbers"/>.
    /// </summary>
    public HebrewProber()
    {
        Reset();
    }

    /// <summary>
    /// Registers the two model probers whose state and confidence this
    /// helper consults.
    /// </summary>
    /// <param name="logical">prober for logical Hebrew</param>
    /// <param name="visual">prober for visual Hebrew</param>
    public void SetModelProbers(CharsetProber logical, CharsetProber visual)
    {
        logicalProber = logical;
        visualProber = visual;
    }

    /// <summary>
    /// Final-letter analysis for the logical/visual decision. Three patterns
    /// are counted while scanning the buffer:
    /// 1) a word longer than one letter that ends with a final letter:
    ///    the text is laid out naturally, +1 logical;
    /// 2) a word longer than one letter that ends with a non-final form of
    ///    Kaf, Mem, Nun or Pe (see the exceptions noted in IsNonFinal):
    ///    the text is laid out backwards, +1 visual;
    /// 3) a word longer than one letter that starts with a final letter:
    ///    the text is laid out backwards, +1 visual.
    /// Final letters in the middle of a word are ignored because they point
    /// to neither layout. The accumulated scores are compared in
    /// GetCharsetName().
    /// The input is expected to contain no white space other than ' ' and no
    /// low-ASCII punctuation.
    /// </summary>
    /// <param name="buf">input bytes</param>
    /// <param name="offset">index of the first byte to examine</param>
    /// <param name="len">number of bytes to examine</param>
    /// <returns>
    /// NotMe once both model probers have given up, otherwise Detecting.
    /// </returns>
    public override ProbingState HandleData(byte[] buf, int offset, int len)
    {
        // Both model probers say it's not them: nothing left to decide.
        if (GetState() == ProbingState.NotMe)
        {
            return ProbingState.NotMe;
        }

        int end = offset + len;
        for (int pos = offset; pos < end; pos++)
        {
            byte current = buf[pos];

            // True when 'prev' is the first letter of a word.
            bool prevOpensWord = (beforePrev == SPACE);

            if (current == SPACE)
            {
                // A word just ended. Only words longer than one letter count.
                if (!prevOpensWord)
                {
                    if (IsFinal(prev))
                    {
                        // case (1) [-2:not space][-1:final letter][cur:space]
                        finalCharLogicalScore++;
                    }
                    else if (IsNonFinal(prev))
                    {
                        // case (2) [-2:not space][-1:non-final letter][cur:space]
                        finalCharVisualScore++;
                    }
                }
            }
            else if (prevOpensWord && IsFinal(prev))
            {
                // case (3) [-2:space][-1:final letter][cur:not space]
                finalCharVisualScore++;
            }

            beforePrev = prev;
            prev = current;
        }

        // Keep detecting until the end of the data, or until both model
        // probers report NotMe (handled at the top).
        return ProbingState.Detecting;
    }

    /// <summary>
    /// Makes the decision: is the text logical or visual Hebrew?
    /// </summary>
    /// <returns>"windows-1255" for logical, "ISO-8859-8" for visual</returns>
    public override string GetCharsetName()
    {
        // A dominant final-letter gap settles the matter directly.
        int finalDelta = finalCharLogicalScore - finalCharVisualScore;
        if (finalDelta >= MIN_FINAL_CHAR_DISTANCE)
        {
            return LOGICAL_HEBREW_NAME;
        }
        if (finalDelta <= -MIN_FINAL_CHAR_DISTANCE)
        {
            return VISUAL_HEBREW_NAME;
        }

        // Not dominant enough: fall back on the model confidences.
        float modelDelta = logicalProber.GetConfidence() - visualProber.GetConfidence();
        if (modelDelta > MIN_MODEL_DISTANCE)
        {
            return LOGICAL_HEBREW_NAME;
        }
        if (modelDelta < -MIN_MODEL_DISTANCE)
        {
            return VISUAL_HEBREW_NAME;
        }

        // Still undecided: any visual lead in the final-letter score wins,
        // everything else (logical lead or a tie) defaults to logical.
        return (finalDelta < 0) ? VISUAL_HEBREW_NAME : LOGICAL_HEBREW_NAME;
    }

    /// <summary>
    /// Clears both final-letter scores and treats the bytes before the
    /// start of the data as spaces.
    /// </summary>
    public override void Reset()
    {
        finalCharLogicalScore = 0;
        finalCharVisualScore = 0;
        beforePrev = SPACE;
        prev = SPACE;
    }

    /// <summary>
    /// Reports Detecting for as long as at least one of the model probers
    /// has not answered NotMe.
    /// </summary>
    public override ProbingState GetState()
    {
        bool anyModelActive =
            logicalProber.GetState() != ProbingState.NotMe ||
            visualProber.GetState() != ProbingState.NotMe;

        return anyModelActive ? ProbingState.Detecting : ProbingState.NotMe;
    }

    /// <summary>
    /// Diagnostic hook; output is currently disabled.
    /// </summary>
    public override void DumpStatus()
    {
        //Console.WriteLine("  HEB: {0} - {1} [Logical-Visual score]", finalCharLogicalScore, finalCharVisualScore);
    }

    /// <summary>
    /// Always 0: this prober never identifies a charset by itself.
    /// </summary>
    public override float GetConfidence()
    {
        return 0.0f;
    }

    /// <summary>
    /// Tells whether <paramref name="b"/> is one of the five final-form
    /// letters (Kaf, Mem, Nun, Pe, Tsadi).
    /// </summary>
    protected static bool IsFinal(byte b)
    {
        switch (b)
        {
            case FINAL_KAF:
            case FINAL_MEM:
            case FINAL_NUN:
            case FINAL_PE:
            case FINAL_TSADI:
                return true;
            default:
                return false;
        }
    }

    /// <summary>
    /// Tells whether <paramref name="b"/> is the regular form of Kaf, Mem,
    /// Nun or Pe, i.e. a letter that should not end a word in normal Hebrew.
    /// </summary>
    protected static bool IsNonFinal(byte b)
    {
        // The regular Tsadi is deliberately left out: words like 'lechotet'
        // (to chat) carry an apostrophe after the tsadi, and that apostrophe
        // is converted to a space in FilterWithoutEnglishLetters, which makes
        // the non-final tsadi look like a word ending even though it is not
        // one in the original text.
        // Pe and Kaf occasionally misbehave in a similar way ('Pop', 'Winamp'
        // and 'Mubarak' legally end with a non-final Pe or Kaf), but such
        // words are rare, so the benefit of counting them outweighs the damage.
        switch (b)
        {
            case NORMAL_KAF:
            case NORMAL_MEM:
            case NORMAL_NUN:
            case NORMAL_PE:
                return true;
            default:
                return false;
        }
    }
}
|
||||
}
|
@ -0,0 +1,315 @@
|
||||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Mozilla Public License Version
|
||||
* 1.1 (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/MPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is Mozilla Universal charset detector code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is
|
||||
* Netscape Communications Corporation.
|
||||
* Portions created by the Initial Developer are Copyright (C) 2001
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the MPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
namespace UniversalDetector.Core
|
||||
{
|
||||
public abstract class JapaneseContextAnalyser
|
||||
{
|
||||
protected const int CATEGORIES_NUM = 6;
|
||||
protected const int ENOUGH_REL_THRESHOLD = 100;
|
||||
protected const int MAX_REL_THRESHOLD = 1000;
|
||||
protected const int MINIMUM_DATA_THRESHOLD = 4;
|
||||
protected const float DONT_KNOW = -1.0f;
|
||||
|
||||
// hiragana frequency category table
|
||||
// This is hiragana 2-char sequence table, the number in each cell represents its frequency category
|
||||
protected static byte[,] jp2CharContext = {
|
||||
{ 0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,},
|
||||
{ 2,4,0,4,0,3,0,4,0,3,4,4,4,2,4,3,3,4,3,2,3,3,4,2,3,3,3,2,4,1,4,3,3,1,5,4,3,4,3,4,3,5,3,0,3,5,4,2,0,3,1,0,3,3,0,3,3,0,1,1,0,4,3,0,3,3,0,4,0,2,0,3,5,5,5,5,4,0,4,1,0,3,4,},
|
||||
{ 0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,},
|
||||
{ 0,4,0,5,0,5,0,4,0,4,5,4,4,3,5,3,5,1,5,3,4,3,4,4,3,4,3,3,4,3,5,4,4,3,5,5,3,5,5,5,3,5,5,3,4,5,5,3,1,3,2,0,3,4,0,4,2,0,4,2,1,5,3,2,3,5,0,4,0,2,0,5,4,4,5,4,5,0,4,0,0,4,4,},
|
||||
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,},
|
||||
{ 0,3,0,4,0,3,0,3,0,4,5,4,3,3,3,3,4,3,5,4,4,3,5,4,4,3,4,3,4,4,4,4,5,3,4,4,3,4,5,5,4,5,5,1,4,5,4,3,0,3,3,1,3,3,0,4,4,0,3,3,1,5,3,3,3,5,0,4,0,3,0,4,4,3,4,3,3,0,4,1,1,3,4,},
|
||||
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,},
|
||||
{ 0,4,0,3,0,3,0,4,0,3,4,4,3,2,2,1,2,1,3,1,3,3,3,3,3,4,3,1,3,3,5,3,3,0,4,3,0,5,4,3,3,5,4,4,3,4,4,5,0,1,2,0,1,2,0,2,2,0,1,0,0,5,2,2,1,4,0,3,0,1,0,4,4,3,5,4,3,0,2,1,0,4,3,},
|
||||
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,},
|
||||
{ 0,3,0,5,0,4,0,2,1,4,4,2,4,1,4,2,4,2,4,3,3,3,4,3,3,3,3,1,4,2,3,3,3,1,4,4,1,1,1,4,3,3,2,0,2,4,3,2,0,3,3,0,3,1,1,0,0,0,3,3,0,4,2,2,3,4,0,4,0,3,0,4,4,5,3,4,4,0,3,0,0,1,4,},
|
||||
{ 1,4,0,4,0,4,0,4,0,3,5,4,4,3,4,3,5,4,3,3,4,3,5,4,4,4,4,3,4,2,4,3,3,1,5,4,3,2,4,5,4,5,5,4,4,5,4,4,0,3,2,2,3,3,0,4,3,1,3,2,1,4,3,3,4,5,0,3,0,2,0,4,5,5,4,5,4,0,4,0,0,5,4,},
|
||||
{ 0,5,0,5,0,4,0,3,0,4,4,3,4,3,3,3,4,0,4,4,4,3,4,3,4,3,3,1,4,2,4,3,4,0,5,4,1,4,5,4,4,5,3,2,4,3,4,3,2,4,1,3,3,3,2,3,2,0,4,3,3,4,3,3,3,4,0,4,0,3,0,4,5,4,4,4,3,0,4,1,0,1,3,},
|
||||
{ 0,3,1,4,0,3,0,2,0,3,4,4,3,1,4,2,3,3,4,3,4,3,4,3,4,4,3,2,3,1,5,4,4,1,4,4,3,5,4,4,3,5,5,4,3,4,4,3,1,2,3,1,2,2,0,3,2,0,3,1,0,5,3,3,3,4,3,3,3,3,4,4,4,4,5,4,2,0,3,3,2,4,3,},
|
||||
{ 0,2,0,3,0,1,0,1,0,0,3,2,0,0,2,0,1,0,2,1,3,3,3,1,2,3,1,0,1,0,4,2,1,1,3,3,0,4,3,3,1,4,3,3,0,3,3,2,0,0,0,0,1,0,0,2,0,0,0,0,0,4,1,0,2,3,2,2,2,1,3,3,3,4,4,3,2,0,3,1,0,3,3,},
|
||||
{ 0,4,0,4,0,3,0,3,0,4,4,4,3,3,3,3,3,3,4,3,4,2,4,3,4,3,3,2,4,3,4,5,4,1,4,5,3,5,4,5,3,5,4,0,3,5,5,3,1,3,3,2,2,3,0,3,4,1,3,3,2,4,3,3,3,4,0,4,0,3,0,4,5,4,4,5,3,0,4,1,0,3,4,},
|
||||
{ 0,2,0,3,0,3,0,0,0,2,2,2,1,0,1,0,0,0,3,0,3,0,3,0,1,3,1,0,3,1,3,3,3,1,3,3,3,0,1,3,1,3,4,0,0,3,1,1,0,3,2,0,0,0,0,1,3,0,1,0,0,3,3,2,0,3,0,0,0,0,0,3,4,3,4,3,3,0,3,0,0,2,3,},
|
||||
{ 2,3,0,3,0,2,0,1,0,3,3,4,3,1,3,1,1,1,3,1,4,3,4,3,3,3,0,0,3,1,5,4,3,1,4,3,2,5,5,4,4,4,4,3,3,4,4,4,0,2,1,1,3,2,0,1,2,0,0,1,0,4,1,3,3,3,0,3,0,1,0,4,4,4,5,5,3,0,2,0,0,4,4,},
|
||||
{ 0,2,0,1,0,3,1,3,0,2,3,3,3,0,3,1,0,0,3,0,3,2,3,1,3,2,1,1,0,0,4,2,1,0,2,3,1,4,3,2,0,4,4,3,1,3,1,3,0,1,0,0,1,0,0,0,1,0,0,0,0,4,1,1,1,2,0,3,0,0,0,3,4,2,4,3,2,0,1,0,0,3,3,},
|
||||
{ 0,1,0,4,0,5,0,4,0,2,4,4,2,3,3,2,3,3,5,3,3,3,4,3,4,2,3,0,4,3,3,3,4,1,4,3,2,1,5,5,3,4,5,1,3,5,4,2,0,3,3,0,1,3,0,4,2,0,1,3,1,4,3,3,3,3,0,3,0,1,0,3,4,4,4,5,5,0,3,0,1,4,5,},
|
||||
{ 0,2,0,3,0,3,0,0,0,2,3,1,3,0,4,0,1,1,3,0,3,4,3,2,3,1,0,3,3,2,3,1,3,0,2,3,0,2,1,4,1,2,2,0,0,3,3,0,0,2,0,0,0,1,0,0,0,0,2,2,0,3,2,1,3,3,0,2,0,2,0,0,3,3,1,2,4,0,3,0,2,2,3,},
|
||||
{ 2,4,0,5,0,4,0,4,0,2,4,4,4,3,4,3,3,3,1,2,4,3,4,3,4,4,5,0,3,3,3,3,2,0,4,3,1,4,3,4,1,4,4,3,3,4,4,3,1,2,3,0,4,2,0,4,1,0,3,3,0,4,3,3,3,4,0,4,0,2,0,3,5,3,4,5,2,0,3,0,0,4,5,},
|
||||
{ 0,3,0,4,0,1,0,1,0,1,3,2,2,1,3,0,3,0,2,0,2,0,3,0,2,0,0,0,1,0,1,1,0,0,3,1,0,0,0,4,0,3,1,0,2,1,3,0,0,0,0,0,0,3,0,0,0,0,0,0,0,4,2,2,3,1,0,3,0,0,0,1,4,4,4,3,0,0,4,0,0,1,4,},
|
||||
{ 1,4,1,5,0,3,0,3,0,4,5,4,4,3,5,3,3,4,4,3,4,1,3,3,3,3,2,1,4,1,5,4,3,1,4,4,3,5,4,4,3,5,4,3,3,4,4,4,0,3,3,1,2,3,0,3,1,0,3,3,0,5,4,4,4,4,4,4,3,3,5,4,4,3,3,5,4,0,3,2,0,4,4,},
|
||||
{ 0,2,0,3,0,1,0,0,0,1,3,3,3,2,4,1,3,0,3,1,3,0,2,2,1,1,0,0,2,0,4,3,1,0,4,3,0,4,4,4,1,4,3,1,1,3,3,1,0,2,0,0,1,3,0,0,0,0,2,0,0,4,3,2,4,3,5,4,3,3,3,4,3,3,4,3,3,0,2,1,0,3,3,},
|
||||
{ 0,2,0,4,0,3,0,2,0,2,5,5,3,4,4,4,4,1,4,3,3,0,4,3,4,3,1,3,3,2,4,3,0,3,4,3,0,3,4,4,2,4,4,0,4,5,3,3,2,2,1,1,1,2,0,1,5,0,3,3,2,4,3,3,3,4,0,3,0,2,0,4,4,3,5,5,0,0,3,0,2,3,3,},
|
||||
{ 0,3,0,4,0,3,0,1,0,3,4,3,3,1,3,3,3,0,3,1,3,0,4,3,3,1,1,0,3,0,3,3,0,0,4,4,0,1,5,4,3,3,5,0,3,3,4,3,0,2,0,1,1,1,0,1,3,0,1,2,1,3,3,2,3,3,0,3,0,1,0,1,3,3,4,4,1,0,1,2,2,1,3,},
|
||||
{ 0,1,0,4,0,4,0,3,0,1,3,3,3,2,3,1,1,0,3,0,3,3,4,3,2,4,2,0,1,0,4,3,2,0,4,3,0,5,3,3,2,4,4,4,3,3,3,4,0,1,3,0,0,1,0,0,1,0,0,0,0,4,2,3,3,3,0,3,0,0,0,4,4,4,5,3,2,0,3,3,0,3,5,},
|
||||
{ 0,2,0,3,0,0,0,3,0,1,3,0,2,0,0,0,1,0,3,1,1,3,3,0,0,3,0,0,3,0,2,3,1,0,3,1,0,3,3,2,0,4,2,2,0,2,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,2,1,2,0,1,0,1,0,0,0,1,3,1,2,0,0,0,1,0,0,1,4,},
|
||||
{ 0,3,0,3,0,5,0,1,0,2,4,3,1,3,3,2,1,1,5,2,1,0,5,1,2,0,0,0,3,3,2,2,3,2,4,3,0,0,3,3,1,3,3,0,2,5,3,4,0,3,3,0,1,2,0,2,2,0,3,2,0,2,2,3,3,3,0,2,0,1,0,3,4,4,2,5,4,0,3,0,0,3,5,},
|
||||
{ 0,3,0,3,0,3,0,1,0,3,3,3,3,0,3,0,2,0,2,1,1,0,2,0,1,0,0,0,2,1,0,0,1,0,3,2,0,0,3,3,1,2,3,1,0,3,3,0,0,1,0,0,0,0,0,2,0,0,0,0,0,2,3,1,2,3,0,3,0,1,0,3,2,1,0,4,3,0,1,1,0,3,3,},
|
||||
{ 0,4,0,5,0,3,0,3,0,4,5,5,4,3,5,3,4,3,5,3,3,2,5,3,4,4,4,3,4,3,4,5,5,3,4,4,3,4,4,5,4,4,4,3,4,5,5,4,2,3,4,2,3,4,0,3,3,1,4,3,2,4,3,3,5,5,0,3,0,3,0,5,5,5,5,4,4,0,4,0,1,4,4,},
|
||||
{ 0,4,0,4,0,3,0,3,0,3,5,4,4,2,3,2,5,1,3,2,5,1,4,2,3,2,3,3,4,3,3,3,3,2,5,4,1,3,3,5,3,4,4,0,4,4,3,1,1,3,1,0,2,3,0,2,3,0,3,0,0,4,3,1,3,4,0,3,0,2,0,4,4,4,3,4,5,0,4,0,0,3,4,},
|
||||
{ 0,3,0,3,0,3,1,2,0,3,4,4,3,3,3,0,2,2,4,3,3,1,3,3,3,1,1,0,3,1,4,3,2,3,4,4,2,4,4,4,3,4,4,3,2,4,4,3,1,3,3,1,3,3,0,4,1,0,2,2,1,4,3,2,3,3,5,4,3,3,5,4,4,3,3,0,4,0,3,2,2,4,4,},
|
||||
{ 0,2,0,1,0,0,0,0,0,1,2,1,3,0,0,0,0,0,2,0,1,2,1,0,0,1,0,0,0,0,3,0,0,1,0,1,1,3,1,0,0,0,1,1,0,1,1,0,0,0,0,0,2,0,0,0,0,0,0,0,0,1,1,2,2,0,3,4,0,0,0,1,1,0,0,1,0,0,0,0,0,1,1,},
|
||||
{ 0,1,0,0,0,1,0,0,0,0,4,0,4,1,4,0,3,0,4,0,3,0,4,0,3,0,3,0,4,1,5,1,4,0,0,3,0,5,0,5,2,0,1,0,0,0,2,1,4,0,1,3,0,0,3,0,0,3,1,1,4,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,},
|
||||
{ 1,4,0,5,0,3,0,2,0,3,5,4,4,3,4,3,5,3,4,3,3,0,4,3,3,3,3,3,3,2,4,4,3,1,3,4,4,5,4,4,3,4,4,1,3,5,4,3,3,3,1,2,2,3,3,1,3,1,3,3,3,5,3,3,4,5,0,3,0,3,0,3,4,3,4,4,3,0,3,0,2,4,3,},
|
||||
{ 0,1,0,4,0,0,0,0,0,1,4,0,4,1,4,2,4,0,3,0,1,0,1,0,0,0,0,0,2,0,3,1,1,1,0,3,0,0,0,1,2,1,0,0,1,1,1,1,0,1,0,0,0,1,0,0,3,0,0,0,0,3,2,0,2,2,0,1,0,0,0,2,3,2,3,3,0,0,0,0,2,1,0,},
|
||||
{ 0,5,1,5,0,3,0,3,0,5,4,4,5,1,5,3,3,0,4,3,4,3,5,3,4,3,3,2,4,3,4,3,3,0,3,3,1,4,4,3,4,4,4,3,4,5,5,3,2,3,1,1,3,3,1,3,1,1,3,3,2,4,5,3,3,5,0,4,0,3,0,4,4,3,5,3,3,0,3,4,0,4,3,},
|
||||
{ 0,5,0,5,0,3,0,2,0,4,4,3,5,2,4,3,3,3,4,4,4,3,5,3,5,3,3,1,4,0,4,3,3,0,3,3,0,4,4,4,4,5,4,3,3,5,5,3,2,3,1,2,3,2,0,1,0,0,3,2,2,4,4,3,1,5,0,4,0,3,0,4,3,1,3,2,1,0,3,3,0,3,3,},
|
||||
{ 0,4,0,5,0,5,0,4,0,4,5,5,5,3,4,3,3,2,5,4,4,3,5,3,5,3,4,0,4,3,4,4,3,2,4,4,3,4,5,4,4,5,5,0,3,5,5,4,1,3,3,2,3,3,1,3,1,0,4,3,1,4,4,3,4,5,0,4,0,2,0,4,3,4,4,3,3,0,4,0,0,5,5,},
|
||||
{ 0,4,0,4,0,5,0,1,1,3,3,4,4,3,4,1,3,0,5,1,3,0,3,1,3,1,1,0,3,0,3,3,4,0,4,3,0,4,4,4,3,4,4,0,3,5,4,1,0,3,0,0,2,3,0,3,1,0,3,1,0,3,2,1,3,5,0,3,0,1,0,3,2,3,3,4,4,0,2,2,0,4,4,},
|
||||
{ 2,4,0,5,0,4,0,3,0,4,5,5,4,3,5,3,5,3,5,3,5,2,5,3,4,3,3,4,3,4,5,3,2,1,5,4,3,2,3,4,5,3,4,1,2,5,4,3,0,3,3,0,3,2,0,2,3,0,4,1,0,3,4,3,3,5,0,3,0,1,0,4,5,5,5,4,3,0,4,2,0,3,5,},
|
||||
{ 0,5,0,4,0,4,0,2,0,5,4,3,4,3,4,3,3,3,4,3,4,2,5,3,5,3,4,1,4,3,4,4,4,0,3,5,0,4,4,4,4,5,3,1,3,4,5,3,3,3,3,3,3,3,0,2,2,0,3,3,2,4,3,3,3,5,3,4,1,3,3,5,3,2,0,0,0,0,4,3,1,3,3,},
|
||||
{ 0,1,0,3,0,3,0,1,0,1,3,3,3,2,3,3,3,0,3,0,0,0,3,1,3,0,0,0,2,2,2,3,0,0,3,2,0,1,2,4,1,3,3,0,0,3,3,3,0,1,0,0,2,1,0,0,3,0,3,1,0,3,0,0,1,3,0,2,0,1,0,3,3,1,3,3,0,0,1,1,0,3,3,},
|
||||
{ 0,2,0,3,0,2,1,4,0,2,2,3,1,1,3,1,1,0,2,0,3,1,2,3,1,3,0,0,1,0,4,3,2,3,3,3,1,4,2,3,3,3,3,1,0,3,1,4,0,1,1,0,1,2,0,1,1,0,1,1,0,3,1,3,2,2,0,1,0,0,0,2,3,3,3,1,0,0,0,0,0,2,3,},
|
||||
{ 0,5,0,4,0,5,0,2,0,4,5,5,3,3,4,3,3,1,5,4,4,2,4,4,4,3,4,2,4,3,5,5,4,3,3,4,3,3,5,5,4,5,5,1,3,4,5,3,1,4,3,1,3,3,0,3,3,1,4,3,1,4,5,3,3,5,0,4,0,3,0,5,3,3,1,4,3,0,4,0,1,5,3,},
|
||||
{ 0,5,0,5,0,4,0,2,0,4,4,3,4,3,3,3,3,3,5,4,4,4,4,4,4,5,3,3,5,2,4,4,4,3,4,4,3,3,4,4,5,5,3,3,4,3,4,3,3,4,3,3,3,3,1,2,2,1,4,3,3,5,4,4,3,4,0,4,0,3,0,4,4,4,4,4,1,0,4,2,0,2,4,},
|
||||
{ 0,4,0,4,0,3,0,1,0,3,5,2,3,0,3,0,2,1,4,2,3,3,4,1,4,3,3,2,4,1,3,3,3,0,3,3,0,0,3,3,3,5,3,3,3,3,3,2,0,2,0,0,2,0,0,2,0,0,1,0,0,3,1,2,2,3,0,3,0,2,0,4,4,3,3,4,1,0,3,0,0,2,4,},
|
||||
{ 0,0,0,4,0,0,0,0,0,0,1,0,1,0,2,0,0,0,0,0,1,0,2,0,1,0,0,0,0,0,3,1,3,0,3,2,0,0,0,1,0,3,2,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,4,0,2,0,0,0,0,0,0,2,},
|
||||
{ 0,2,1,3,0,2,0,2,0,3,3,3,3,1,3,1,3,3,3,3,3,3,4,2,2,1,2,1,4,0,4,3,1,3,3,3,2,4,3,5,4,3,3,3,3,3,3,3,0,1,3,0,2,0,0,1,0,0,1,0,0,4,2,0,2,3,0,3,3,0,3,3,4,2,3,1,4,0,1,2,0,2,3,},
|
||||
{ 0,3,0,3,0,1,0,3,0,2,3,3,3,0,3,1,2,0,3,3,2,3,3,2,3,2,3,1,3,0,4,3,2,0,3,3,1,4,3,3,2,3,4,3,1,3,3,1,1,0,1,1,0,1,0,1,0,1,0,0,0,4,1,1,0,3,0,3,1,0,2,3,3,3,3,3,1,0,0,2,0,3,3,},
|
||||
{ 0,0,0,0,0,0,0,0,0,0,3,0,2,0,3,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,3,0,3,0,3,1,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,2,0,2,3,0,0,0,0,0,0,0,0,3,},
|
||||
{ 0,2,0,3,1,3,0,3,0,2,3,3,3,1,3,1,3,1,3,1,3,3,3,1,3,0,2,3,1,1,4,3,3,2,3,3,1,2,2,4,1,3,3,0,1,4,2,3,0,1,3,0,3,0,0,1,3,0,2,0,0,3,3,2,1,3,0,3,0,2,0,3,4,4,4,3,1,0,3,0,0,3,3,},
|
||||
{ 0,2,0,1,0,2,0,0,0,1,3,2,2,1,3,0,1,1,3,0,3,2,3,1,2,0,2,0,1,1,3,3,3,0,3,3,1,1,2,3,2,3,3,1,2,3,2,0,0,1,0,0,0,0,0,0,3,0,1,0,0,2,1,2,1,3,0,3,0,0,0,3,4,4,4,3,2,0,2,0,0,2,4,},
|
||||
{ 0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,2,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,3,1,0,0,0,0,0,0,0,3,},
|
||||
{ 0,3,0,3,0,2,0,3,0,3,3,3,2,3,2,2,2,0,3,1,3,3,3,2,3,3,0,0,3,0,3,2,2,0,2,3,1,4,3,4,3,3,2,3,1,5,4,4,0,3,1,2,1,3,0,3,1,1,2,0,2,3,1,3,1,3,0,3,0,1,0,3,3,4,4,2,1,0,2,1,0,2,4,},
|
||||
{ 0,1,0,3,0,1,0,2,0,1,4,2,5,1,4,0,2,0,2,1,3,1,4,0,2,1,0,0,2,1,4,1,1,0,3,3,0,5,1,3,2,3,3,1,0,3,2,3,0,1,0,0,0,0,0,0,1,0,0,0,0,4,0,1,0,3,0,2,0,1,0,3,3,3,4,3,3,0,0,0,0,2,3,},
|
||||
{ 0,0,0,1,0,0,0,0,0,0,2,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,0,0,1,0,0,0,0,0,3,},
|
||||
{ 0,1,0,3,0,4,0,3,0,2,4,3,1,0,3,2,2,1,3,1,2,2,3,1,1,1,2,1,3,0,1,2,0,1,3,2,1,3,0,5,5,1,0,0,1,3,2,1,0,3,0,0,1,0,0,0,0,0,3,4,0,1,1,1,3,2,0,2,0,1,0,2,3,3,1,2,3,0,1,0,1,0,4,},
|
||||
{ 0,0,0,1,0,3,0,3,0,2,2,1,0,0,4,0,3,0,3,1,3,0,3,0,3,0,1,0,3,0,3,1,3,0,3,3,0,0,1,2,1,1,1,0,1,2,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,2,2,1,2,0,0,2,0,0,0,0,2,3,3,3,3,0,0,0,0,1,4,},
|
||||
{ 0,0,0,3,0,3,0,0,0,0,3,1,1,0,3,0,1,0,2,0,1,0,0,0,0,0,0,0,1,0,3,0,2,0,2,3,0,0,2,2,3,1,2,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,2,0,0,0,0,2,3,},
|
||||
{ 2,4,0,5,0,5,0,4,0,3,4,3,3,3,4,3,3,3,4,3,4,4,5,4,5,5,5,2,3,0,5,5,4,1,5,4,3,1,5,4,3,4,4,3,3,4,3,3,0,3,2,0,2,3,0,3,0,0,3,3,0,5,3,2,3,3,0,3,0,3,0,3,4,5,4,5,3,0,4,3,0,3,4,},
|
||||
{ 0,3,0,3,0,3,0,3,0,3,3,4,3,2,3,2,3,0,4,3,3,3,3,3,3,3,3,0,3,2,4,3,3,1,3,4,3,4,4,4,3,4,4,3,2,4,4,1,0,2,0,0,1,1,0,2,0,0,3,1,0,5,3,2,1,3,0,3,0,1,2,4,3,2,4,3,3,0,3,2,0,4,4,},
|
||||
{ 0,3,0,3,0,1,0,0,0,1,4,3,3,2,3,1,3,1,4,2,3,2,4,2,3,4,3,0,2,2,3,3,3,0,3,3,3,0,3,4,1,3,3,0,3,4,3,3,0,1,1,0,1,0,0,0,4,0,3,0,0,3,1,2,1,3,0,4,0,1,0,4,3,3,4,3,3,0,2,0,0,3,3,},
|
||||
{ 0,3,0,4,0,1,0,3,0,3,4,3,3,0,3,3,3,1,3,1,3,3,4,3,3,3,0,0,3,1,5,3,3,1,3,3,2,5,4,3,3,4,5,3,2,5,3,4,0,1,0,0,0,0,0,2,0,0,1,1,0,4,2,2,1,3,0,3,0,2,0,4,4,3,5,3,2,0,1,1,0,3,4,},
|
||||
{ 0,5,0,4,0,5,0,2,0,4,4,3,3,2,3,3,3,1,4,3,4,1,5,3,4,3,4,0,4,2,4,3,4,1,5,4,0,4,4,4,4,5,4,1,3,5,4,2,1,4,1,1,3,2,0,3,1,0,3,2,1,4,3,3,3,4,0,4,0,3,0,4,4,4,3,3,3,0,4,2,0,3,4,},
|
||||
{ 1,4,0,4,0,3,0,1,0,3,3,3,1,1,3,3,2,2,3,3,1,0,3,2,2,1,2,0,3,1,2,1,2,0,3,2,0,2,2,3,3,4,3,0,3,3,1,2,0,1,1,3,1,2,0,0,3,0,1,1,0,3,2,2,3,3,0,3,0,0,0,2,3,3,4,3,3,0,1,0,0,1,4,},
|
||||
{ 0,4,0,4,0,4,0,0,0,3,4,4,3,1,4,2,3,2,3,3,3,1,4,3,4,0,3,0,4,2,3,3,2,2,5,4,2,1,3,4,3,4,3,1,3,3,4,2,0,2,1,0,3,3,0,0,2,0,3,1,0,4,4,3,4,3,0,4,0,1,0,2,4,4,4,4,4,0,3,2,0,3,3,},
|
||||
{ 0,0,0,1,0,4,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,3,2,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,2,},
|
||||
{ 0,2,0,3,0,4,0,4,0,1,3,3,3,0,4,0,2,1,2,1,1,1,2,0,3,1,1,0,1,0,3,1,0,0,3,3,2,0,1,1,0,0,0,0,0,1,0,2,0,2,2,0,3,1,0,0,1,0,1,1,0,1,2,0,3,0,0,0,0,1,0,0,3,3,4,3,1,0,1,0,3,0,2,},
|
||||
{ 0,0,0,3,0,5,0,0,0,0,1,0,2,0,3,1,0,1,3,0,0,0,2,0,0,0,1,0,0,0,1,1,0,0,4,0,0,0,2,3,0,1,4,1,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,1,0,0,0,0,0,0,0,2,0,0,3,0,0,0,0,0,3,},
|
||||
{ 0,2,0,5,0,5,0,1,0,2,4,3,3,2,5,1,3,2,3,3,3,0,4,1,2,0,3,0,4,0,2,2,1,1,5,3,0,0,1,4,2,3,2,0,3,3,3,2,0,2,4,1,1,2,0,1,1,0,3,1,0,1,3,1,2,3,0,2,0,0,0,1,3,5,4,4,4,0,3,0,0,1,3,},
|
||||
{ 0,4,0,5,0,4,0,4,0,4,5,4,3,3,4,3,3,3,4,3,4,4,5,3,4,5,4,2,4,2,3,4,3,1,4,4,1,3,5,4,4,5,5,4,4,5,5,5,2,3,3,1,4,3,1,3,3,0,3,3,1,4,3,4,4,4,0,3,0,4,0,3,3,4,4,5,0,0,4,3,0,4,5,},
|
||||
{ 0,4,0,4,0,3,0,3,0,3,4,4,4,3,3,2,4,3,4,3,4,3,5,3,4,3,2,1,4,2,4,4,3,1,3,4,2,4,5,5,3,4,5,4,1,5,4,3,0,3,2,2,3,2,1,3,1,0,3,3,3,5,3,3,3,5,4,4,2,3,3,4,3,3,3,2,1,0,3,2,1,4,3,},
|
||||
{ 0,4,0,5,0,4,0,3,0,3,5,5,3,2,4,3,4,0,5,4,4,1,4,4,4,3,3,3,4,3,5,5,2,3,3,4,1,2,5,5,3,5,5,2,3,5,5,4,0,3,2,0,3,3,1,1,5,1,4,1,0,4,3,2,3,5,0,4,0,3,0,5,4,3,4,3,0,0,4,1,0,4,4,},
|
||||
{ 1,3,0,4,0,2,0,2,0,2,5,5,3,3,3,3,3,0,4,2,3,4,4,4,3,4,0,0,3,4,5,4,3,3,3,3,2,5,5,4,5,5,5,4,3,5,5,5,1,3,1,0,1,0,0,3,2,0,4,2,0,5,2,3,2,4,1,3,0,3,0,4,5,4,5,4,3,0,4,2,0,5,4,},
|
||||
{ 0,3,0,4,0,5,0,3,0,3,4,4,3,2,3,2,3,3,3,3,3,2,4,3,3,2,2,0,3,3,3,3,3,1,3,3,3,0,4,4,3,4,4,1,1,4,4,2,0,3,1,0,1,1,0,4,1,0,2,3,1,3,3,1,3,4,0,3,0,1,0,3,1,3,0,0,1,0,2,0,0,4,4,},
|
||||
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,},
|
||||
{ 0,3,0,3,0,2,0,3,0,1,5,4,3,3,3,1,4,2,1,2,3,4,4,2,4,4,5,0,3,1,4,3,4,0,4,3,3,3,2,3,2,5,3,4,3,2,2,3,0,0,3,0,2,1,0,1,2,0,0,0,0,2,1,1,3,1,0,2,0,4,0,3,4,4,4,5,2,0,2,0,0,1,3,},
|
||||
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,1,0,0,1,1,0,0,0,4,2,1,1,0,1,0,3,2,0,0,3,1,1,1,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,1,0,0,0,2,0,0,0,1,4,0,4,2,1,0,0,0,0,0,1,},
|
||||
{ 0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,3,1,0,0,0,2,0,2,1,0,0,1,2,1,0,1,1,0,0,3,0,0,0,0,0,0,0,0,0,0,0,1,3,1,0,0,0,0,0,1,0,0,2,1,0,0,0,0,0,0,0,0,2,},
|
||||
{ 0,4,0,4,0,4,0,3,0,4,4,3,4,2,4,3,2,0,4,4,4,3,5,3,5,3,3,2,4,2,4,3,4,3,1,4,0,2,3,4,4,4,3,3,3,4,4,4,3,4,1,3,4,3,2,1,2,1,3,3,3,4,4,3,3,5,0,4,0,3,0,4,3,3,3,2,1,0,3,0,0,3,3,},
|
||||
{ 0,4,0,3,0,3,0,3,0,3,5,5,3,3,3,3,4,3,4,3,3,3,4,4,4,3,3,3,3,4,3,5,3,3,1,3,2,4,5,5,5,5,4,3,4,5,5,3,2,2,3,3,3,3,2,3,3,1,2,3,2,4,3,3,3,4,0,4,0,2,0,4,3,2,2,1,2,0,3,0,0,4,1,},
|
||||
};
|
||||
|
||||
// category counters, each integer counts sequence in its category
|
||||
int[] relSample = new int[CATEGORIES_NUM];
|
||||
|
||||
// total sequence received
|
||||
int totalRel;
|
||||
|
||||
// The order of previous char
|
||||
int lastCharOrder;
|
||||
|
||||
// if last byte in current buffer is not the last byte of a character,
|
||||
// we need to know how many byte to skip in next buffer.
|
||||
int needToSkipCharNum;
|
||||
|
||||
// If this flag is set to true, detection is done and conclusion has
|
||||
// been made
|
||||
bool done;
|
||||
|
||||
/// <summary>
/// Creates the analyser in its cleared state by delegating to Reset().
/// </summary>
public JapaneseContextAnalyser()
{
    Reset();
}
|
||||
|
||||
/// <summary>
/// Confidence derived from the sequences counted so far.
/// </summary>
/// <returns>
/// The share of counted sequences that fell outside category 0, or
/// DONT_KNOW when no more than MINIMUM_DATA_THRESHOLD sequences have
/// been counted.
/// </returns>
public float GetConfidence()
{
    // Too little data to say anything meaningful.
    if (totalRel <= MINIMUM_DATA_THRESHOLD)
    {
        return DONT_KNOW;
    }

    // This is just one way to calculate confidence: every sequence that
    // did not land in category 0 counts in favour.
    float outsideCategoryZero = (float)(totalRel - relSample[0]);
    return outsideCategoryZero / totalRel;
}
|
||||
|
||||
/// <summary>
/// Feeds a raw byte buffer to the analyser, counting every pair of
/// consecutive characters for which GetOrder yields an order other than -1.
/// </summary>
/// <remarks>
/// The buffer is byte oriented, so a character may span two buffers. When
/// the last character of a buffer is incomplete, the number of bytes still
/// missing is remembered and skipped at the start of the next call. The
/// split character is simply dropped: one character makes little
/// difference and skipping it keeps the logic simple.
/// </remarks>
/// <param name="buf">input bytes</param>
/// <param name="offset">index of the first byte to examine</param>
/// <param name="len">number of bytes to examine</param>
public void HandleData(byte[] buf, int offset, int len)
{
    if (done)
    {
        return;
    }

    int max = offset + len;
    int i = offset + needToSkipCharNum;

    // The pending skip is longer than this whole buffer: consume what we
    // can and carry the remainder over to the next call.
    if (i > max)
    {
        needToSkipCharNum = i - max;
        return;
    }

    // The pending skip has now been applied. Clear it so that later
    // buffers are not truncated again by a stale count.
    needToSkipCharNum = 0;

    while (i < max)
    {
        int charLen;
        int order = GetOrder(buf, i, out charLen);
        i += charLen;

        if (i > max)
        {
            // The character continues in the next buffer: remember how
            // many bytes to skip there and break the pair chain.
            needToSkipCharNum = i - max;
            lastCharOrder = -1;
        }
        else
        {
            if (order != -1 && lastCharOrder != -1)
            {
                totalRel++;
                if (totalRel > MAX_REL_THRESHOLD)
                {
                    // Enough evidence collected; stop analysing.
                    done = true;
                    break;
                }
                // Count this sequence in its frequency category.
                relSample[jp2CharContext[lastCharOrder, order]]++;
            }
            lastCharOrder = order;
        }
    }
}
|
||||
|
||||
/// <summary>
/// Feeds a single, already delimited character to the analyser and counts
/// the sequence it forms with the previously handled character.
/// </summary>
/// <param name="buf">Array holding the character's bytes.</param>
/// <param name="offset">Index in <paramref name="buf"/> of the character's first byte.</param>
/// <param name="charLen">Length of the character in bytes.</param>
public void HandleOneChar(byte[] buf, int offset, int charLen)
{
    // Once enough sequences were counted the analysis is finished for good.
    done = done || totalRel > MAX_REL_THRESHOLD;
    if (done)
        return;

    // Only 2-byte characters are of interest; anything else has no order.
    int order = -1;
    if (charLen == 2)
        order = GetOrder(buf, offset);

    bool pairHasOrders = lastCharOrder != -1 && order != -1;
    if (pairHasOrders) {
        totalRel++;
        // count this sequence in its category counter
        relSample[jp2CharContext[lastCharOrder, order]]++;
    }

    lastCharOrder = order;
}
|
||||
|
||||
/// <summary>
/// Returns the analyser to its initial state: all category counters and
/// the sequence total are zeroed, no bytes are pending to be skipped,
/// there is no previous character, and the done flag is cleared.
/// </summary>
public void Reset()
{
    totalRel = 0;
    for (int i = 0; i < CATEGORIES_NUM; i++) {
        relSample[i] = 0;
    }

    // These do not depend on the loop; set them once, unconditionally.
    needToSkipCharNum = 0;
    lastCharOrder = -1;
    done = false;
}
|
||||
|
||||
/// <summary>
/// Classifies the character starting at <paramref name="offset"/> and
/// reports its length in bytes. HandleData advances by
/// <paramref name="charLen"/> and treats a return value of -1 as
/// "no order for this character".
/// </summary>
/// <param name="buf">Array holding the bytes being analysed.</param>
/// <param name="offset">Index of the character's first byte.</param>
/// <param name="charLen">Receives the byte length of the character.</param>
/// <returns>The order of the character, or -1 when it has none.</returns>
protected abstract int GetOrder(byte[] buf, int offset, out int charLen);
|
||||
|
||||
/// <summary>
/// Classifies the character starting at <paramref name="offset"/>.
/// HandleOneChar calls this only for characters whose length is 2 and
/// treats a return value of -1 as "no order for this character".
/// </summary>
/// <param name="buf">Array holding the character's bytes.</param>
/// <param name="offset">Index of the character's first byte.</param>
/// <returns>The order of the character, or -1 when it has none.</returns>
protected abstract int GetOrder(byte[] buf, int offset);
|
||||
|
||||
/// <summary>
/// Tells whether the number of sequences counted so far has exceeded
/// ENOUGH_REL_THRESHOLD.
/// </summary>
/// <returns>true when totalRel is greater than ENOUGH_REL_THRESHOLD.</returns>
public bool GotEnoughData()
{
    bool thresholdExceeded = totalRel > ENOUGH_REL_THRESHOLD;
    return thresholdExceeded;
}
|
||||
|
||||
}
|
||||
|
||||
/// <summary>
/// Context analyser for SJIS encoded data: determines character lengths
/// and hiragana orders for <see cref="JapaneseContextAnalyser"/>.
/// </summary>
public class SJISContextAnalyser : JapaneseContextAnalyser
{
    // Lead byte of the two-byte characters that are treated as hiragana.
    private const byte HIRAGANA_FIRST_BYTE = 0x82;

    /// <summary>
    /// Determines the byte length of the character at
    /// <paramref name="offset"/> and, if it is hiragana, its order.
    /// </summary>
    /// <param name="buf">Array holding the bytes being analysed.</param>
    /// <param name="offset">Index of the character's first byte.</param>
    /// <param name="charLen">
    /// Receives 2 when the first byte is in 0x81..0x9F or 0xE0..0xFC,
    /// otherwise 1.
    /// </param>
    /// <returns>The hiragana order (0-based), or -1 for any other character.</returns>
    protected override int GetOrder(byte[] buf, int offset, out int charLen)
    {
        byte high = buf[offset];

        // find out current char's byte length
        if (high >= 0x81 && high <= 0x9F
            || high >= 0xe0 && high <= 0xFC)
            charLen = 2;
        else
            charLen = 1;

        // return its order if it is hiragana
        return HiraganaOrder(buf, offset);
    }

    /// <summary>
    /// Returns the hiragana order of the two-byte character at
    /// <paramref name="offset"/>.
    /// </summary>
    /// <param name="buf">Array holding the character's bytes.</param>
    /// <param name="offset">Index of the character's first byte.</param>
    /// <returns>The hiragana order (0-based), or -1 for any other character.</returns>
    protected override int GetOrder(byte[] buf, int offset)
    {
        // We are only interested in Hiragana
        return HiraganaOrder(buf, offset);
    }

    // Shared lookup for both overloads: order of a hiragana character, or
    // -1. The trail byte is only read when it actually exists in the
    // array, so a lead byte at the very end of the array yields -1
    // instead of an IndexOutOfRangeException.
    private static int HiraganaOrder(byte[] buf, int offset)
    {
        if (buf[offset] == HIRAGANA_FIRST_BYTE && offset + 1 < buf.Length) {
            byte low = buf[offset + 1];
            if (low >= 0x9F && low <= 0xF1)
                return low - 0x9F;
        }
        return -1;
    }
}
|
||||
|
||||
/// <summary>
/// Context analyser for EUC-JP encoded data: determines character lengths
/// and hiragana orders for <see cref="JapaneseContextAnalyser"/>.
/// </summary>
public class EUCJPContextAnalyser : JapaneseContextAnalyser
{
    // Lead byte of the two-byte characters that are treated as hiragana.
    private const byte HIRAGANA_FIRST_BYTE = 0xA4;

    /// <summary>
    /// Determines the byte length of the character at
    /// <paramref name="offset"/> and, if it is hiragana, its order.
    /// </summary>
    /// <param name="buf">Array holding the bytes being analysed.</param>
    /// <param name="offset">Index of the character's first byte.</param>
    /// <param name="charLen">
    /// Receives 2 when the first byte is 0x8E or in 0xA1..0xFE, 3 when it
    /// is 0x8F, otherwise 1.
    /// </param>
    /// <returns>The hiragana order (0-based), or -1 for any other character.</returns>
    protected override int GetOrder(byte[] buf, int offset, out int charLen)
    {
        byte high = buf[offset];

        // find out current char's byte length
        if (high == 0x8E || high >= 0xA1 && high <= 0xFE)
            charLen = 2;
        else if (high == 0x8F)
            // 0x8F prefixes the three-byte EUC-JP sequences. The previous
            // test against 0xBF could never match, because 0xBF is already
            // covered by the 0xA1..0xFE range above.
            charLen = 3;
        else
            charLen = 1;

        // return its order if it is hiragana
        return HiraganaOrder(buf, offset);
    }

    /// <summary>
    /// Returns the hiragana order of the two-byte character at
    /// <paramref name="offset"/>.
    /// </summary>
    /// <param name="buf">Array holding the character's bytes.</param>
    /// <param name="offset">Index of the character's first byte.</param>
    /// <returns>The hiragana order (0-based), or -1 for any other character.</returns>
    protected override int GetOrder(byte[] buf, int offset)
    {
        // We are only interested in Hiragana
        return HiraganaOrder(buf, offset);
    }

    // Shared lookup for both overloads: order of a hiragana character, or
    // -1. The trail byte is only read when it actually exists in the
    // array, so a lead byte at the very end of the array yields -1
    // instead of an IndexOutOfRangeException.
    private static int HiraganaOrder(byte[] buf, int offset)
    {
        if (buf[offset] == HIRAGANA_FIRST_BYTE && offset + 1 < buf.Length) {
            byte low = buf[offset + 1];
            if (low >= 0xA1 && low <= 0xF3)
                return low - 0xA1;
        }
        return -1;
    }
}
|
||||
}
|
||||
|
@ -0,0 +1,246 @@
|
||||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Mozilla Public License Version
|
||||
* 1.1 (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/MPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is Mozilla Universal charset detector code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is
|
||||
* Netscape Communications Corporation.
|
||||
* Portions created by the Initial Developer are Copyright (C) 2001
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the MPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
namespace UniversalDetector.Core
|
||||
{
|
||||
public abstract class BulgarianModel : SequenceModel
|
||||
{
|
||||
//Model Table:
|
||||
//total sequences: 100%
|
||||
//first 512 sequences: 96.9392%
|
||||
//first 1024 sequences: 3.0618%
|
||||
//rest sequences: 0.2992%
|
||||
//negative sequences: 0.0020%
|
||||
private static byte[] BULGARIAN_LANG_MODEL = {
|
||||
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,3,3,3,3,3,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,2,2,3,2,2,1,2,2,
|
||||
3,1,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,3,3,3,3,3,3,3,0,3,0,1,
|
||||
0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,3,3,3,3,3,3,3,3,0,3,1,0,
|
||||
0,1,0,0,0,0,0,0,0,0,1,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
|
||||
3,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,2,3,3,3,3,3,3,3,3,0,3,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,2,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,1,3,2,3,3,3,3,3,3,3,3,0,3,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,3,3,3,3,3,3,2,3,2,2,1,3,3,3,3,2,2,2,1,1,2,0,1,0,1,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,1,
|
||||
3,3,3,3,3,3,3,2,3,2,2,3,3,1,1,2,3,3,2,3,3,3,3,2,1,2,0,2,0,3,0,0,
|
||||
0,0,0,0,0,0,0,1,0,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,1,
|
||||
3,3,3,3,3,3,3,1,3,3,3,3,3,2,3,2,3,3,3,3,3,2,3,3,1,3,0,3,0,2,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,
|
||||
3,3,3,3,3,3,3,3,1,3,3,2,3,3,3,1,3,3,2,3,2,2,2,0,0,2,0,2,0,2,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,1,
|
||||
3,3,3,3,3,3,3,3,3,0,3,3,3,2,2,3,3,3,1,2,2,3,2,1,1,2,0,2,0,0,0,0,
|
||||
1,0,0,0,0,0,0,0,0,0,2,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,
|
||||
3,3,3,3,3,3,3,2,3,3,1,2,3,2,2,2,3,3,3,3,3,2,2,3,1,2,0,2,1,2,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,3,0,0,1,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,1,
|
||||
3,3,3,3,3,1,3,3,3,3,3,2,3,3,3,2,3,3,2,3,2,2,2,3,1,2,0,1,0,1,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,
|
||||
3,3,3,3,3,3,3,3,3,3,3,1,1,1,2,2,1,3,1,3,2,2,3,0,0,1,0,1,0,1,0,0,
|
||||
0,0,0,1,0,0,0,0,1,0,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,
|
||||
3,3,3,3,3,2,2,3,2,2,3,1,2,1,1,1,2,3,1,3,1,2,2,0,1,1,1,1,0,1,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,
|
||||
3,3,3,3,3,1,3,2,2,3,3,1,2,3,1,1,3,3,3,3,1,2,2,1,1,1,0,2,0,2,0,1,
|
||||
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,2,2,3,3,3,2,2,1,1,2,0,2,0,1,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,
|
||||
3,0,1,2,1,3,3,2,3,3,3,3,3,2,3,2,1,0,3,1,2,1,2,1,2,3,2,1,0,1,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
1,1,1,2,3,3,3,3,3,3,3,3,3,3,3,3,0,0,3,1,3,3,2,3,3,2,2,2,0,1,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
2,3,3,3,3,0,3,3,3,3,3,2,1,1,2,1,3,3,0,3,1,1,1,1,3,2,0,1,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,
|
||||
3,3,2,2,2,3,3,3,3,3,3,3,3,3,3,3,1,1,3,1,3,3,2,3,2,2,2,3,0,2,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,2,3,3,2,2,3,2,1,1,1,1,1,3,1,3,1,1,0,0,0,1,0,0,0,1,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,2,3,2,0,3,2,0,3,0,2,0,0,2,1,3,1,0,0,1,0,0,0,1,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
|
||||
3,3,3,3,2,1,1,1,1,2,1,1,2,1,1,1,2,2,1,2,1,1,1,0,1,1,0,1,0,1,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,
|
||||
3,3,3,3,2,1,3,1,1,2,1,3,2,1,1,0,1,2,3,2,1,1,1,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
2,3,3,3,3,2,2,1,0,1,0,0,1,0,0,0,2,1,0,3,0,0,1,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
|
||||
3,3,3,2,3,2,3,3,1,3,2,1,1,1,2,1,1,2,1,3,0,1,0,0,0,1,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,1,1,2,2,3,3,2,3,2,2,2,3,1,2,2,1,1,2,1,1,2,2,0,1,1,0,1,0,2,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,2,1,3,1,0,2,2,1,3,2,1,0,0,2,0,2,0,1,0,0,0,0,0,0,0,1,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,
|
||||
3,3,3,3,3,3,1,2,0,2,3,1,2,3,2,0,1,3,1,2,1,1,1,0,0,1,0,0,2,2,2,3,
|
||||
2,2,2,2,1,2,1,1,2,2,1,1,2,0,1,1,1,0,0,1,1,0,0,1,1,0,0,0,1,1,0,1,
|
||||
3,3,3,3,3,2,1,2,2,1,2,0,2,0,1,0,1,2,1,2,1,1,0,0,0,1,0,1,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,1,
|
||||
3,3,2,3,3,1,1,3,1,0,3,2,1,0,0,0,1,2,0,2,0,1,0,0,0,1,0,1,2,1,2,2,
|
||||
1,1,1,1,1,1,1,2,2,2,1,1,1,1,1,1,1,0,1,2,1,1,1,0,0,0,0,0,1,1,0,0,
|
||||
3,1,0,1,0,2,3,2,2,2,3,2,2,2,2,2,1,0,2,1,2,1,1,1,0,1,2,1,2,2,2,1,
|
||||
1,1,2,2,2,2,1,2,1,1,0,1,2,1,2,2,2,1,1,1,0,1,1,1,1,2,0,1,0,0,0,0,
|
||||
2,3,2,3,3,0,0,2,1,0,2,1,0,0,0,0,2,3,0,2,0,0,0,0,0,1,0,0,2,0,1,2,
|
||||
2,1,2,1,2,2,1,1,1,2,1,1,1,0,1,2,2,1,1,1,1,1,0,1,1,1,0,0,1,2,0,0,
|
||||
3,3,2,2,3,0,2,3,1,1,2,0,0,0,1,0,0,2,0,2,0,0,0,1,0,1,0,1,2,0,2,2,
|
||||
1,1,1,1,2,1,0,1,2,2,2,1,1,1,1,1,1,1,0,1,1,1,0,0,0,0,0,0,1,1,0,0,
|
||||
2,3,2,3,3,0,0,3,0,1,1,0,1,0,0,0,2,2,1,2,0,0,0,0,0,0,0,0,2,0,1,2,
|
||||
2,2,1,1,1,1,1,2,2,2,1,0,2,0,1,0,1,0,0,1,0,1,0,0,1,0,0,0,0,1,0,0,
|
||||
3,3,3,3,2,2,2,2,2,0,2,1,1,1,1,2,1,2,1,1,0,2,0,1,0,1,0,0,2,0,1,2,
|
||||
1,1,1,1,1,1,1,2,2,1,1,0,2,0,1,0,2,0,0,1,1,1,0,0,2,0,0,0,1,1,0,0,
|
||||
2,3,3,3,3,1,0,0,0,0,0,0,0,0,0,0,2,0,0,1,1,0,0,0,0,0,0,1,2,0,1,2,
|
||||
2,2,2,1,1,2,1,1,2,2,2,1,2,0,1,1,1,1,1,1,0,1,1,1,1,0,0,1,1,1,0,0,
|
||||
2,3,3,3,3,0,2,2,0,2,1,0,0,0,1,1,1,2,0,2,0,0,0,3,0,0,0,0,2,0,2,2,
|
||||
1,1,1,2,1,2,1,1,2,2,2,1,2,0,1,1,1,0,1,1,1,1,0,2,1,0,0,0,1,1,0,0,
|
||||
2,3,3,3,3,0,2,1,0,0,2,0,0,0,0,0,1,2,0,2,0,0,0,0,0,0,0,0,2,0,1,2,
|
||||
1,1,1,2,1,1,1,1,2,2,2,0,1,0,1,1,1,0,0,1,1,1,0,0,1,0,0,0,0,1,0,0,
|
||||
3,3,2,2,3,0,1,0,1,0,0,0,0,0,0,0,1,1,0,3,0,0,0,0,0,0,0,0,1,0,2,2,
|
||||
1,1,1,1,1,2,1,1,2,2,1,2,2,1,0,1,1,1,1,1,0,1,0,0,1,0,0,0,1,1,0,0,
|
||||
3,1,0,1,0,2,2,2,2,3,2,1,1,1,2,3,0,0,1,0,2,1,1,0,1,1,1,1,2,1,1,1,
|
||||
1,2,2,1,2,1,2,2,1,1,0,1,2,1,2,2,1,1,1,0,0,1,1,1,2,1,0,1,0,0,0,0,
|
||||
2,1,0,1,0,3,1,2,2,2,2,1,2,2,1,1,1,0,2,1,2,2,1,1,2,1,1,0,2,1,1,1,
|
||||
1,2,2,2,2,2,2,2,1,2,0,1,1,0,2,1,1,1,1,1,0,0,1,1,1,1,0,1,0,0,0,0,
|
||||
2,1,1,1,1,2,2,2,2,1,2,2,2,1,2,2,1,1,2,1,2,3,2,2,1,1,1,1,0,1,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
2,2,2,3,2,0,1,2,0,1,2,1,1,0,1,0,1,2,1,2,0,0,0,1,1,0,0,0,1,0,0,2,
|
||||
1,1,0,0,1,1,0,1,1,1,1,0,2,0,1,1,1,0,0,1,1,0,0,0,0,1,0,0,0,1,0,0,
|
||||
2,0,0,0,0,1,2,2,2,2,2,2,2,1,2,1,1,1,1,1,1,1,0,1,1,1,1,1,2,1,1,1,
|
||||
1,2,2,2,2,1,1,2,1,2,1,1,1,0,2,1,2,1,1,1,0,2,1,1,1,1,0,1,0,0,0,0,
|
||||
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,
|
||||
1,1,0,1,0,1,1,1,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
2,2,2,3,2,0,0,0,0,1,0,0,0,0,0,0,1,1,0,2,0,0,0,0,0,0,0,0,1,0,1,2,
|
||||
1,1,1,1,1,1,0,0,2,2,2,2,2,0,1,1,0,1,1,1,1,1,0,0,1,0,0,0,1,1,0,1,
|
||||
2,3,1,2,1,0,1,1,0,2,2,2,0,0,1,0,0,1,1,1,1,0,0,0,0,0,0,0,1,0,1,2,
|
||||
1,1,1,1,2,1,1,1,1,1,1,1,1,0,1,1,0,1,0,1,0,1,0,0,1,0,0,0,0,1,0,0,
|
||||
2,2,2,2,2,0,0,2,0,0,2,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,2,0,2,2,
|
||||
1,1,1,1,1,0,0,1,2,1,1,0,1,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,
|
||||
1,2,2,2,2,0,0,2,0,1,1,0,0,0,1,0,0,2,0,2,0,0,0,0,0,0,0,0,0,0,1,1,
|
||||
0,0,0,1,1,1,1,1,1,1,1,1,1,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,
|
||||
1,2,2,3,2,0,0,1,0,0,1,0,0,0,0,0,0,1,0,2,0,0,0,1,0,0,0,0,0,0,0,2,
|
||||
1,1,0,0,1,0,0,0,1,1,0,0,1,0,1,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,
|
||||
2,1,2,2,2,1,2,1,2,2,1,1,2,1,1,1,0,1,1,1,1,2,0,1,0,1,1,1,1,0,1,1,
|
||||
1,1,2,1,1,1,1,1,1,0,0,1,2,1,1,1,1,1,1,0,0,1,1,1,0,0,0,0,0,0,0,0,
|
||||
1,0,0,1,3,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
2,2,2,2,1,0,0,1,0,2,0,0,0,0,0,1,1,1,0,1,0,0,0,0,0,0,0,0,2,0,0,1,
|
||||
0,2,0,1,0,0,1,1,2,0,1,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
1,2,2,2,2,0,1,1,0,2,1,0,1,1,1,0,0,1,0,2,0,1,0,0,0,0,0,0,0,0,0,1,
|
||||
0,1,0,0,1,0,0,0,1,1,0,0,1,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,
|
||||
2,2,2,2,2,0,0,1,0,0,0,1,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,
|
||||
0,1,0,1,1,1,0,0,1,1,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,
|
||||
2,0,1,0,0,1,2,1,1,1,1,1,1,2,2,1,0,0,1,0,1,0,0,0,0,1,1,1,1,0,0,0,
|
||||
1,1,2,1,1,1,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
2,2,1,2,1,0,0,1,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,
|
||||
0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
1,0,0,1,2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,
|
||||
0,1,1,0,1,1,1,0,0,1,0,0,1,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,
|
||||
1,0,1,0,0,1,1,1,1,1,1,1,1,1,1,1,0,0,1,0,2,0,0,2,0,1,0,0,1,0,0,1,
|
||||
1,1,0,0,1,1,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,
|
||||
1,1,1,1,1,1,1,2,0,0,0,0,0,0,2,1,0,1,1,0,0,1,1,1,0,1,0,0,0,0,0,0,
|
||||
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
1,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,0,1,1,0,1,1,1,1,1,0,1,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
|
||||
|
||||
};
|
||||
|
||||
/// <summary>
/// Initialises the shared Bulgarian sequence model for one concrete
/// single-byte encoding.
/// </summary>
/// <param name="charToOrderMap">Byte-to-order table of the encoding.</param>
/// <param name="name">Name of the encoding, passed through to the base class.</param>
public BulgarianModel(byte[] charToOrderMap, string name)
    : base(
        charToOrderMap,
        BULGARIAN_LANG_MODEL,
        0.969392f,  // matches "first 512 sequences: 96.9392%" in the table header
        false,      // NOTE(review): meaning is defined by SequenceModel, not visible here
        name)
{
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Bulgarian sequence model bound to the ISO-8859-5 byte-to-order table.
/// </summary>
public class Latin5BulgarianModel : BulgarianModel
{
    // Character Mapping Table:
    //   255: control characters that usually do not exist in any text
    //   254: carriage return / line feed
    //   253: symbols (punctuation) that do not belong to a word
    //   252: 0 - 9
    // This table was derived from win1251BulgarianCharToOrderMap, so
    // only numbers < 64 are known to be valid.
    //
    // The table is never reassigned, hence readonly (as in the Cyrillic
    // models).
    private static readonly byte[] LATIN5_CHAR_TO_ORDER_MAP = {
        255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255,  //00
        255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,  //10
        253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,  //20
        252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253,  //30
        253, 77, 90, 99,100, 72,109,107,101, 79,185, 81,102, 76, 94, 82,  //40
        110,186,108, 91, 74,119, 84, 96,111,187,115,253,253,253,253,253,  //50
        253, 65, 69, 70, 66, 63, 68,112,103, 92,194,104, 95, 86, 87, 71,  //60
        116,195, 85, 93, 97,113,196,197,198,199,200,253,253,253,253,253,  //70
        194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,  //80
        210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,  //90
         81,226,227,228,229,230,105,231,232,233,234,235,236, 45,237,238,  //a0
         31, 32, 35, 43, 37, 44, 55, 47, 40, 59, 33, 46, 38, 36, 41, 30,  //b0
         39, 28, 34, 51, 48, 49, 53, 50, 54, 57, 61,239, 67,240, 60, 56,  //c0
          1, 18,  9, 20, 11,  3, 23, 15,  2, 26, 12, 10, 14,  6,  4, 13,  //d0
          7,  8,  5, 19, 29, 25, 22, 21, 27, 24, 17, 75, 52,241, 42, 16,  //e0
         62,242,243,244, 58,245, 98,246,247,248,249,250,251, 91,252,253,  //f0
    };

    /// <summary>
    /// Creates the model with the ISO-8859-5 table and the name "ISO-8859-5".
    /// </summary>
    public Latin5BulgarianModel() : base(LATIN5_CHAR_TO_ORDER_MAP, "ISO-8859-5")
    {
    }
}
|
||||
|
||||
/// <summary>
/// Bulgarian sequence model bound to the windows-1251 byte-to-order table.
/// </summary>
public class Win1251BulgarianModel : BulgarianModel
{
    // Byte-to-order table for windows-1251, one row per 16 byte values.
    // The table is never reassigned, hence readonly (as in the Cyrillic
    // models).
    private static readonly byte[] WIN1251_CHAR_TO_ORDER_MAP = {
        255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255,  //00
        255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,  //10
        253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,  //20
        252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253,  //30
        253, 77, 90, 99,100, 72,109,107,101, 79,185, 81,102, 76, 94, 82,  //40
        110,186,108, 91, 74,119, 84, 96,111,187,115,253,253,253,253,253,  //50
        253, 65, 69, 70, 66, 63, 68,112,103, 92,194,104, 95, 86, 87, 71,  //60
        116,195, 85, 93, 97,113,196,197,198,199,200,253,253,253,253,253,  //70
        206,207,208,209,210,211,212,213,120,214,215,216,217,218,219,220,  //80
        221, 78, 64, 83,121, 98,117,105,222,223,224,225,226,227,228,229,  //90
         88,230,231,232,233,122, 89,106,234,235,236,237,238, 45,239,240,  //a0
         73, 80,118,114,241,242,243,244,245, 62, 58,246,247,248,249,250,  //b0
         31, 32, 35, 43, 37, 44, 55, 47, 40, 59, 33, 46, 38, 36, 41, 30,  //c0
         39, 28, 34, 51, 48, 49, 53, 50, 54, 57, 61,251, 67,252, 60, 56,  //d0
          1, 18,  9, 20, 11,  3, 23, 15,  2, 26, 12, 10, 14,  6,  4, 13,  //e0
          7,  8,  5, 19, 29, 25, 22, 21, 27, 24, 17, 75, 52,253, 42, 16,  //f0
    };

    /// <summary>
    /// Creates the model with the windows-1251 table and the name "windows-1251".
    /// </summary>
    public Win1251BulgarianModel() : base(WIN1251_CHAR_TO_ORDER_MAP, "windows-1251")
    {
    }
}
|
||||
|
||||
}
|
@ -0,0 +1,345 @@
|
||||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Mozilla Public License Version
|
||||
* 1.1 (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/MPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is Mozilla Universal charset detector code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is
|
||||
* Netscape Communications Corporation.
|
||||
* Portions created by the Initial Developer are Copyright (C) 2001
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the MPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
namespace UniversalDetector.Core
|
||||
{
|
||||
public abstract class CyrillicModel : SequenceModel
|
||||
{
|
||||
// Model Table:
|
||||
// total sequences: 100%
|
||||
// first 512 sequences: 97.6601%
|
||||
// first 1024 sequences: 2.3389%
|
||||
// rest sequences: 0.1237%
|
||||
// negative sequences: 0.0009%
|
||||
protected readonly static byte[] RUSSIAN_LANG_MODEL = {
|
||||
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,1,3,3,3,3,1,3,3,3,2,3,2,3,3,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,2,2,2,2,2,0,0,2,
|
||||
3,3,3,2,3,3,3,3,3,3,3,3,3,3,2,3,3,0,0,3,3,3,3,3,3,3,3,3,2,3,2,0,
|
||||
0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,2,2,3,3,3,3,3,3,3,3,3,2,3,3,0,0,3,3,3,3,3,3,3,3,2,3,3,1,0,
|
||||
0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,2,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,0,0,3,3,3,3,3,3,3,3,3,3,3,2,1,
|
||||
0,0,0,0,0,0,0,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,0,0,3,3,3,3,3,3,3,3,3,3,3,2,1,
|
||||
0,0,0,0,0,1,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,3,3,3,2,2,2,3,1,3,3,1,3,3,3,3,2,2,3,0,2,2,2,3,3,2,1,0,
|
||||
0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,3,2,3,3,3,3,3,2,2,3,2,3,3,3,2,1,2,2,0,1,2,2,2,2,2,2,0,
|
||||
0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,3,0,2,2,3,3,2,1,2,0,
|
||||
0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,1,0,0,2,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,3,2,3,3,1,2,3,2,2,3,2,3,3,3,3,2,2,3,0,3,2,2,3,1,1,1,0,
|
||||
0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,3,3,3,2,2,3,3,3,3,3,2,3,3,3,3,2,2,2,0,3,3,3,2,2,2,2,0,
|
||||
0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,3,3,3,3,3,2,3,2,3,3,3,3,3,3,2,3,2,2,0,1,3,2,1,2,2,1,0,
|
||||
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,3,3,3,3,3,3,2,1,1,3,0,1,1,1,1,2,1,1,0,2,2,2,1,2,0,1,0,
|
||||
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,3,2,3,3,2,2,2,2,1,3,2,3,2,3,2,1,2,2,0,1,1,2,1,2,1,2,0,
|
||||
0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,2,3,3,3,2,2,2,2,0,2,2,2,2,3,1,1,0,
|
||||
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,
|
||||
3,2,3,2,2,3,3,3,3,3,3,3,3,3,1,3,2,0,0,3,3,3,3,2,3,3,3,3,2,3,2,0,
|
||||
0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
2,3,3,3,3,3,2,2,3,3,0,2,1,0,3,2,3,2,3,0,0,1,2,0,0,1,0,1,2,1,1,0,
|
||||
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,0,3,0,2,3,3,3,3,2,3,3,3,3,1,2,2,0,0,2,3,2,2,2,3,2,3,2,2,3,0,0,
|
||||
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,2,3,0,2,3,2,3,0,1,2,3,3,2,0,2,3,0,0,2,3,2,2,0,1,3,1,3,2,2,1,0,
|
||||
0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,1,3,0,2,3,3,3,3,3,3,3,3,2,1,3,2,0,0,2,2,3,3,3,2,3,3,0,2,2,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,3,2,2,3,3,2,2,2,3,3,0,0,1,1,1,1,1,2,0,0,1,1,1,1,0,1,0,
|
||||
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,3,2,2,3,3,3,3,3,3,3,0,3,2,3,3,2,3,2,0,2,1,0,1,1,0,1,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,3,2,3,3,3,2,2,2,2,3,1,3,2,3,1,1,2,1,0,2,2,2,2,1,3,1,0,
|
||||
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,
|
||||
2,2,3,3,3,3,3,1,2,2,1,3,1,0,3,0,0,3,0,0,0,1,1,0,1,2,1,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,2,2,1,1,3,3,3,2,2,1,2,2,3,1,1,2,0,0,2,2,1,3,0,0,2,1,1,2,1,1,0,
|
||||
0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,2,3,3,3,3,1,2,2,2,1,2,1,3,3,1,1,2,1,2,1,2,2,0,2,0,0,1,1,0,1,0,
|
||||
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
2,3,3,3,3,3,2,1,3,2,2,3,2,0,3,2,0,3,0,1,0,1,1,0,0,1,1,1,1,0,1,0,
|
||||
0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,3,2,3,3,3,2,2,2,3,3,1,2,1,2,1,0,1,0,1,1,0,1,0,0,2,1,1,1,0,1,0,
|
||||
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,
|
||||
3,1,1,2,1,2,3,3,2,2,1,2,2,3,0,2,1,0,0,2,2,3,2,1,2,2,2,2,2,3,1,0,
|
||||
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,1,1,0,1,1,2,2,1,1,3,0,0,1,3,1,1,1,0,0,0,1,0,1,1,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
2,1,3,3,3,2,0,0,0,2,1,0,1,0,2,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
2,0,1,0,0,2,3,2,2,2,1,2,2,2,1,2,1,0,0,1,1,1,0,2,0,1,1,1,0,0,1,1,
|
||||
1,0,0,0,0,0,1,2,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,
|
||||
2,3,3,3,3,0,0,0,0,1,0,0,0,0,3,0,1,2,1,0,0,0,0,0,0,0,1,1,0,0,1,1,
|
||||
1,0,1,0,1,2,0,0,1,1,2,1,0,1,1,1,1,0,1,1,1,1,0,1,0,0,1,0,0,1,1,0,
|
||||
2,2,3,2,2,2,3,1,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,0,1,0,1,1,1,0,2,1,
|
||||
1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,0,1,0,1,1,0,1,1,1,0,1,1,0,
|
||||
3,3,3,2,2,2,2,3,2,2,1,1,2,2,2,2,1,1,3,1,2,1,2,0,0,1,1,0,1,0,2,1,
|
||||
1,1,1,1,1,2,1,0,1,1,1,1,0,1,0,0,1,1,0,0,1,0,1,0,0,1,0,0,0,1,1,0,
|
||||
2,0,0,1,0,3,2,2,2,2,1,2,1,2,1,2,0,0,0,2,1,2,2,1,1,2,2,0,1,1,0,2,
|
||||
1,1,1,1,1,0,1,1,1,2,1,1,1,2,1,0,1,2,1,1,1,1,0,1,1,1,0,0,1,0,0,1,
|
||||
1,3,2,2,2,1,1,1,2,3,0,0,0,0,2,0,2,2,1,0,0,0,0,0,0,1,0,0,0,0,1,1,
|
||||
1,0,1,1,0,1,0,1,1,0,1,1,0,2,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,1,1,0,
|
||||
2,3,2,3,2,1,2,2,2,2,1,0,0,0,2,0,0,1,1,0,0,0,0,0,0,0,1,1,0,0,2,1,
|
||||
1,1,2,1,0,2,0,0,1,0,1,0,0,1,0,0,1,1,0,1,1,0,0,0,0,0,1,0,0,0,0,0,
|
||||
3,0,0,1,0,2,2,2,3,2,2,2,2,2,2,2,0,0,0,2,1,2,1,1,1,2,2,0,0,0,1,2,
|
||||
1,1,1,1,1,0,1,2,1,1,1,1,1,1,1,0,1,1,1,1,1,1,0,1,1,1,1,1,1,0,0,1,
|
||||
2,3,2,3,3,2,0,1,1,1,0,0,1,0,2,0,1,1,3,1,0,0,0,0,0,0,0,1,0,0,2,1,
|
||||
1,1,1,1,1,1,1,0,1,0,1,1,1,1,0,1,1,1,0,0,1,1,0,1,0,0,0,0,0,0,1,0,
|
||||
2,3,3,3,3,1,2,2,2,2,0,1,1,0,2,1,1,1,2,1,0,1,1,0,0,1,0,1,0,0,2,0,
|
||||
0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
2,3,3,3,2,0,0,1,1,2,2,1,0,0,2,0,1,1,3,0,0,1,0,0,0,0,0,1,0,1,2,1,
|
||||
1,1,2,0,1,1,1,0,1,0,1,1,0,1,0,1,1,1,1,0,1,0,0,0,0,0,0,1,0,1,1,0,
|
||||
1,3,2,3,2,1,0,0,2,2,2,0,1,0,2,0,1,1,1,0,1,0,0,0,3,0,1,1,0,0,2,1,
|
||||
1,1,1,0,1,1,0,0,0,0,1,1,0,1,0,0,2,1,1,0,1,0,0,0,1,0,1,0,0,1,1,0,
|
||||
3,1,2,1,1,2,2,2,2,2,2,1,2,2,1,1,0,0,0,2,2,2,0,0,0,1,2,1,0,1,0,1,
|
||||
2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,2,1,1,1,0,1,0,1,1,0,1,1,1,0,0,1,
|
||||
3,0,0,0,0,2,0,1,1,1,1,1,1,1,0,1,0,0,0,1,1,1,0,1,0,1,1,0,0,1,0,1,
|
||||
1,1,0,0,1,0,0,0,1,0,1,1,0,0,1,0,1,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,
|
||||
1,3,3,2,2,0,0,0,2,2,0,0,0,1,2,0,1,1,2,0,0,0,0,0,0,0,0,1,0,0,2,1,
|
||||
0,1,1,0,0,1,1,0,0,0,1,1,0,1,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,
|
||||
2,3,2,3,2,0,0,0,0,1,1,0,0,0,2,0,2,0,2,0,0,0,0,0,1,0,0,1,0,0,1,1,
|
||||
1,1,2,0,1,2,1,0,1,1,2,1,1,1,1,1,2,1,1,0,1,0,0,1,1,1,1,1,0,1,1,0,
|
||||
1,3,2,2,2,1,0,0,2,2,1,0,1,2,2,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,1,
|
||||
0,0,1,1,0,1,1,0,0,1,1,0,1,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,
|
||||
1,0,0,1,0,2,3,1,2,2,2,2,2,2,1,1,0,0,0,1,0,1,0,2,1,1,1,0,0,0,0,1,
|
||||
1,1,0,1,1,0,1,1,1,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,
|
||||
2,0,2,0,0,1,0,3,2,1,2,1,2,2,0,1,0,0,0,2,1,0,0,2,1,1,1,1,0,2,0,2,
|
||||
2,1,1,1,1,1,1,1,1,1,1,1,1,2,1,0,1,1,1,1,0,0,0,1,1,1,1,0,1,0,0,1,
|
||||
1,2,2,2,2,1,0,0,1,0,0,0,0,0,2,0,1,1,1,1,0,0,0,0,1,0,1,2,0,0,2,0,
|
||||
1,0,1,1,1,2,1,0,1,0,1,1,0,0,1,0,1,1,1,0,1,0,0,0,1,0,0,1,0,1,1,0,
|
||||
2,1,2,2,2,0,3,0,1,1,0,0,0,0,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
|
||||
0,0,0,1,1,1,0,0,1,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,
|
||||
1,2,2,3,2,2,0,0,1,1,2,0,1,2,1,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,
|
||||
0,1,1,0,0,1,1,0,0,1,1,0,0,1,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,1,1,0,
|
||||
2,2,1,1,2,1,2,2,2,2,2,1,2,2,0,1,0,0,0,1,2,2,2,1,2,1,1,1,1,1,2,1,
|
||||
1,1,1,1,1,1,1,1,1,1,0,0,1,1,1,0,1,1,1,0,0,0,0,1,1,1,0,1,1,0,0,1,
|
||||
1,2,2,2,2,0,1,0,2,2,0,0,0,0,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,2,0,
|
||||
0,0,1,0,0,1,0,0,0,0,1,0,1,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
1,2,2,2,2,0,0,0,2,2,2,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,
|
||||
0,1,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
1,2,2,2,2,0,0,0,0,1,0,0,1,1,2,0,0,0,0,1,0,1,0,0,1,0,0,2,0,0,0,1,
|
||||
0,0,1,0,0,1,0,0,0,1,1,0,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,
|
||||
1,2,2,2,1,1,2,0,2,1,1,1,1,0,2,2,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,1,
|
||||
0,0,1,0,1,1,0,0,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,
|
||||
1,0,2,1,2,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,
|
||||
0,0,1,0,1,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,
|
||||
1,0,0,0,0,2,0,1,2,1,0,1,1,1,0,1,0,0,0,1,0,1,0,0,1,0,1,0,0,0,0,1,
|
||||
0,0,0,0,0,1,0,0,1,1,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,
|
||||
2,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
|
||||
1,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,
|
||||
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
|
||||
1,1,1,0,1,0,1,0,0,1,1,1,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,
|
||||
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
|
||||
1,1,0,1,1,0,1,0,1,0,0,0,0,1,1,0,1,1,0,0,0,0,0,1,0,1,1,0,1,0,0,0,
|
||||
0,1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,
|
||||
};
|
||||
|
||||
/// <summary>
/// Initialises the shared Russian sequence model for one concrete
/// single-byte encoding.
/// </summary>
/// <param name="charToOrderMap">Byte-to-order table of the encoding.</param>
/// <param name="name">Name of the encoding, passed through to the base class.</param>
public CyrillicModel(byte[] charToOrderMap, string name)
    : base(
        charToOrderMap,
        RUSSIAN_LANG_MODEL,
        0.976601f,  // matches "first 512 sequences: 97.6601%" in the table header
        false,      // NOTE(review): meaning is defined by SequenceModel, not visible here
        name)
{
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Russian sequence model bound to the KOI8-R byte-to-order table.
/// </summary>
public class Koi8rModel : CyrillicModel
{
    // Byte-to-order table for KOI8-R, one row per 16 byte values.
    private static readonly byte[] KOI8R_CHAR_TO_ORDER_MAP = {
        255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255,  //00
        255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,  //10
        253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,  //20
        252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253,  //30
        253,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154,  //40
        155,156,157,158,159,160,161,162,163,164,165,253,253,253,253,253,  //50
        253, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69,  //60
         67,179, 78, 73,180,181, 79,182,183,184,185,253,253,253,253,253,  //70
        191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,  //80
        207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,  //90
        223,224,225, 68,226,227,228,229,230,231,232,233,234,235,236,237,  //a0
        238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,  //b0
         27,  3, 21, 28, 13,  2, 39, 19, 26,  4, 23, 11,  8, 12,  5,  1,  //c0
         15, 16,  9,  7,  6, 14, 24, 10, 17, 18, 20, 25, 30, 29, 22, 54,  //d0
         59, 37, 44, 58, 41, 48, 53, 46, 55, 42, 60, 36, 49, 38, 31, 34,  //e0
         35, 43, 45, 32, 40, 52, 56, 33, 61, 62, 51, 57, 47, 63, 50, 70,  //f0
    };

    /// <summary>
    /// Creates the model with the KOI8-R table and the name "KOI8-R".
    /// </summary>
    public Koi8rModel()
        : base(KOI8R_CHAR_TO_ORDER_MAP, "KOI8-R")
    {
    }
}
|
||||
|
||||
/// <summary>
/// Cyrillic sequence model registered under the charset name "windows-1251".
/// </summary>
public class Win1251Model : CyrillicModel
{
    // Order table handed to CyrillicModel: 16 rows of 16 entries, one entry
    // per byte value. The trailing comment on each row is the index of the
    // row's first entry.
    private static readonly byte[] WIN1251_CHAR_TO_ORDER_MAP =
    {
        255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, // 0x00
        255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // 0x10
        253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, // 0x20
        252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, // 0x30
        253,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, // 0x40
        155,156,157,158,159,160,161,162,163,164,165,253,253,253,253,253, // 0x50
        253, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, // 0x60
         67,179, 78, 73,180,181, 79,182,183,184,185,253,253,253,253,253, // 0x70
        191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206, // 0x80
        207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222, // 0x90
        223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238, // 0xa0
        239,240,241,242,243,244,245,246, 68,247,248,249,250,251,252,253, // 0xb0
         37, 44, 33, 46, 41, 48, 56, 51, 42, 60, 36, 49, 38, 31, 34, 35, // 0xc0
         45, 32, 40, 52, 53, 55, 58, 50, 57, 63, 70, 62, 61, 47, 59, 43, // 0xd0
          3, 21, 10, 19, 13,  2, 24, 20,  4, 23, 11,  8, 12,  5,  1, 15, // 0xe0
          9,  7,  6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27, 16, // 0xf0
    };

    /// <summary>
    /// Creates the model by passing the windows-1251 order table and the
    /// name "windows-1251" to the CyrillicModel constructor.
    /// </summary>
    public Win1251Model()
        : base(WIN1251_CHAR_TO_ORDER_MAP, "windows-1251")
    {
    }
}
|
||||
|
||||
/// <summary>
/// Cyrillic sequence model registered under the charset name "ISO-8859-5".
/// </summary>
public class Latin5Model : CyrillicModel
{
    // Order table handed to CyrillicModel: 16 rows of 16 entries, one entry
    // per byte value. The trailing comment on each row is the index of the
    // row's first entry.
    private static readonly byte[] LATIN5_CHAR_TO_ORDER_MAP =
    {
        255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, // 0x00
        255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // 0x10
        253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, // 0x20
        252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, // 0x30
        253,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, // 0x40
        155,156,157,158,159,160,161,162,163,164,165,253,253,253,253,253, // 0x50
        253, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, // 0x60
         67,179, 78, 73,180,181, 79,182,183,184,185,253,253,253,253,253, // 0x70
        191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206, // 0x80
        207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222, // 0x90
        223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238, // 0xa0
         37, 44, 33, 46, 41, 48, 56, 51, 42, 60, 36, 49, 38, 31, 34, 35, // 0xb0
         45, 32, 40, 52, 53, 55, 58, 50, 57, 63, 70, 62, 61, 47, 59, 43, // 0xc0
          3, 21, 10, 19, 13,  2, 24, 20,  4, 23, 11,  8, 12,  5,  1, 15, // 0xd0
          9,  7,  6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27, 16, // 0xe0
        239, 68,240,241,242,243,244,245,246,247,248,249,250,251,252,255, // 0xf0
    };

    /// <summary>
    /// Creates the model by passing the ISO-8859-5 order table and the name
    /// "ISO-8859-5" to the CyrillicModel constructor.
    /// </summary>
    public Latin5Model()
        : base(LATIN5_CHAR_TO_ORDER_MAP, "ISO-8859-5")
    {
    }
}
|
||||
|
||||
/// <summary>
/// Cyrillic sequence model registered under the charset name
/// "x-mac-cyrillic".
/// </summary>
public class MacCyrillicModel : CyrillicModel
{
    // Order table handed to CyrillicModel: 16 rows of 16 entries, one entry
    // per byte value. The trailing comment on each row is the index of the
    // row's first entry.
    private static readonly byte[] MACCYRILLIC_CHAR_TO_ORDER_MAP =
    {
        255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, // 0x00
        255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // 0x10
        253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, // 0x20
        252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, // 0x30
        253,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, // 0x40
        155,156,157,158,159,160,161,162,163,164,165,253,253,253,253,253, // 0x50
        253, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, // 0x60
         67,179, 78, 73,180,181, 79,182,183,184,185,253,253,253,253,253, // 0x70
         37, 44, 33, 46, 41, 48, 56, 51, 42, 60, 36, 49, 38, 31, 34, 35, // 0x80
         45, 32, 40, 52, 53, 55, 58, 50, 57, 63, 70, 62, 61, 47, 59, 43, // 0x90
        191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206, // 0xa0
        207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222, // 0xb0
        223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238, // 0xc0
        239,240,241,242,243,244,245,246,247,248,249,250,251,252, 68, 16, // 0xd0
          3, 21, 10, 19, 13,  2, 24, 20,  4, 23, 11,  8, 12,  5,  1, 15, // 0xe0
          9,  7,  6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27,255, // 0xf0
    };

    /// <summary>
    /// Creates the model by passing the Mac Cyrillic order table and the
    /// name "x-mac-cyrillic" to the CyrillicModel constructor.
    /// </summary>
    public MacCyrillicModel()
        : base(MACCYRILLIC_CHAR_TO_ORDER_MAP, "x-mac-cyrillic")
    {
    }
}
|
||||
|
||||
/// <summary>
/// Cyrillic sequence model registered under the charset name "IBM855".
/// </summary>
public class Ibm855Model : CyrillicModel
{
    // Order table handed to CyrillicModel: 16 rows of 16 entries, one entry
    // per byte value. The trailing comment on each row is the index of the
    // row's first entry.
    // Named *_CHAR_TO_ORDER_MAP (previously *_BYTE_TO_ORDER_MAP) so that it
    // follows the same naming pattern as the tables of the sibling models.
    private static readonly byte[] IBM855_CHAR_TO_ORDER_MAP =
    {
        255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, // 0x00
        255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // 0x10
        253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, // 0x20
        252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, // 0x30
        253,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, // 0x40
        155,156,157,158,159,160,161,162,163,164,165,253,253,253,253,253, // 0x50
        253, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, // 0x60
         67,179, 78, 73,180,181, 79,182,183,184,185,253,253,253,253,253, // 0x70
        191,192,193,194, 68,195,196,197,198,199,200,201,202,203,204,205, // 0x80
        206,207,208,209,210,211,212,213,214,215,216,217, 27, 59, 54, 70, // 0x90
          3, 37, 21, 44, 28, 58, 13, 41,  2, 48, 39, 53, 19, 46,218,219, // 0xa0
        220,221,222,223,224, 26, 55,  4, 42,225,226,227,228, 23, 60,229, // 0xb0
        230,231,232,233,234,235, 11, 36,236,237,238,239,240,241,242,243, // 0xc0
          8, 49, 12, 38,  5, 31,  1, 34, 15,244,245,246,247, 35, 16,248, // 0xd0
         43,  9, 45,  7, 32,  6, 40, 14, 52, 24, 56, 10, 33, 17, 61,249, // 0xe0
        250, 18, 62, 20, 51, 25, 57, 30, 47, 29, 63, 22, 50,251,252,255, // 0xf0
    };

    /// <summary>
    /// Creates the model by passing the IBM855 order table and the name
    /// "IBM855" to the CyrillicModel constructor.
    /// </summary>
    public Ibm855Model()
        : base(IBM855_CHAR_TO_ORDER_MAP, "IBM855")
    {
    }
}
|
||||
|
||||
/// <summary>
/// Cyrillic sequence model registered under the charset name "IBM866".
/// </summary>
public class Ibm866Model : CyrillicModel
{
    // Order table handed to CyrillicModel: 16 rows of 16 entries, one entry
    // per byte value. The trailing comment on each row is the index of the
    // row's first entry.
    private static readonly byte[] IBM866_CHAR_TO_ORDER_MAP =
    {
        255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, // 0x00
        255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // 0x10
        253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, // 0x20
        252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, // 0x30
        253,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, // 0x40
        155,156,157,158,159,160,161,162,163,164,165,253,253,253,253,253, // 0x50
        253, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, // 0x60
         67,179, 78, 73,180,181, 79,182,183,184,185,253,253,253,253,253, // 0x70
         37, 44, 33, 46, 41, 48, 56, 51, 42, 60, 36, 49, 38, 31, 34, 35, // 0x80
         45, 32, 40, 52, 53, 55, 58, 50, 57, 63, 70, 62, 61, 47, 59, 43, // 0x90
          3, 21, 10, 19, 13,  2, 24, 20,  4, 23, 11,  8, 12,  5,  1, 15, // 0xa0
        191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206, // 0xb0
        207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222, // 0xc0
        223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238, // 0xd0
          9,  7,  6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27, 16, // 0xe0
        239, 68,240,241,242,243,244,245,246,247,248,249,250,251,252,255, // 0xf0
    };

    /// <summary>
    /// Creates the model by passing the IBM866 order table and the name
    /// "IBM866" to the CyrillicModel constructor.
    /// </summary>
    public Ibm866Model()
        : base(IBM866_CHAR_TO_ORDER_MAP, "IBM866")
    {
    }
}
|
||||
|
||||
|
||||
|
||||
}
|
@ -0,0 +1,244 @@
|
||||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Mozilla Public License Version
|
||||
* 1.1 (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/MPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is Mozilla Universal charset detector code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is
|
||||
* Netscape Communications Corporation.
|
||||
* Portions created by the Initial Developer are Copyright (C) 2001
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the MPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
namespace UniversalDetector.Core
|
||||
{
|
||||
public abstract class GreekModel : SequenceModel
|
||||
{
|
||||
// Model Table:
|
||||
// total sequences: 100%
|
||||
// first 512 sequences: 98.2851%
|
||||
// first 1024 sequences:1.7001%
|
||||
// rest sequences: 0.0359%
|
||||
// negative sequences: 0.0148%
|
||||
private readonly static byte[] GREEK_LANG_MODEL = {
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,3,2,2,3,3,3,3,3,3,3,3,1,3,3,3,0,2,2,3,3,0,3,0,3,2,0,3,3,3,0,
|
||||
3,0,0,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,3,3,3,3,3,0,3,3,0,3,2,3,3,0,3,2,3,3,3,0,0,3,0,3,0,3,3,2,0,0,0,
|
||||
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,
|
||||
0,2,3,2,2,3,3,3,3,3,3,3,3,0,3,3,3,3,0,2,3,3,0,3,3,3,3,2,3,3,3,0,
|
||||
2,0,0,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,2,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,0,2,1,3,3,3,3,2,3,3,2,3,3,2,0,
|
||||
0,0,0,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,3,3,3,3,0,3,3,3,3,3,3,0,3,3,0,3,3,3,3,3,3,3,3,3,3,0,3,2,3,3,0,
|
||||
2,0,1,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
|
||||
0,3,3,3,3,3,2,3,0,0,0,0,3,3,0,3,1,3,3,3,0,3,3,0,3,3,3,3,0,0,0,0,
|
||||
2,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,3,3,3,3,3,0,3,0,3,3,3,3,3,0,3,2,2,2,3,0,2,3,3,3,3,3,2,3,3,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,3,3,3,3,3,3,2,2,2,3,3,3,3,0,3,1,3,3,3,3,2,3,3,3,3,3,3,3,2,2,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,3,3,3,3,3,2,0,3,0,0,0,3,3,2,3,3,3,3,3,0,0,3,2,3,0,2,3,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,3,0,3,3,3,3,0,0,3,3,0,2,3,0,3,0,3,3,3,0,0,3,0,3,0,2,2,3,3,0,0,
|
||||
0,0,1,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,3,3,3,3,3,2,0,3,2,3,3,3,3,0,3,3,3,3,3,0,3,3,2,3,2,3,3,2,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,3,3,2,3,2,3,3,3,3,3,3,0,2,3,2,3,2,2,2,3,2,3,3,2,3,0,2,2,2,3,0,
|
||||
2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,3,0,0,0,3,3,3,2,3,3,0,0,3,0,3,0,0,0,3,2,0,3,0,3,0,0,2,0,2,0,
|
||||
0,0,0,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,3,3,3,3,0,3,3,3,3,3,3,0,3,3,0,3,0,0,0,3,3,0,3,3,3,0,0,1,2,3,0,
|
||||
3,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,3,3,3,3,3,2,0,0,3,2,2,3,3,0,3,3,3,3,3,2,1,3,0,3,2,3,3,2,1,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,3,3,0,2,3,3,3,3,3,3,0,0,3,0,3,0,0,0,3,3,0,3,2,3,0,0,3,3,3,0,
|
||||
3,0,0,0,2,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,3,3,3,3,0,3,3,3,3,3,3,0,0,3,0,3,0,0,0,3,2,0,3,2,3,0,0,3,2,3,0,
|
||||
2,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,3,1,2,2,3,3,3,3,3,3,0,2,3,0,3,0,0,0,3,3,0,3,0,2,0,0,2,3,1,0,
|
||||
2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,3,0,3,3,3,3,0,3,0,3,3,2,3,0,3,3,3,3,3,3,0,3,3,3,0,2,3,0,0,3,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,3,0,3,3,3,0,0,3,0,0,0,3,3,0,3,0,2,3,3,0,0,3,0,3,0,3,3,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,3,0,0,0,3,3,3,3,3,3,0,0,3,0,2,0,0,0,3,3,0,3,0,3,0,0,2,0,2,0,
|
||||
0,0,0,0,1,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,3,3,3,3,3,3,0,3,0,2,0,3,2,0,3,2,3,2,3,0,0,3,2,3,2,3,3,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,3,0,0,2,3,3,3,3,3,0,0,0,3,0,2,1,0,0,3,2,2,2,0,3,0,0,2,2,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,3,0,3,3,3,2,0,3,0,3,0,3,3,0,2,1,2,3,3,0,0,3,0,3,0,3,3,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,2,3,3,3,0,3,3,3,3,3,3,0,2,3,0,3,0,0,0,2,1,0,2,2,3,0,0,2,2,2,0,
|
||||
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,3,0,0,2,3,3,3,2,3,0,0,1,3,0,2,0,0,0,0,3,0,1,0,2,0,0,1,1,1,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,3,3,3,3,3,1,0,3,0,0,0,3,2,0,3,2,3,3,3,0,0,3,0,3,2,2,2,1,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,3,0,3,3,3,0,0,3,0,0,0,0,2,0,2,3,3,2,2,2,2,3,0,2,0,2,2,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,3,3,3,3,2,0,0,0,0,0,0,2,3,0,2,0,2,3,2,0,0,3,0,3,0,3,1,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,3,2,3,3,2,2,3,0,2,0,3,0,0,0,2,0,0,0,0,1,2,0,2,0,2,0,
|
||||
0,2,0,2,0,2,2,0,0,1,0,2,2,2,0,2,2,2,0,2,2,2,0,0,2,0,0,1,0,0,0,0,
|
||||
0,2,0,3,3,2,0,0,0,0,0,0,1,3,0,2,0,2,2,2,0,0,2,0,3,0,0,2,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,3,0,2,3,2,0,2,2,0,2,0,2,2,0,2,0,2,2,2,0,0,0,0,0,0,2,3,0,0,0,2,
|
||||
0,1,2,0,0,0,0,2,2,0,0,0,2,1,0,2,2,0,0,0,0,0,0,1,0,2,0,0,0,0,0,0,
|
||||
0,0,2,1,0,2,3,2,2,3,2,3,2,0,0,3,3,3,0,0,3,2,0,0,0,1,1,0,2,0,2,2,
|
||||
0,2,0,2,0,2,2,0,0,2,0,2,2,2,0,2,2,2,2,0,0,2,0,0,0,2,0,1,0,0,0,0,
|
||||
0,3,0,3,3,2,2,0,3,0,0,0,2,2,0,2,2,2,1,2,0,0,1,2,2,0,0,3,0,0,0,2,
|
||||
0,1,2,0,0,0,1,2,0,0,0,0,0,0,0,2,2,0,1,0,0,2,0,0,0,2,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,2,3,3,2,2,0,0,0,2,0,2,3,3,0,2,0,0,0,0,0,0,2,2,2,0,2,2,0,2,0,2,
|
||||
0,2,2,0,0,2,2,2,2,1,0,0,2,2,0,2,0,0,2,0,0,0,0,0,0,2,0,0,0,0,0,0,
|
||||
0,2,0,3,2,3,0,0,0,3,0,0,2,2,0,2,0,2,2,2,0,0,2,0,0,0,0,0,0,0,0,2,
|
||||
0,0,2,2,0,0,2,2,2,0,0,0,0,0,0,2,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,2,0,0,3,2,0,2,2,2,2,2,0,0,0,2,0,0,0,0,2,0,1,0,0,2,0,1,0,0,0,
|
||||
0,2,2,2,0,2,2,0,1,2,0,2,2,2,0,2,2,2,2,1,2,2,0,0,2,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
|
||||
0,2,0,2,0,2,2,0,0,0,0,1,2,1,0,0,2,2,0,0,2,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,3,2,3,0,0,2,0,0,0,2,2,0,2,0,0,0,1,0,0,2,0,2,0,2,2,0,0,0,0,
|
||||
0,0,2,0,0,0,0,2,2,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,
|
||||
0,2,2,3,2,2,0,0,0,0,0,0,1,3,0,2,0,2,2,0,0,0,1,0,2,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,2,0,2,0,3,2,0,2,0,0,0,0,0,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
|
||||
0,0,2,0,0,0,0,1,1,0,0,2,1,2,0,2,2,0,1,0,0,1,0,0,0,2,0,0,0,0,0,0,
|
||||
0,3,0,2,2,2,0,0,2,0,0,0,2,0,0,0,2,3,0,2,0,0,0,0,0,0,2,2,0,0,0,2,
|
||||
0,1,2,0,0,0,1,2,2,1,0,0,0,2,0,0,2,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,2,1,2,0,2,2,0,2,0,0,2,0,0,0,0,1,2,1,0,2,1,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,2,0,0,0,3,1,2,2,0,2,0,0,0,0,2,0,0,0,2,0,0,3,0,0,0,0,2,2,2,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,2,1,0,2,0,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,1,0,0,0,0,0,0,2,
|
||||
0,2,2,0,0,2,2,2,2,2,0,1,2,0,0,0,2,2,0,1,0,2,0,0,2,2,0,0,0,0,0,0,
|
||||
0,0,0,0,1,0,0,0,0,0,0,0,3,0,0,2,0,0,0,0,0,0,0,0,2,0,2,0,0,0,0,2,
|
||||
0,1,2,0,0,0,0,2,2,1,0,1,0,1,0,2,2,2,1,0,0,0,0,0,0,1,0,0,0,0,0,0,
|
||||
0,2,0,1,2,0,0,0,0,0,0,0,0,0,0,2,0,0,2,2,0,0,0,0,1,0,0,0,0,0,0,2,
|
||||
0,2,2,0,0,0,0,2,2,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,2,0,0,2,0,0,0,
|
||||
0,2,2,2,2,0,0,0,3,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,2,0,0,0,0,0,0,1,
|
||||
0,0,2,0,0,0,0,1,2,0,0,0,0,0,0,2,2,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,
|
||||
0,2,0,2,2,2,0,0,2,0,0,0,0,0,0,0,2,2,2,0,0,0,2,0,0,0,0,0,0,0,0,2,
|
||||
0,0,1,0,0,0,0,2,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,
|
||||
0,3,0,2,0,0,0,0,0,0,0,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,2,0,0,0,0,2,
|
||||
0,0,2,0,0,0,0,2,2,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,2,0,2,2,1,0,0,0,0,0,0,2,0,0,2,0,2,2,2,0,0,0,0,0,0,2,0,0,0,0,2,
|
||||
0,0,2,0,0,2,0,2,2,0,0,0,0,2,0,2,0,0,0,0,0,2,0,0,0,2,0,0,0,0,0,0,
|
||||
0,0,3,0,0,0,2,2,0,2,2,0,0,0,0,0,2,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,2,0,0,0,0,0,
|
||||
0,2,2,2,2,2,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,
|
||||
0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,2,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
|
||||
0,2,0,0,0,2,0,0,0,0,0,1,0,0,0,0,2,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,2,0,0,0,
|
||||
0,2,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,2,0,2,0,0,0,
|
||||
0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,2,0,0,0,1,2,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
};
|
||||
|
||||
/// <summary>
/// Initializes the shared Greek model for one concrete charset.
/// </summary>
/// <param name="charToOrderMap">Charset-specific order table supplied by the derived model.</param>
/// <param name="name">Charset name supplied by the derived model (e.g. "ISO-8859-7").</param>
/// <remarks>
/// Forwards GREEK_LANG_MODEL together with 0.982851f, which matches the
/// "first 512 sequences: 98.2851%" figure noted above the table.
/// NOTE(review): the meaning of the boolean argument (false) is defined by
/// SequenceModel, which is not visible from here - confirm before changing.
/// The constructor is protected because this class is abstract and can only
/// be constructed through a derived model.
/// </remarks>
protected GreekModel(byte[] charToOrderMap, string name)
    : base(charToOrderMap, GREEK_LANG_MODEL, 0.982851f, false, name)
{
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Greek sequence model registered under the charset name "ISO-8859-7".
/// </summary>
public class Latin7Model : GreekModel
{
    // Character mapping table: 16 rows of 16 entries, one entry per byte
    // value. The trailing comment on each row is the index of the row's
    // first entry.
    //
    // Special values:
    //   255: control characters that usually do not exist in any text
    //   254: carriage return / line feed (indices 0x0d and 0x0a)
    //   253: symbol (punctuation) that does not belong to a word
    //   252: digits 0 - 9
    private static readonly byte[] LATIN7_CHAR_TO_ORDER_MAP =
    {
        255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, // 0x00
        255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // 0x10
        253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, // 0x20
        252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, // 0x30
        253, 82,100,104, 94, 98,101,116,102,111,187,117, 92, 88,113, 85, // 0x40
         79,118,105, 83, 67,114,119, 95, 99,109,188,253,253,253,253,253, // 0x50
        253, 72, 70, 80, 81, 60, 96, 93, 89, 68,120, 97, 77, 86, 69, 55, // 0x60
         78,115, 65, 66, 58, 76,106,103, 87,107,112,253,253,253,253,253, // 0x70
        255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // 0x80
        255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // 0x90
        253,233, 90,253,253,253,253,253,253,253,253,253,253, 74,253,253, // 0xa0
        253,253,253,253,247,248, 61, 36, 46, 71, 73,253, 54,253,108,123, // 0xb0
        110, 31, 51, 43, 41, 34, 91, 40, 52, 47, 44, 53, 38, 49, 59, 39, // 0xc0
         35, 48,250, 37, 33, 45, 56, 50, 84, 57,120,121, 17, 18, 22, 15, // 0xd0
        124,  1, 29, 20, 21,  3, 32, 13, 25,  5, 11, 16, 10,  6, 30,  4, // 0xe0
          9,  8, 14,  7,  2, 12, 28, 23, 42, 24, 64, 75, 19, 26, 27,253, // 0xf0
    };

    /// <summary>
    /// Creates the model by passing the ISO-8859-7 order table and the name
    /// "ISO-8859-7" to the GreekModel constructor.
    /// </summary>
    public Latin7Model()
        : base(LATIN7_CHAR_TO_ORDER_MAP, "ISO-8859-7")
    {
    }
}
|
||||
|
||||
/// <summary>
/// Greek sequence model registered under the charset name "windows-1253".
/// </summary>
public class Win1253Model : GreekModel
{
    // Character mapping table: 16 rows of 16 entries, one entry per byte
    // value. The trailing comment on each row is the index of the row's
    // first entry.
    // The field name uses a single underscore (previously
    // WIN1253__CHAR_TO_ORDER_MAP) to match the tables of the sibling models.
    private static readonly byte[] WIN1253_CHAR_TO_ORDER_MAP =
    {
        255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, // 0x00
        255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // 0x10
        253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, // 0x20
        252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, // 0x30
        253, 82,100,104, 94, 98,101,116,102,111,187,117, 92, 88,113, 85, // 0x40
         79,118,105, 83, 67,114,119, 95, 99,109,188,253,253,253,253,253, // 0x50
        253, 72, 70, 80, 81, 60, 96, 93, 89, 68,120, 97, 77, 86, 69, 55, // 0x60
         78,115, 65, 66, 58, 76,106,103, 87,107,112,253,253,253,253,253, // 0x70
        255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // 0x80
        255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, // 0x90
        253,233, 61,253,253,253,253,253,253,253,253,253,253, 74,253,253, // 0xa0
        253,253,253,253,247,253,253, 36, 46, 71, 73,253, 54,253,108,123, // 0xb0
        110, 31, 51, 43, 41, 34, 91, 40, 52, 47, 44, 53, 38, 49, 59, 39, // 0xc0
         35, 48,250, 37, 33, 45, 56, 50, 84, 57,120,121, 17, 18, 22, 15, // 0xd0
        124,  1, 29, 20, 21,  3, 32, 13, 25,  5, 11, 16, 10,  6, 30,  4, // 0xe0
          9,  8, 14,  7,  2, 12, 28, 23, 42, 24, 64, 75, 19, 26, 27,253, // 0xf0
    };

    /// <summary>
    /// Creates the model by passing the windows-1253 order table and the
    /// name "windows-1253" to the GreekModel constructor.
    /// </summary>
    public Win1253Model()
        : base(WIN1253_CHAR_TO_ORDER_MAP, "windows-1253")
    {
    }
}
|
||||
}
|
@ -0,0 +1,220 @@
|
||||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Mozilla Public License Version
|
||||
* 1.1 (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/MPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is Mozilla Universal charset detector code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is
|
||||
* Netscape Communications Corporation.
|
||||
* Portions created by the Initial Developer are Copyright (C) 2001
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the MPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
namespace UniversalDetector.Core
|
||||
{
|
||||
public abstract class HebrewModel : SequenceModel
|
||||
{
|
||||
//Model Table:
|
||||
//total sequences: 100%
|
||||
//first 512 sequences: 98.4004%
|
||||
//first 1024 sequences: 1.5981%
|
||||
//rest sequences: 0.087%
|
||||
//negative sequences: 0.0015%
|
||||
private readonly static byte[] HEBREW_LANG_MODEL = {
|
||||
0,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,2,3,2,1,2,0,1,0,0,
|
||||
3,0,3,1,0,0,1,3,2,0,1,1,2,0,2,2,2,1,1,1,1,2,1,1,1,2,0,0,2,2,0,1,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,
|
||||
1,2,1,2,1,2,0,0,2,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,
|
||||
1,2,1,3,1,1,0,0,2,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,0,1,2,2,1,3,
|
||||
1,2,1,1,2,2,0,0,2,2,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,1,0,1,1,0,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,2,2,2,3,2,
|
||||
1,2,1,2,2,2,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,3,2,2,3,2,2,2,1,2,2,2,2,
|
||||
1,2,1,1,2,2,0,1,2,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,0,2,2,2,2,2,
|
||||
0,2,0,2,2,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,0,2,2,2,
|
||||
0,2,1,2,2,2,0,0,2,1,0,0,0,0,1,0,1,0,0,0,0,0,0,2,0,0,0,0,0,0,1,0,
|
||||
3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,2,1,2,3,2,2,2,
|
||||
1,2,1,2,2,2,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,
|
||||
3,3,3,3,3,3,3,3,3,2,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,1,0,2,0,2,
|
||||
0,2,1,2,2,2,0,0,1,2,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,2,0,0,1,0,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,3,2,2,3,2,1,2,1,1,1,
|
||||
0,1,1,1,1,1,3,0,1,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,
|
||||
3,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,1,1,0,0,1,0,0,1,0,0,0,0,
|
||||
0,0,1,0,0,0,0,0,2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,2,2,
|
||||
0,2,0,1,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,
|
||||
3,3,3,3,3,3,3,3,3,2,3,3,3,2,1,2,3,3,2,3,3,3,3,2,3,2,1,2,0,2,1,2,
|
||||
0,2,0,2,2,2,0,0,1,2,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,
|
||||
3,3,3,3,3,3,3,3,3,2,3,3,3,1,2,2,3,3,2,3,2,3,2,2,3,1,2,2,0,2,2,2,
|
||||
0,2,1,2,2,2,0,0,1,2,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,2,3,3,2,2,2,3,3,3,3,1,3,2,2,2,
|
||||
0,2,0,1,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,3,3,2,3,2,2,2,1,2,2,0,2,2,2,2,
|
||||
0,2,0,2,2,2,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,
|
||||
3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,1,3,2,3,3,2,3,3,2,2,1,2,2,2,2,2,2,
|
||||
0,2,1,2,1,2,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,
|
||||
3,3,3,3,3,3,2,3,2,3,3,2,3,3,3,3,2,3,2,3,3,3,3,3,2,2,2,2,2,2,2,1,
|
||||
0,2,0,1,2,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,
|
||||
3,3,3,3,3,3,3,3,3,2,1,2,3,3,3,3,3,3,3,2,3,2,3,2,1,2,3,0,2,1,2,2,
|
||||
0,2,1,1,2,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,2,0,
|
||||
3,3,3,3,3,3,3,3,3,2,3,3,3,3,2,1,3,1,2,2,2,1,2,3,3,1,2,1,2,2,2,2,
|
||||
0,1,1,1,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,2,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,3,3,3,3,3,0,2,3,3,3,1,3,3,3,1,2,2,2,2,1,1,2,2,2,2,2,2,
|
||||
0,2,0,1,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,
|
||||
3,3,3,3,3,3,2,3,3,3,2,2,3,3,3,2,1,2,3,2,3,2,2,2,2,1,2,1,1,1,2,2,
|
||||
0,2,1,1,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,
|
||||
3,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,0,0,0,1,0,0,0,0,0,
|
||||
1,0,1,0,0,0,0,0,2,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,2,3,3,2,3,1,2,2,2,2,3,2,3,1,1,2,2,1,2,2,1,1,0,2,2,2,2,
|
||||
0,1,0,1,2,2,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,
|
||||
3,0,0,1,1,0,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,2,0,
|
||||
0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,0,1,0,1,0,1,1,0,1,1,0,0,0,1,1,0,1,1,1,0,0,0,0,0,0,1,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,0,0,0,1,1,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
|
||||
3,2,2,1,2,2,2,2,2,2,2,1,2,2,1,2,2,1,1,1,1,1,1,1,1,2,1,1,0,3,3,3,
|
||||
0,3,0,2,2,2,2,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,
|
||||
2,2,2,3,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,2,2,1,2,2,2,1,1,1,2,0,1,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
2,2,2,2,2,2,2,2,2,2,2,1,2,2,2,2,2,2,2,2,2,2,2,0,2,2,0,0,0,0,0,0,
|
||||
0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
2,3,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,2,1,0,2,1,0,
|
||||
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,1,1,1,1,1,1,1,1,1,1,0,0,1,1,1,1,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,
|
||||
0,3,1,1,2,2,2,2,2,1,2,2,2,1,1,2,2,2,2,2,2,2,1,2,2,1,0,1,1,1,1,0,
|
||||
0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,2,1,1,1,1,2,1,1,2,1,0,1,1,1,1,1,1,1,1,1,1,1,0,1,0,0,0,0,0,0,0,
|
||||
0,0,2,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,0,0,0,0,0,0,1,0,0,
|
||||
2,1,1,2,2,2,2,2,2,2,2,2,2,2,1,2,2,2,2,2,1,2,1,2,1,1,1,1,0,0,0,0,
|
||||
0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
1,2,1,2,2,2,2,2,2,2,2,2,2,1,2,1,2,1,1,2,1,1,1,2,1,2,1,2,0,1,0,1,
|
||||
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,3,1,2,2,2,1,2,2,2,2,2,2,2,2,1,2,1,1,1,1,1,1,2,1,2,1,1,0,1,0,1,
|
||||
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
2,1,2,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,2,
|
||||
0,2,0,1,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,
|
||||
3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
2,1,1,1,1,1,1,1,0,1,1,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,2,0,1,1,1,0,1,0,0,0,1,1,0,1,1,0,0,0,0,0,1,1,0,0,
|
||||
0,1,1,1,2,1,2,2,2,0,2,0,2,0,1,1,2,1,1,1,1,2,1,0,1,1,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
1,0,1,0,0,0,0,0,1,0,1,2,2,0,1,0,0,1,1,2,2,1,2,0,2,0,0,0,1,2,0,1,
|
||||
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,2,0,2,1,2,0,2,0,0,1,1,1,1,1,1,0,1,0,0,0,1,0,0,1,
|
||||
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,1,0,0,0,0,0,1,0,2,1,1,0,1,0,0,1,1,1,2,2,0,0,1,0,0,0,1,0,0,1,
|
||||
1,1,2,1,0,1,1,1,0,1,0,1,1,1,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,2,2,1,
|
||||
0,2,0,1,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
2,1,0,0,1,0,1,1,1,1,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
1,1,1,1,1,1,1,1,1,2,1,0,1,1,1,1,1,1,1,1,1,1,1,0,1,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,1,1,1,0,1,1,0,1,0,0,0,1,1,0,1,
|
||||
2,0,1,0,1,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,1,0,1,1,1,0,1,0,0,1,1,2,1,1,2,0,1,0,0,0,1,1,0,1,
|
||||
1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,1,0,1,1,2,0,1,0,0,0,0,2,1,1,2,0,2,0,0,0,1,1,0,1,
|
||||
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,1,0,2,1,1,0,1,0,0,2,2,1,2,1,1,0,1,0,0,0,1,1,0,1,
|
||||
2,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,1,2,2,0,0,0,0,0,1,1,0,1,0,0,1,0,0,0,0,1,0,1,
|
||||
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,1,2,2,0,0,0,0,2,1,1,1,0,2,1,1,0,0,0,2,1,0,1,
|
||||
1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,1,0,1,1,2,0,1,0,0,1,1,0,2,1,1,0,1,0,0,0,1,1,0,1,
|
||||
2,2,1,1,1,0,1,1,0,1,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,1,0,2,1,1,0,1,0,0,1,1,0,1,2,1,0,2,0,0,0,1,1,0,1,
|
||||
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,
|
||||
0,1,0,0,2,0,2,1,1,0,1,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,1,0,1,1,2,0,1,0,0,1,1,1,0,1,0,0,1,0,0,0,1,0,0,1,
|
||||
1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
1,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,2,1,1,1,1,1,0,1,0,0,0,0,1,0,1,
|
||||
0,1,1,1,2,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,1,2,1,0,0,0,0,0,1,1,1,1,1,0,1,0,0,0,1,1,0,0,
|
||||
};
|
||||
|
||||
// Builds a Hebrew sequence model. The caller supplies the code-page-specific
// byte-to-order map and the charset name; the shared HEBREW_LANG_MODEL table
// and the fixed tuning values below are forwarded to the base class.
public HebrewModel(byte[] charToOrderMap, string name)
    : base(
        charToOrderMap,
        HEBREW_LANG_MODEL,
        0.984004f,  // NOTE(review): presumably the typical positive ratio expected by SequenceModel - confirm
        false,      // NOTE(review): flag semantics are defined by SequenceModel's constructor - confirm
        name)
{
}
|
||||
}
|
||||
|
||||
// Hebrew sequence model bound to the windows-1255 code page.
public class Win1255Model : HebrewModel
{
    // Windows-1255 language model
    // Character Mapping Table: indexed by byte value, 16 entries per row.
    //
    // Reserved order values:
    //   255: control characters that usually do not exist in any text
    //   254: carriage return / line feed (bytes 0x0D and 0x0A)
    //   253: symbols (punctuation) that do not belong to a word
    //   252: digits 0 - 9
    private static readonly byte[] WIN1255_CHAR_TO_ORDER_MAP = {
        255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255,  // 00
        255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,  // 10
        253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,  // 20
        252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253,  // 30
        253, 69, 91, 79, 80, 92, 89, 97, 90, 68,111,112, 82, 73, 95, 85,  // 40
         78,121, 86, 71, 67,102,107, 84,114,103,115,253,253,253,253,253,  // 50
        253, 50, 74, 60, 61, 42, 76, 70, 64, 53,105, 93, 56, 65, 54, 49,  // 60
         66,110, 51, 43, 44, 63, 81, 77, 98, 75,108,253,253,253,253,253,  // 70
        124,202,203,204,205, 40, 58,206,207,208,209,210,211,212,213,214,  // 80
        215, 83, 52, 47, 46, 72, 32, 94,216,113,217,109,218,219,220,221,  // 90
         34,116,222,118,100,223,224,117,119,104,125,225,226, 87, 99,227,  // A0
        106,122,123,228, 55,229,230,101,231,232,120,233, 48, 39, 57,234,  // B0
         30, 59, 41, 88, 33, 37, 36, 31, 29, 35,235, 62, 28,236,126,237,  // C0
        238, 38, 45,239,240,241,242,243,127,244,245,246,247,248,249,250,  // D0
          9,  8, 20, 16,  3,  2, 24, 14, 22,  1, 25, 15,  4, 11,  6, 23,  // E0
         12, 19, 13, 26, 18, 27, 21, 17,  7, 10,  5,251,252,128, 96,253,  // F0
    };

    // Wires the windows-1255 map and charset name into the shared Hebrew model.
    public Win1255Model()
        : base(WIN1255_CHAR_TO_ORDER_MAP, "windows-1255")
    {
    }
}
|
||||
|
||||
}
|
@ -0,0 +1,238 @@
|
||||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Mozilla Public License Version
|
||||
* 1.1 (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/MPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is Mozilla Universal charset detector code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is
|
||||
* Netscape Communications Corporation.
|
||||
* Portions created by the Initial Developer are Copyright (C) 2001
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the MPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
namespace UniversalDetector.Core
|
||||
{
|
||||
public abstract class HungarianModel : SequenceModel
|
||||
{
|
||||
//Model Table:
|
||||
//total sequences: 100%
|
||||
//first 512 sequences: 94.7368%
|
||||
//first 1024 sequences:5.2623%
|
||||
//rest sequences: 0.8894%
|
||||
//negative sequences: 0.0009%
|
||||
private readonly static byte[] HUNGARIAN_LANG_MODEL = {
|
||||
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
|
||||
3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,2,3,3,1,1,2,2,2,2,2,1,2,
|
||||
3,2,2,3,3,3,3,3,2,3,3,3,3,3,3,1,2,3,3,3,3,2,3,3,1,1,3,3,0,1,1,1,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,
|
||||
3,2,1,3,3,3,3,3,2,3,3,3,3,3,1,1,2,3,3,3,3,3,3,3,1,1,3,2,0,1,1,1,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,
|
||||
3,3,3,3,3,3,3,3,3,3,3,1,1,2,3,3,3,1,3,3,3,3,3,1,3,3,2,2,0,3,2,3,
|
||||
0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,3,2,3,3,3,2,3,3,2,3,3,3,3,3,2,3,3,2,2,3,2,3,2,0,3,2,2,
|
||||
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,
|
||||
3,3,3,3,3,3,2,3,3,3,3,3,2,3,3,3,1,2,3,2,2,3,1,2,3,3,2,2,0,3,3,3,
|
||||
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,3,3,3,3,3,2,2,3,3,3,3,3,3,2,3,3,3,3,2,3,3,3,3,0,2,3,2,
|
||||
0,0,0,1,1,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,3,3,3,3,3,3,1,1,1,3,3,2,1,3,2,2,3,2,1,3,2,2,1,0,3,3,1,
|
||||
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
|
||||
3,2,2,3,3,3,3,3,1,2,3,3,3,3,1,2,1,3,3,3,3,2,2,3,1,1,3,2,0,1,1,1,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,
|
||||
3,3,3,3,3,3,3,3,2,2,3,3,3,3,3,2,1,3,3,3,3,3,2,2,1,3,3,3,0,1,1,2,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,2,3,3,2,3,3,3,2,0,3,2,3,
|
||||
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,1,0,
|
||||
3,3,3,3,3,3,2,3,3,3,2,3,2,3,3,3,1,3,2,2,2,3,1,1,3,3,1,1,0,3,3,2,
|
||||
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,3,3,2,3,3,3,2,3,2,3,3,3,2,3,3,3,3,3,1,2,3,2,2,0,2,2,2,
|
||||
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
|
||||
3,3,3,2,2,2,3,1,3,3,2,2,1,3,3,3,1,1,3,1,2,3,2,3,2,2,2,1,0,2,2,2,
|
||||
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,
|
||||
3,1,1,3,3,3,3,3,1,2,3,3,3,3,1,2,1,3,3,3,2,2,3,2,1,0,3,2,0,1,1,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,1,1,3,3,3,3,3,1,2,3,3,3,3,1,1,0,3,3,3,3,0,2,3,0,0,2,1,0,1,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,3,2,2,3,3,2,2,2,2,3,3,0,1,2,3,2,3,2,2,3,2,1,2,0,2,2,2,
|
||||
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,3,1,2,3,3,3,2,1,2,3,3,2,2,2,3,2,3,3,1,3,3,1,1,0,2,3,2,
|
||||
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
|
||||
3,3,3,1,2,2,2,2,3,3,3,1,1,1,3,3,1,1,3,1,1,3,2,1,2,3,1,1,0,2,2,2,
|
||||
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
|
||||
3,3,3,2,1,2,1,1,3,3,1,1,1,1,3,3,1,1,2,2,1,2,1,1,2,2,1,1,0,2,2,1,
|
||||
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
|
||||
3,3,3,1,1,2,1,1,3,3,1,0,1,1,3,3,2,0,1,1,2,3,1,0,2,2,1,0,0,1,3,2,
|
||||
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
|
||||
3,2,1,3,3,3,3,3,1,2,3,2,3,3,2,1,1,3,2,3,2,1,2,2,0,1,2,1,0,0,1,1,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,
|
||||
3,3,3,3,2,2,2,2,3,1,2,2,1,1,3,3,0,3,2,1,2,3,2,1,3,3,1,1,0,2,1,3,
|
||||
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
|
||||
3,3,3,2,2,2,3,2,3,3,3,2,1,1,3,3,1,1,1,2,2,3,2,3,2,2,2,1,0,2,2,1,
|
||||
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
|
||||
1,0,0,3,3,3,3,3,0,0,3,3,2,3,0,0,0,2,3,3,1,0,1,2,0,0,1,1,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,1,2,3,3,3,3,3,1,2,3,3,2,2,1,1,0,3,3,2,2,1,2,2,1,0,2,2,0,1,1,1,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,3,2,2,1,3,1,2,3,3,2,2,1,1,2,2,1,1,1,1,3,2,1,1,1,1,2,1,0,1,2,1,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,
|
||||
2,3,3,1,1,1,1,1,3,3,3,0,1,1,3,3,1,1,1,1,1,2,2,0,3,1,1,2,0,2,1,1,
|
||||
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
|
||||
3,1,0,1,2,1,2,2,0,1,2,3,1,2,0,0,0,2,1,1,1,1,1,2,0,0,1,1,0,0,0,0,
|
||||
1,2,1,2,2,2,1,2,1,2,0,2,0,2,2,1,1,2,1,1,2,1,1,1,0,1,0,0,0,1,1,0,
|
||||
1,1,1,2,3,2,3,3,0,1,2,2,3,1,0,1,0,2,1,2,2,0,1,1,0,0,1,1,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
1,0,0,3,3,2,2,1,0,0,3,2,3,2,0,0,0,1,1,3,0,0,1,1,0,0,2,1,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,1,1,2,2,3,3,1,0,1,3,2,3,1,1,1,0,1,1,1,1,1,3,1,0,0,2,2,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,1,1,1,2,2,2,1,0,1,2,3,3,2,0,0,0,2,1,1,1,2,1,1,1,0,1,1,1,0,0,0,
|
||||
1,2,2,2,2,2,1,1,1,2,0,2,1,1,1,1,1,2,1,1,1,1,1,1,0,1,1,1,0,0,1,1,
|
||||
3,2,2,1,0,0,1,1,2,2,0,3,0,1,2,1,1,0,0,1,1,1,0,1,1,1,1,0,2,1,1,1,
|
||||
2,2,1,1,1,2,1,2,1,1,1,1,1,1,1,2,1,1,1,2,3,1,1,1,1,1,1,1,1,1,0,1,
|
||||
2,3,3,0,1,0,0,0,3,3,1,0,0,1,2,2,1,0,0,0,0,2,0,0,1,1,1,0,2,1,1,1,
|
||||
2,1,1,1,1,1,1,2,1,1,0,1,1,0,1,1,1,0,1,2,1,1,0,1,1,1,1,1,1,1,0,1,
|
||||
2,3,3,0,1,0,0,0,2,2,0,0,0,0,1,2,2,0,0,0,0,1,0,0,1,1,0,0,2,0,1,0,
|
||||
2,1,1,1,1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,2,0,1,1,1,1,1,0,1,
|
||||
3,2,2,0,1,0,1,0,2,3,2,0,0,1,2,2,1,0,0,1,1,1,0,0,2,1,0,1,2,2,1,1,
|
||||
2,1,1,1,1,1,1,2,1,1,1,1,1,1,0,2,1,0,1,1,0,1,1,1,0,1,1,2,1,1,0,1,
|
||||
2,2,2,0,0,1,0,0,2,2,1,1,0,0,2,1,1,0,0,0,1,2,0,0,2,1,0,0,2,1,1,1,
|
||||
2,1,1,1,1,2,1,2,1,1,1,2,2,1,1,2,1,1,1,2,1,1,1,1,1,1,1,1,1,1,0,1,
|
||||
1,2,3,0,0,0,1,0,3,2,1,0,0,1,2,1,1,0,0,0,0,2,1,0,1,1,0,0,2,1,2,1,
|
||||
1,1,0,0,0,1,0,1,1,1,1,1,2,0,0,1,0,0,0,2,0,0,1,1,1,1,1,1,1,1,0,1,
|
||||
3,0,0,2,1,2,2,1,0,0,2,1,2,2,0,0,0,2,1,1,1,0,1,1,0,0,1,1,2,0,0,0,
|
||||
1,2,1,2,2,1,1,2,1,2,0,1,1,1,1,1,1,1,1,1,2,1,1,0,0,1,1,1,1,0,0,1,
|
||||
1,3,2,0,0,0,1,0,2,2,2,0,0,0,2,2,1,0,0,0,0,3,1,1,1,1,0,0,2,1,1,1,
|
||||
2,1,0,1,1,1,0,1,1,1,1,1,1,1,0,2,1,0,0,1,0,1,1,0,1,1,1,1,1,1,0,1,
|
||||
2,3,2,0,0,0,1,0,2,2,0,0,0,0,2,1,1,0,0,0,0,2,1,0,1,1,0,0,2,1,1,0,
|
||||
2,1,1,1,1,2,1,2,1,2,0,1,1,1,0,2,1,1,1,2,1,1,1,1,0,1,1,1,1,1,0,1,
|
||||
3,1,1,2,2,2,3,2,1,1,2,2,1,1,0,1,0,2,2,1,1,1,1,1,0,0,1,1,0,1,1,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
2,2,2,0,0,0,0,0,2,2,0,0,0,0,2,2,1,0,0,0,1,1,0,0,1,2,0,0,2,1,1,1,
|
||||
2,2,1,1,1,2,1,2,1,1,0,1,1,1,1,2,1,1,1,2,1,1,1,1,0,1,2,1,1,1,0,1,
|
||||
1,0,0,1,2,3,2,1,0,0,2,0,1,1,0,0,0,1,1,1,1,0,1,1,0,0,1,0,0,0,0,0,
|
||||
1,2,1,2,1,2,1,1,1,2,0,2,1,1,1,0,1,2,0,0,1,1,1,0,0,0,0,0,0,0,0,0,
|
||||
2,3,2,0,0,0,0,0,1,1,2,1,0,0,1,1,1,0,0,0,0,2,0,0,1,1,0,0,2,1,1,1,
|
||||
2,1,1,1,1,1,1,2,1,0,1,1,1,1,0,2,1,1,1,1,1,1,0,1,0,1,1,1,1,1,0,1,
|
||||
1,2,2,0,1,1,1,0,2,2,2,0,0,0,3,2,1,0,0,0,1,1,0,0,1,1,0,1,1,1,0,0,
|
||||
1,1,0,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,2,1,1,1,0,0,1,1,1,0,1,0,1,
|
||||
2,1,0,2,1,1,2,2,1,1,2,1,1,1,0,0,0,1,1,0,1,1,1,1,0,0,1,1,1,0,0,0,
|
||||
1,2,2,2,2,2,1,1,1,2,0,2,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,0,0,1,0,
|
||||
1,2,3,0,0,0,1,0,2,2,0,0,0,0,2,2,0,0,0,0,0,1,0,0,1,0,0,0,2,0,1,0,
|
||||
2,1,1,1,1,1,0,2,0,0,0,1,2,1,1,1,1,0,1,2,0,1,0,1,0,1,1,1,0,1,0,1,
|
||||
2,2,2,0,0,0,1,0,2,1,2,0,0,0,1,1,2,0,0,0,0,1,0,0,1,1,0,0,2,1,0,1,
|
||||
2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,0,1,1,1,1,1,0,1,
|
||||
1,2,2,0,0,0,1,0,2,2,2,0,0,0,1,1,0,0,0,0,0,1,1,0,2,0,0,1,1,1,0,1,
|
||||
1,0,1,1,1,1,1,1,0,1,1,1,1,0,0,1,0,0,1,1,0,1,0,1,1,1,1,1,0,0,0,1,
|
||||
1,0,0,1,0,1,2,1,0,0,1,1,1,2,0,0,0,1,1,0,1,0,1,1,0,0,1,0,0,0,0,0,
|
||||
0,2,1,2,1,1,1,1,1,2,0,2,0,1,1,0,1,2,1,0,1,1,1,0,0,0,0,0,0,1,0,0,
|
||||
2,1,1,0,1,2,0,0,1,1,1,0,0,0,1,1,0,0,0,0,0,1,0,0,1,0,0,0,2,1,0,1,
|
||||
2,2,1,1,1,1,1,2,1,1,0,1,1,1,1,2,1,1,1,2,1,1,0,1,0,1,1,1,1,1,0,1,
|
||||
1,2,2,0,0,0,0,0,1,1,0,0,0,0,2,1,0,0,0,0,0,2,0,0,2,2,0,0,2,0,0,1,
|
||||
2,1,1,1,1,1,1,1,0,1,1,0,1,1,0,1,0,0,0,1,1,1,1,0,0,1,1,1,1,0,0,1,
|
||||
1,1,2,0,0,3,1,0,2,1,1,1,0,0,1,1,1,0,0,0,1,1,0,0,0,1,0,0,1,0,1,0,
|
||||
1,2,1,0,1,1,1,2,1,1,0,1,1,1,1,1,0,0,0,1,1,1,1,1,0,1,0,0,0,1,0,0,
|
||||
2,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,2,0,0,0,
|
||||
2,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,2,1,1,0,0,1,1,1,1,1,0,1,
|
||||
2,1,1,1,2,1,1,1,0,1,1,2,1,0,0,0,0,1,1,1,1,0,1,0,0,0,0,1,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
1,1,0,1,1,1,1,1,0,0,1,1,2,1,0,0,0,1,1,0,0,0,1,1,0,0,1,0,1,0,0,0,
|
||||
1,2,1,1,1,1,1,1,1,1,0,1,0,1,1,1,1,1,1,0,1,1,1,0,0,0,0,0,0,1,0,0,
|
||||
2,0,0,0,1,1,1,1,0,0,1,1,0,0,0,0,0,1,1,1,2,0,0,1,0,0,1,0,1,0,0,0,
|
||||
0,1,1,1,1,1,1,1,1,2,0,1,1,1,1,0,1,1,1,0,1,1,1,0,0,0,0,0,0,0,0,0,
|
||||
1,0,0,1,1,1,1,1,0,0,2,1,0,1,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,
|
||||
0,1,1,1,1,1,1,0,1,1,0,1,0,1,1,0,1,1,0,0,1,1,1,0,0,0,0,0,0,0,0,0,
|
||||
1,0,0,1,1,1,0,0,0,0,1,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,
|
||||
0,1,1,1,1,1,0,0,1,1,0,1,0,1,0,0,1,1,1,0,1,1,1,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,1,0,0,0,0,0,0,1,1,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,1,1,1,0,1,0,0,1,1,0,1,0,1,1,0,1,1,1,0,1,1,1,0,0,0,0,0,0,0,0,0,
|
||||
2,1,1,1,1,1,1,1,1,1,1,0,0,1,1,1,0,0,1,0,0,1,0,1,0,1,1,1,0,0,1,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
1,0,0,1,1,1,1,0,0,0,1,1,1,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,1,1,1,1,1,1,0,1,1,0,1,0,1,0,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,
|
||||
};
|
||||
|
||||
// Builds a Hungarian sequence model from a code-page-specific byte-to-order
// map and charset name, combined with the shared HUNGARIAN_LANG_MODEL table.
//
// The constructor is protected because HungarianModel is abstract: it can
// only ever be invoked from a derived class's constructor initializer, so a
// public constructor would advertise an entry point that cannot be used.
//
// charToOrderMap: byte-to-order lookup supplied by the concrete model.
// name:           charset name forwarded unchanged to the base class.
protected HungarianModel(byte[] charToOrderMap, string name)
    : base(
        charToOrderMap,
        HUNGARIAN_LANG_MODEL,
        0.947368f,  // matches the "first 512 sequences: 94.7368%" statistic of the model table
        false,      // NOTE(review): flag semantics are defined by SequenceModel's constructor - confirm
        name)
{
}
|
||||
}
|
||||
|
||||
// Hungarian sequence model bound to the ISO-8859-2 (Latin-2) code page.
public class Latin2HungarianModel : HungarianModel
{
    // Byte-to-order lookup for ISO-8859-2, indexed by byte value
    // (16 entries per row; the trailing comment is the row's first byte).
    private static readonly byte[] LATIN2_CHAR_TO_ORDER_MAP = {
        255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255,  // 00
        255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,  // 10
        253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,  // 20
        252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253,  // 30
        253, 28, 40, 54, 45, 32, 50, 49, 38, 39, 53, 36, 41, 34, 35, 47,  // 40
         46, 71, 43, 33, 37, 57, 48, 64, 68, 55, 52,253,253,253,253,253,  // 50
        253,  2, 18, 26, 17,  1, 27, 12, 20,  9, 22,  7,  6, 13,  4,  8,  // 60
         23, 67, 10,  5,  3, 21, 19, 65, 62, 16, 11,253,253,253,253,253,  // 70
        159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,  // 80
        175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,  // 90
        191,192,193,194,195,196,197, 75,198,199,200,201,202,203,204,205,  // A0
         79,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,  // B0
        221, 51, 81,222, 78,223,224,225,226, 44,227,228,229, 61,230,231,  // C0
        232,233,234, 58,235, 66, 59,236,237,238, 60, 69, 63,239,240,241,  // D0
         82, 14, 74,242, 70, 80,243, 72,244, 15, 83, 77, 84, 30, 76, 85,  // E0
        245,246,247, 25, 73, 42, 24,248,249,250, 31, 56, 29,251,252,253,  // F0
    };

    // Wires the Latin-2 map and charset name into the shared Hungarian model.
    public Latin2HungarianModel()
        : base(LATIN2_CHAR_TO_ORDER_MAP, "ISO-8859-2")
    {
    }
}
|
||||
|
||||
// Hungarian sequence model bound to the windows-1250 code page.
public class Win1250HungarianModel : HungarianModel
{
    // Byte-to-order lookup for windows-1250, indexed by byte value
    // (16 entries per row; the trailing comment is the row's first byte).
    private static readonly byte[] WIN1250_CHAR_TO_ORDER_MAP = {
        255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255,  // 00
        255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,  // 10
        253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,  // 20
        252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253,  // 30
        253, 28, 40, 54, 45, 32, 50, 49, 38, 39, 53, 36, 41, 34, 35, 47,  // 40
         46, 72, 43, 33, 37, 57, 48, 64, 68, 55, 52,253,253,253,253,253,  // 50
        253,  2, 18, 26, 17,  1, 27, 12, 20,  9, 22,  7,  6, 13,  4,  8,  // 60
         23, 67, 10,  5,  3, 21, 19, 65, 62, 16, 11,253,253,253,253,253,  // 70
        161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,  // 80
        177,178,179,180, 78,181, 69,182,183,184,185,186,187,188,189,190,  // 90
        191,192,193,194,195,196,197, 76,198,199,200,201,202,203,204,205,  // A0
         81,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,  // B0
        221, 51, 83,222, 80,223,224,225,226, 44,227,228,229, 61,230,231,  // C0
        232,233,234, 58,235, 66, 59,236,237,238, 60, 70, 63,239,240,241,  // D0
         84, 14, 75,242, 71, 82,243, 73,244, 15, 85, 79, 86, 30, 77, 87,  // E0
        245,246,247, 25, 74, 42, 24,248,249,250, 31, 56, 29,251,252,253,  // F0
    };

    // Wires the windows-1250 map and charset name into the shared Hungarian model.
    public Win1250HungarianModel()
        : base(WIN1250_CHAR_TO_ORDER_MAP, "windows-1250")
    {
    }
}
|
||||
|
||||
}
|
@ -0,0 +1,213 @@
|
||||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Mozilla Public License Version
|
||||
* 1.1 (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/MPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is Mozilla Universal charset detector code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is
|
||||
* Netscape Communications Corporation.
|
||||
* Portions created by the Initial Developer are Copyright (C) 2001
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the MPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
namespace UniversalDetector.Core
|
||||
{
|
||||
public class ThaiModel : SequenceModel
|
||||
{
|
||||
/****************************************************************
|
||||
255: Control characters that usually do not exist in any text
254: Carriage return / line feed
253: Symbols (punctuation) that do not belong to a word
252: Digits 0 - 9
|
||||
*****************************************************************/
|
||||
// The following results for Thai were collected from a limited sample (1M).
|
||||
private readonly static byte[] TIS620_CHAR_TO_ORDER_MAP = {
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
||||
+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20
|
||||
252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30
|
||||
253,182,106,107,100,183,184,185,101, 94,186,187,108,109,110,111, //40
|
||||
188,189,190, 89, 95,112,113,191,192,193,194,253,253,253,253,253, //50
|
||||
253, 64, 72, 73,114, 74,115,116,102, 81,201,117, 90,103, 78, 82, //60
|
||||
96,202, 91, 79, 84,104,105, 97, 98, 92,203,253,253,253,253,253, //70
|
||||
209,210,211,212,213, 88,214,215,216,217,218,219,220,118,221,222,
|
||||
223,224, 99, 85, 83,225,226,227,228,229,230,231,232,233,234,235,
|
||||
236, 5, 30,237, 24,238, 75, 8, 26, 52, 34, 51,119, 47, 58, 57,
|
||||
49, 53, 55, 43, 20, 19, 44, 14, 48, 3, 17, 25, 39, 62, 31, 54,
|
||||
45, 9, 16, 2, 61, 15,239, 12, 42, 46, 18, 21, 76, 4, 66, 63,
|
||||
22, 10, 1, 36, 23, 13, 40, 27, 32, 35, 86,240,241,242,243,244,
|
||||
11, 28, 41, 29, 33,245, 50, 37, 6, 7, 67, 77, 38, 93,246,247,
|
||||
68, 56, 59, 65, 69, 60, 70, 80, 71, 87,248,249,250,251,252,253,
|
||||
};
|
||||
|
||||
//Model Table:
|
||||
//total sequences: 100%
|
||||
//first 512 sequences: 92.6386%
|
||||
//first 1024 sequences:7.3177%
|
||||
//rest sequences: 1.0230%
|
||||
//negative sequences: 0.0436%
|
||||
private readonly static byte[] THAI_LANG_MODEL = {
|
||||
0,1,3,3,3,3,0,0,3,3,0,3,3,0,3,3,3,3,3,3,3,3,0,0,3,3,3,0,3,3,3,3,
|
||||
0,3,3,0,0,0,1,3,0,3,3,2,3,3,0,1,2,3,3,3,3,0,2,0,2,0,0,3,2,1,2,2,
|
||||
3,0,3,3,2,3,0,0,3,3,0,3,3,0,3,3,3,3,3,3,3,3,3,0,3,2,3,0,2,2,2,3,
|
||||
0,2,3,0,0,0,0,1,0,1,2,3,1,1,3,2,2,0,1,1,0,0,1,0,0,0,0,0,0,0,1,1,
|
||||
3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,2,2,3,3,2,3,2,3,3,2,2,2,
|
||||
3,1,2,3,0,3,3,2,2,1,2,3,3,1,2,0,1,3,0,1,0,0,1,0,0,0,0,0,0,0,1,1,
|
||||
3,3,2,2,3,3,3,3,1,2,3,3,3,3,3,2,2,2,2,3,3,2,2,3,3,2,2,3,2,3,2,2,
|
||||
3,3,1,2,3,1,2,2,3,3,1,0,2,1,0,0,3,1,2,1,0,0,1,0,0,0,0,0,0,1,0,1,
|
||||
3,3,3,3,3,3,2,2,3,3,3,3,2,3,2,2,3,3,2,2,3,2,2,2,2,1,1,3,1,2,1,1,
|
||||
3,2,1,0,2,1,0,1,0,1,1,0,1,1,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,2,3,2,3,3,2,2,3,2,3,3,2,3,1,1,2,3,2,2,2,3,2,2,2,2,2,1,2,1,
|
||||
2,2,1,1,3,3,2,1,0,1,2,2,0,1,3,0,0,0,1,1,0,0,0,0,0,2,3,0,0,2,1,1,
|
||||
3,3,2,3,3,2,0,0,3,3,0,3,3,0,2,2,3,1,2,2,1,1,1,0,2,2,2,0,2,2,1,1,
|
||||
0,2,1,0,2,0,0,2,0,1,0,0,1,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,1,0,
|
||||
3,3,2,3,3,2,0,0,3,3,0,2,3,0,2,1,2,2,2,2,1,2,0,0,2,2,2,0,2,2,1,1,
|
||||
0,2,1,0,2,0,0,2,0,1,1,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,
|
||||
3,3,2,3,2,3,2,0,2,2,1,3,2,1,3,2,1,2,3,2,2,3,0,2,3,2,2,1,2,2,2,2,
|
||||
1,2,2,0,0,0,0,2,0,1,2,0,1,1,1,0,1,0,3,1,1,0,0,0,0,0,0,0,0,0,1,0,
|
||||
3,3,2,3,3,2,3,2,2,2,3,2,2,3,2,2,1,2,3,2,2,3,1,3,2,2,2,3,2,2,2,3,
|
||||
3,2,1,3,0,1,1,1,0,2,1,1,1,1,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,2,0,0,
|
||||
1,0,0,3,0,3,3,3,3,3,0,0,3,0,2,2,3,3,3,3,3,0,0,0,1,1,3,0,0,0,0,2,
|
||||
0,0,1,0,0,0,0,0,0,0,2,3,0,0,0,3,0,2,0,0,0,0,0,3,0,0,0,0,0,0,0,0,
|
||||
2,0,3,3,3,3,0,0,2,3,0,0,3,0,3,3,2,3,3,3,3,3,0,0,3,3,3,0,0,0,3,3,
|
||||
0,0,3,0,0,0,0,2,0,0,2,1,1,3,0,0,1,0,0,2,3,0,1,0,0,0,0,0,0,0,1,0,
|
||||
3,3,3,3,2,3,3,3,3,3,3,3,1,2,1,3,3,2,2,1,2,2,2,3,1,1,2,0,2,1,2,1,
|
||||
2,2,1,0,0,0,1,1,0,1,0,1,1,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,
|
||||
3,0,2,1,2,3,3,3,0,2,0,2,2,0,2,1,3,2,2,1,2,1,0,0,2,2,1,0,2,1,2,2,
|
||||
0,1,1,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,2,1,3,3,1,1,3,0,2,3,1,1,3,2,1,1,2,0,2,2,3,2,1,1,1,1,1,2,
|
||||
3,0,0,1,3,1,2,1,2,0,3,0,0,0,1,0,3,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,
|
||||
3,3,1,1,3,2,3,3,3,1,3,2,1,3,2,1,3,2,2,2,2,1,3,3,1,2,1,3,1,2,3,0,
|
||||
2,1,1,3,2,2,2,1,2,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,
|
||||
3,3,2,3,2,3,3,2,3,2,3,2,3,3,2,1,0,3,2,2,2,1,2,2,2,1,2,2,1,2,1,1,
|
||||
2,2,2,3,0,1,3,1,1,1,1,0,1,1,0,2,1,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,2,3,2,2,1,1,3,2,3,2,3,2,0,3,2,2,1,2,0,2,2,2,1,2,2,2,2,1,
|
||||
3,2,1,2,2,1,0,2,0,1,0,0,1,1,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,1,
|
||||
3,3,3,3,3,2,3,1,2,3,3,2,2,3,0,1,1,2,0,3,3,2,2,3,0,1,1,3,0,0,0,0,
|
||||
3,1,0,3,3,0,2,0,2,1,0,0,3,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,2,3,2,3,3,0,1,3,1,1,2,1,2,1,1,3,1,1,0,2,3,1,1,1,1,1,1,1,1,
|
||||
3,1,1,2,2,2,2,1,1,1,0,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
|
||||
3,2,2,1,1,2,1,3,3,2,3,2,2,3,2,2,3,1,2,2,1,2,0,3,2,1,2,2,2,2,2,1,
|
||||
3,2,1,2,2,2,1,1,1,1,0,0,1,1,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,3,3,3,1,3,3,0,2,1,0,3,2,0,0,3,1,0,1,1,0,1,0,0,0,0,0,1,
|
||||
1,0,0,1,0,3,2,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,0,2,2,2,3,0,0,1,3,0,3,2,0,3,2,2,3,3,3,3,3,1,0,2,2,2,0,2,2,1,2,
|
||||
0,2,3,0,0,0,0,1,0,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
|
||||
3,0,2,3,1,3,3,2,3,3,0,3,3,0,3,2,2,3,2,3,3,3,0,0,2,2,3,0,1,1,1,3,
|
||||
0,0,3,0,0,0,2,2,0,1,3,0,1,2,2,2,3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,
|
||||
3,2,3,3,2,0,3,3,2,2,3,1,3,2,1,3,2,0,1,2,2,0,2,3,2,1,0,3,0,0,0,0,
|
||||
3,0,0,2,3,1,3,0,0,3,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,1,3,2,2,2,1,2,0,1,3,1,1,3,1,3,0,0,2,1,1,1,1,2,1,1,1,0,2,1,0,1,
|
||||
1,2,0,0,0,3,1,1,0,0,0,0,1,0,1,0,0,1,0,1,0,0,0,0,0,3,1,0,0,0,1,0,
|
||||
3,3,3,3,2,2,2,2,2,1,3,1,1,1,2,0,1,1,2,1,2,1,3,2,0,0,3,1,1,1,1,1,
|
||||
3,1,0,2,3,0,0,0,3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,2,3,0,3,3,0,2,0,0,0,0,0,0,0,3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,2,3,1,3,0,0,1,2,0,0,2,0,3,3,2,3,3,3,2,3,0,0,2,2,2,0,0,0,2,2,
|
||||
0,0,1,0,0,0,0,3,0,0,0,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,3,0,2,0,0,0,0,0,0,0,0,0,0,1,2,3,1,3,3,0,0,1,0,3,0,0,0,0,0,
|
||||
0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,3,1,2,3,1,2,3,1,0,3,0,2,2,1,0,2,1,1,2,0,1,0,0,1,1,1,1,0,1,0,0,
|
||||
1,0,0,0,0,1,1,0,3,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,2,1,0,1,1,1,3,1,2,2,2,2,2,2,1,1,1,1,0,3,1,0,1,3,1,1,1,1,
|
||||
1,1,0,2,0,1,3,1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,2,0,1,
|
||||
3,0,2,2,1,3,3,2,3,3,0,1,1,0,2,2,1,2,1,3,3,1,0,0,3,2,0,0,0,0,2,1,
|
||||
0,1,0,0,0,0,1,2,0,1,1,3,1,1,2,2,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
|
||||
0,0,3,0,0,1,0,0,0,3,0,0,3,0,3,1,0,1,1,1,3,2,0,0,0,3,0,0,0,0,2,0,
|
||||
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,2,0,0,0,0,0,0,0,0,0,
|
||||
3,3,1,3,2,1,3,3,1,2,2,0,1,2,1,0,1,2,0,0,0,0,0,3,0,0,0,3,0,0,0,0,
|
||||
3,0,0,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,0,1,2,0,3,3,3,2,2,0,1,1,0,1,3,0,0,0,2,2,0,0,0,0,3,1,0,1,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,0,2,3,1,2,0,0,2,1,0,3,1,0,1,2,0,1,1,1,1,3,0,0,3,1,1,0,2,2,1,1,
|
||||
0,2,0,0,0,0,0,1,0,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,0,0,3,1,2,0,0,2,2,0,1,2,0,1,0,1,3,1,2,1,0,0,0,2,0,3,0,0,0,1,0,
|
||||
0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,0,1,1,2,2,0,0,0,2,0,2,1,0,1,1,0,1,1,1,2,1,0,0,1,1,1,0,2,1,1,1,
|
||||
0,1,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,
|
||||
0,0,0,2,0,1,3,1,1,1,1,0,0,0,0,3,2,0,1,0,0,0,1,2,0,0,0,1,0,0,0,0,
|
||||
0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,3,3,3,3,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
1,0,2,3,2,2,0,0,0,1,0,0,0,0,2,3,2,1,2,2,3,0,0,0,2,3,1,0,0,0,1,1,
|
||||
0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,
|
||||
3,3,2,2,0,1,0,0,0,0,2,0,2,0,1,0,0,0,1,1,0,0,0,2,1,0,1,0,1,1,0,0,
|
||||
0,1,0,2,0,0,1,0,3,0,1,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,3,1,0,0,1,0,0,0,0,0,1,1,2,0,0,0,0,1,0,0,1,3,1,0,0,0,0,1,1,0,0,
|
||||
0,1,0,0,0,0,3,0,0,0,0,0,0,3,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,
|
||||
3,3,1,1,1,1,2,3,0,0,2,1,1,1,1,1,0,2,1,1,0,0,0,2,1,0,1,2,1,1,0,1,
|
||||
2,1,0,3,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
1,3,1,0,0,0,0,0,0,0,3,0,0,0,3,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,
|
||||
0,0,0,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,3,2,0,0,0,0,0,0,1,2,1,0,1,1,0,2,0,0,1,0,0,2,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,2,0,0,0,1,3,0,1,0,0,0,2,0,0,0,0,0,0,0,1,2,0,0,0,0,0,
|
||||
3,3,0,0,1,1,2,0,0,1,2,1,0,1,1,1,0,1,1,0,0,2,1,1,0,1,0,0,1,1,1,0,
|
||||
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
2,2,2,1,0,0,0,0,1,0,0,0,0,3,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,
|
||||
2,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
2,3,0,0,1,1,0,0,0,2,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
1,1,0,1,2,0,1,2,0,0,1,1,0,2,0,1,0,0,1,0,0,0,0,1,0,0,0,2,0,0,0,0,
|
||||
1,0,0,1,0,1,1,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,1,0,0,0,0,0,0,0,1,1,0,1,1,0,2,1,3,0,0,0,0,1,1,0,0,0,0,0,0,0,3,
|
||||
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
2,0,1,0,1,0,0,2,0,0,2,0,0,1,1,2,0,0,1,1,0,0,0,1,0,0,0,1,1,0,0,0,
|
||||
1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,
|
||||
1,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,1,1,0,0,0,
|
||||
2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
2,0,0,0,0,2,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,3,0,0,0,
|
||||
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,1,0,0,0,0,
|
||||
1,0,0,0,0,0,0,0,0,1,0,0,0,0,2,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,1,1,0,0,2,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
};
|
||||
|
||||
// Creates the Thai model with the built-in TIS-620 byte-to-order map and the
// "TIS-620" charset name, mirroring the parameterless constructors of the
// other concrete models.
public ThaiModel()
    : this(null, null)
{
}

// Creates the Thai model.
//
// Previously both arguments were silently discarded and the built-in TIS-620
// values were always used. The arguments are now honoured; passing null for
// either one keeps the former behaviour for that value, so callers that
// relied on the defaults by passing null are unaffected.
//
// charToOrderMap: byte-to-order lookup to use, or null for
//                 TIS620_CHAR_TO_ORDER_MAP.
// name:           charset name to report, or null for "TIS-620".
public ThaiModel(byte[] charToOrderMap, string name)
    : base(
        charToOrderMap ?? TIS620_CHAR_TO_ORDER_MAP,
        THAI_LANG_MODEL,
        0.926386f,  // matches the "first 512 sequences: 92.6386%" statistic of the model table
        false,      // NOTE(review): flag semantics are defined by SequenceModel's constructor - confirm
        name ?? "TIS-620")
{
}
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,180 @@
|
||||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Mozilla Public License Version
|
||||
* 1.1 (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/MPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is Mozilla Universal charset detector code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is
|
||||
* Netscape Communications Corporation.
|
||||
* Portions created by the Initial Developer are Copyright (C) 2001
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the MPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
using System;
|
||||
|
||||
namespace UniversalDetector.Core
|
||||
{
|
||||
// TODO: Using trigrams the detector should be able to discriminate between
|
||||
// latin-1 and iso8859-2
|
||||
/// <summary>
/// Single-byte prober that reports "windows-1252". Every byte is mapped to
/// one of eight character classes; each pair of consecutive classes is scored
/// through <c>Latin1ClassModel</c> and the scores are tallied per category.
/// </summary>
public class Latin1Prober : CharsetProber
{
    // Number of score categories stored in Latin1ClassModel (values 0..3).
    private const int FREQ_CAT_NUM = 4;

    private const int UDF = 0; // undefined
    private const int OTH = 1; // other
    private const int ASC = 2; // ascii capital letter
    private const int ASS = 3; // ascii small letter
    private const int ACV = 4; // accent capital vowel
    private const int ACO = 5; // accent capital other
    private const int ASV = 6; // accent small vowel
    private const int ASO = 7; // accent small other

    private const int CLASS_NUM = 8; // total classes

    // Byte value -> character class, eight byte values per row.
    private readonly static byte[] Latin1_CharToClass = {
        OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 00 - 07
        OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 08 - 0F
        OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 10 - 17
        OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 18 - 1F
        OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 20 - 27
        OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 28 - 2F
        OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 30 - 37
        OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 38 - 3F
        OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 40 - 47
        ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 48 - 4F
        ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 50 - 57
        ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH, // 58 - 5F
        OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 60 - 67
        ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 68 - 6F
        ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 70 - 77
        ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH, // 78 - 7F
        OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH, // 80 - 87
        OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF, // 88 - 8F
        UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 90 - 97
        OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO, // 98 - 9F
        OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // A0 - A7
        OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // A8 - AF
        OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // B0 - B7
        OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // B8 - BF
        ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO, // C0 - C7
        ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV, // C8 - CF
        ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH, // D0 - D7
        ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO, // D8 - DF
        ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO, // E0 - E7
        ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV, // E8 - EF
        ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH, // F0 - F7
        ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO, // F8 - FF
    };

    /* Score for a (previous class, current class) pair; the row is the
       previous class and the column is the current class:
       0 : illegal
       1 : very unlikely
       2 : normal
       3 : very likely
    */
    private readonly static byte[] Latin1ClassModel = {
        /*      UDF OTH ASC ASS ACV ACO ASV ASO  */
        /*UDF*/  0,  0,  0,  0,  0,  0,  0,  0,
        /*OTH*/  0,  3,  3,  3,  3,  3,  3,  3,
        /*ASC*/  0,  3,  3,  3,  3,  3,  3,  3,
        /*ASS*/  0,  3,  3,  3,  1,  1,  3,  3,
        /*ACV*/  0,  3,  3,  3,  1,  2,  1,  2,
        /*ACO*/  0,  3,  3,  3,  3,  3,  3,  3,
        /*ASV*/  0,  3,  1,  3,  1,  1,  1,  3,
        /*ASO*/  0,  3,  1,  3,  1,  1,  3,  3,
    };

    // Class of the most recently scored byte; carried across HandleData calls.
    private byte lastCharClass;

    // Occurrence count for each score category, indexed by score value.
    private int[] freqCounter = new int[FREQ_CAT_NUM];

    /// <summary>
    /// Creates the prober and puts it into its initial state via Reset().
    /// </summary>
    public Latin1Prober()
    {
        Reset();
    }

    /// <summary>
    /// Returns the charset name this prober stands for.
    /// </summary>
    /// <returns>Always "windows-1252".</returns>
    public override string GetCharsetName()
    {
        return "windows-1252";
    }

    /// <summary>
    /// Returns to the Detecting state, sets the previous class to OTH and
    /// zeroes every category counter.
    /// </summary>
    public override void Reset()
    {
        state = ProbingState.Detecting;
        lastCharClass = OTH;
        Array.Clear(freqCounter, 0, FREQ_CAT_NUM);
    }

    /// <summary>
    /// Scores a chunk of input. The bytes are first passed through
    /// FilterWithEnglishLetters; each remaining byte is classified and the
    /// (previous, current) class pair is looked up in Latin1ClassModel.
    /// </summary>
    /// <param name="buf">Buffer holding the input bytes.</param>
    /// <param name="offset">Index of the first byte to examine.</param>
    /// <param name="len">Number of bytes to examine.</param>
    /// <returns>
    /// The prober state after processing; NotMe as soon as a pair with
    /// score 0 (illegal) is encountered.
    /// </returns>
    public override ProbingState HandleData(byte[] buf, int offset, int len)
    {
        byte[] filtered = FilterWithEnglishLetters(buf, offset, len);

        foreach (byte current in filtered) {
            byte currentClass = Latin1_CharToClass[current];
            byte score = Latin1ClassModel[lastCharClass * CLASS_NUM + currentClass];

            if (score == 0) {
                // An illegal class pair rules this charset out; stop scanning.
                state = ProbingState.NotMe;
                break;
            }

            freqCounter[score]++;
            lastCharClass = currentClass;
        }

        return state;
    }

    /// <summary>
    /// Computes the confidence from the category counters.
    /// </summary>
    /// <returns>
    /// 0.01 when the state is NotMe; 0 when nothing has been counted;
    /// otherwise the share of "very likely" pairs minus twenty times the
    /// share of "very unlikely" pairs, floored at 0 and then halved.
    /// </returns>
    public override float GetConfidence()
    {
        if (state == ProbingState.NotMe)
            return 0.01f;

        int total = 0;
        foreach (int count in freqCounter)
            total += count;

        if (total <= 0)
            return 0.0f;

        float confidence = freqCounter[3] * 1.0f / total;
        confidence -= freqCounter[1] * 20.0f / total;

        if (confidence < 0.0f)
            return 0.0f;

        // lower the confidence of latin1 so that other more accurate detector
        // can take priority.
        return confidence * 0.5f;
    }

    /// <summary>
    /// Debug hook; currently a no-op because the output line is disabled.
    /// </summary>
    public override void DumpStatus()
    {
        //Console.WriteLine(" Latin1Prober: {0} [{1}]", GetConfidence(), GetCharsetName());
    }
}
|
||||
}
|
||||
|
@ -0,0 +1,175 @@
|
||||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Mozilla Public License Version
|
||||
* 1.1 (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/MPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is Mozilla Universal charset detector code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is
|
||||
* Netscape Communications Corporation.
|
||||
* Portions created by the Initial Developer are Copyright (C) 2001
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the MPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
using System;
|
||||
|
||||
namespace UniversalDetector.Core
|
||||
{
|
||||
/// <summary>
/// Group prober for multi-byte charsets. It owns one prober per candidate
/// charset, feeds each active one the same filtered input and tracks which
/// of them is currently the best guess.
/// </summary>
public class MBCSGroupProber : CharsetProber
{
    private const int PROBERS_NUM = 7;

    // Display names, in the same order as the probers array.
    private readonly static string[] ProberName =
        { "UTF8", "SJIS", "EUCJP", "GB18030", "EUCKR", "Big5", "EUCTW" };

    private CharsetProber[] probers = new CharsetProber[PROBERS_NUM];

    // isActive[i] is false once probers[i] has answered NotMe.
    private bool[] isActive = new bool[PROBERS_NUM];

    // Index of the preferred prober, or -1 when none has been chosen yet.
    private int bestGuess;

    // Number of entries in isActive that are still true.
    private int activeNum;

    /// <summary>
    /// Instantiates the seven member probers and resets the group.
    /// </summary>
    public MBCSGroupProber()
    {
        probers[0] = new UTF8Prober();
        probers[1] = new SJISProber();
        probers[2] = new EUCJPProber();
        probers[3] = new GB18030Prober();
        probers[4] = new EUCKRProber();
        probers[5] = new Big5Prober();
        probers[6] = new EUCTWProber();
        Reset();
    }

    /// <summary>
    /// Returns the charset name of the best-guess prober. When no guess
    /// exists yet, GetConfidence() is invoked to pick one; if that still
    /// yields none, the first prober is used.
    /// </summary>
    public override string GetCharsetName()
    {
        if (bestGuess == -1) {
            // Called for its side effect of updating bestGuess.
            GetConfidence();
        }
        if (bestGuess == -1) {
            bestGuess = 0;
        }
        return probers[bestGuess].GetCharsetName();
    }

    /// <summary>
    /// Resets every non-null member prober and marks it active, clears the
    /// best guess and returns the group to the Detecting state.
    /// </summary>
    public override void Reset()
    {
        activeNum = 0;
        for (int i = 0; i < probers.Length; i++) {
            bool present = probers[i] != null;
            if (present) {
                probers[i].Reset();
                ++activeNum;
            }
            isActive[i] = present;
        }
        bestGuess = -1;
        state = ProbingState.Detecting;
    }

    /// <summary>
    /// Filters the input and hands it to each active member prober.
    /// </summary>
    /// <param name="buf">Buffer holding the input bytes.</param>
    /// <param name="offset">Index of the first byte to examine.</param>
    /// <param name="len">Number of bytes to examine.</param>
    /// <returns>
    /// FoundIt as soon as one member reports FoundIt, NotMe once every
    /// member has been deactivated, otherwise the unchanged group state.
    /// </returns>
    public override ProbingState HandleData(byte[] buf, int offset, int len)
    {
        // Reduce the load on the probers: keep every byte with the high bit
        // set plus the single byte that follows a kept high byte.
        byte[] filtered = new byte[len];
        int filteredLen = 0;
        // Assume the previous byte was not ASCII; at worst this adds a
        // little noise.
        bool keepNext = true;
        int end = offset + len;

        for (int pos = offset; pos < end; pos++) {
            byte current = buf[pos];
            bool highBit = (current & 0x80) != 0;
            if (highBit || keepNext) {
                filtered[filteredLen++] = current;
                // After a high byte the next byte is kept even if it is
                // ASCII; after a kept ASCII byte the following ASCII bytes
                // are dropped.
                keepNext = highBit;
            }
        }

        for (int i = 0; i < probers.Length; i++) {
            if (!isActive[i])
                continue;

            ProbingState result = probers[i].HandleData(filtered, 0, filteredLen);

            if (result == ProbingState.FoundIt) {
                bestGuess = i;
                state = ProbingState.FoundIt;
                break;
            }

            if (result != ProbingState.NotMe)
                continue;

            isActive[i] = false;
            activeNum--;
            if (activeNum <= 0) {
                state = ProbingState.NotMe;
                break;
            }
        }

        return state;
    }

    /// <summary>
    /// Reports the group's confidence and, while still detecting, records
    /// the index of the most confident active prober in bestGuess.
    /// </summary>
    /// <returns>
    /// 0.99 in state FoundIt, 0.01 in state NotMe, otherwise the highest
    /// confidence among the active probers (0 if none exceeds 0).
    /// </returns>
    public override float GetConfidence()
    {
        if (state == ProbingState.FoundIt)
            return 0.99f;

        if (state == ProbingState.NotMe)
            return 0.01f;

        float highest = 0.0f;
        for (int i = 0; i < PROBERS_NUM; i++) {
            if (!isActive[i])
                continue;

            float candidate = probers[i].GetConfidence();
            if (highest < candidate) {
                highest = candidate;
                bestGuess = i;
            }
        }
        return highest;
    }

    /// <summary>
    /// Debug hook. The output lines are disabled, but the confidence of the
    /// group and of each active prober is still queried.
    /// </summary>
    public override void DumpStatus()
    {
        GetConfidence();
        for (int i = 0; i < PROBERS_NUM; i++) {
            if (isActive[i]) {
                float memberConfidence = probers[i].GetConfidence();
                //Console.WriteLine(" MBCS {0}: [{1}]", cf, ProberName[i]);
            } else {
                //Console.WriteLine(" MBCS inactive: {0} (confidence is too low).", ProberName[i]);
            }
        }
    }
}
|
||||
}
|
@ -0,0 +1,640 @@
|
||||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Mozilla Public License Version
|
||||
* 1.1 (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/MPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is Mozilla Universal charset detector code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is
|
||||
* Netscape Communications Corporation.
|
||||
* Portions created by the Initial Developer are Copyright (C) 2001
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the MPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
namespace UniversalDetector.Core
|
||||
{
|
||||
/// <summary>
/// State-machine model data for "UTF-8". The constructor hands the SMModel
/// base class a packed byte-class table (UTF8_cls, eight byte values per
/// Pack4bits call), the value 16 (presumably the number of classes; it
/// matches the length of UTF8CharLenTable - confirm against SMModel), a
/// packed state table (UTF8_st) and the per-class character lengths.
/// </summary>
public class UTF8SMModel : SMModel
|
||||
{
|
||||
private readonly static int[] UTF8_cls = {
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 00 - 07
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,0,0), // 08 - 0f
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 10 - 17
|
||||
BitPackage.Pack4bits(1,1,1,0,1,1,1,1), // 18 - 1f
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 20 - 27
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 28 - 2f
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 30 - 37
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 38 - 3f
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 40 - 47
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 48 - 4f
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 50 - 57
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 58 - 5f
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 60 - 67
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 68 - 6f
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 70 - 77
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 78 - 7f
|
||||
BitPackage.Pack4bits(2,2,2,2,3,3,3,3), // 80 - 87
|
||||
BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 88 - 8f
|
||||
BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 90 - 97
|
||||
BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 98 - 9f
|
||||
BitPackage.Pack4bits(5,5,5,5,5,5,5,5), // a0 - a7
|
||||
BitPackage.Pack4bits(5,5,5,5,5,5,5,5), // a8 - af
|
||||
BitPackage.Pack4bits(5,5,5,5,5,5,5,5), // b0 - b7
|
||||
BitPackage.Pack4bits(5,5,5,5,5,5,5,5), // b8 - bf
|
||||
BitPackage.Pack4bits(0,0,6,6,6,6,6,6), // c0 - c7
|
||||
BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // c8 - cf
|
||||
BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // d0 - d7
|
||||
BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // d8 - df
|
||||
BitPackage.Pack4bits(7,8,8,8,8,8,8,8), // e0 - e7
|
||||
BitPackage.Pack4bits(8,8,8,8,8,9,8,8), // e8 - ef
|
||||
BitPackage.Pack4bits(10,11,11,11,11,11,11,11), // f0 - f7
|
||||
BitPackage.Pack4bits(12,13,13,13,14,15,0,0) // f8 - ff
|
||||
};
|
||||
|
||||
private readonly static int[] UTF8_st = {
|
||||
BitPackage.Pack4bits(ERROR,START,ERROR,ERROR,ERROR,ERROR, 12, 10),//00-07
|
||||
BitPackage.Pack4bits( 9, 11, 8, 7, 6, 5, 4, 3),//08-0f
|
||||
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR),//10-17
|
||||
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR),//18-1f
|
||||
BitPackage.Pack4bits(ITSME,ITSME,ITSME,ITSME,ITSME,ITSME,ITSME,ITSME),//20-27
|
||||
BitPackage.Pack4bits(ITSME,ITSME,ITSME,ITSME,ITSME,ITSME,ITSME,ITSME),//28-2f
|
||||
BitPackage.Pack4bits(ERROR,ERROR, 5, 5, 5, 5,ERROR,ERROR),//30-37
|
||||
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR),//38-3f
|
||||
BitPackage.Pack4bits(ERROR,ERROR,ERROR, 5, 5, 5,ERROR,ERROR),//40-47
|
||||
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR),//48-4f
|
||||
BitPackage.Pack4bits(ERROR,ERROR, 7, 7, 7, 7,ERROR,ERROR),//50-57
|
||||
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR),//58-5f
|
||||
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR, 7, 7,ERROR,ERROR),//60-67
|
||||
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR),//68-6f
|
||||
BitPackage.Pack4bits(ERROR,ERROR, 9, 9, 9, 9,ERROR,ERROR),//70-77
|
||||
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR),//78-7f
|
||||
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ERROR, 9,ERROR,ERROR),//80-87
|
||||
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR),//88-8f
|
||||
BitPackage.Pack4bits(ERROR,ERROR, 12, 12, 12, 12,ERROR,ERROR),//90-97
|
||||
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR),//98-9f
|
||||
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ERROR, 12,ERROR,ERROR),//a0-a7
|
||||
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR),//a8-af
|
||||
BitPackage.Pack4bits(ERROR,ERROR, 12, 12, 12,ERROR,ERROR,ERROR),//b0-b7
|
||||
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR),//b8-bf
|
||||
BitPackage.Pack4bits(ERROR,ERROR,START,START,START,START,ERROR,ERROR),//c0-c7
|
||||
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR) //c8-cf
|
||||
};
|
||||
|
||||
private readonly static int[] UTF8CharLenTable =
|
||||
{0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6 };
|
||||
|
||||
public UTF8SMModel() : base(
|
||||
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
|
||||
BitPackage.SHIFT_MASK_4BITS,
|
||||
BitPackage.BIT_SHIFT_4BITS,
|
||||
BitPackage.UNIT_MASK_4BITS, UTF8_cls),
|
||||
16,
|
||||
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
|
||||
BitPackage.SHIFT_MASK_4BITS,
|
||||
BitPackage.BIT_SHIFT_4BITS,
|
||||
BitPackage.UNIT_MASK_4BITS, UTF8_st),
|
||||
UTF8CharLenTable, "UTF-8")
|
||||
{
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// State-machine model data for "GB18030". The constructor hands the SMModel
/// base class a packed byte-class table (GB18030_cls, eight byte values per
/// Pack4bits call), the value 7 (presumably the number of classes; it
/// matches the length of GB18030CharLenTable - confirm against SMModel), a
/// packed state table (GB18030_st) and the per-class character lengths.
/// </summary>
public class GB18030SMModel : SMModel
|
||||
{
|
||||
private readonly static int[] GB18030_cls = {
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 00 - 07
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,0,0), // 08 - 0f
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 10 - 17
|
||||
BitPackage.Pack4bits(1,1,1,0,1,1,1,1), // 18 - 1f
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 20 - 27
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 28 - 2f
|
||||
BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // 30 - 37
|
||||
BitPackage.Pack4bits(3,3,1,1,1,1,1,1), // 38 - 3f
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 40 - 47
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 48 - 4f
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 50 - 57
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 58 - 5f
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 60 - 67
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 68 - 6f
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 70 - 77
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,4), // 78 - 7f
|
||||
BitPackage.Pack4bits(5,6,6,6,6,6,6,6), // 80 - 87
|
||||
BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // 88 - 8f
|
||||
BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // 90 - 97
|
||||
BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // 98 - 9f
|
||||
BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // a0 - a7
|
||||
BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // a8 - af
|
||||
BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // b0 - b7
|
||||
BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // b8 - bf
|
||||
BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // c0 - c7
|
||||
BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // c8 - cf
|
||||
BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // d0 - d7
|
||||
BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // d8 - df
|
||||
BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // e0 - e7
|
||||
BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // e8 - ef
|
||||
BitPackage.Pack4bits(6,6,6,6,6,6,6,6), // f0 - f7
|
||||
BitPackage.Pack4bits(6,6,6,6,6,6,6,0) // f8 - ff
|
||||
};
|
||||
|
||||
private readonly static int[] GB18030_st = {
|
||||
BitPackage.Pack4bits(ERROR,START,START,START,START,START, 3,ERROR),//00-07
|
||||
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ITSME,ITSME),//08-0f
|
||||
BitPackage.Pack4bits(ITSME,ITSME,ITSME,ITSME,ITSME,ERROR,ERROR,START),//10-17
|
||||
BitPackage.Pack4bits( 4,ERROR,START,START,ERROR,ERROR,ERROR,ERROR),//18-1f
|
||||
BitPackage.Pack4bits(ERROR,ERROR, 5,ERROR,ERROR,ERROR,ITSME,ERROR),//20-27
|
||||
BitPackage.Pack4bits(ERROR,ERROR,START,START,START,START,START,START) //28-2f
|
||||
};
|
||||
|
||||
// To be accurate, the length of class 6 can be either 2 or 4.
|
||||
// But it is not necessary to discriminate between the two since
|
||||
// it is used for frequency analysis only, and we are validating
|
||||
// each code range there as well. So it is safe to set it to be
|
||||
// 2 here.
|
||||
private readonly static int[] GB18030CharLenTable = {0, 1, 1, 1, 1, 1, 2};
|
||||
|
||||
public GB18030SMModel() : base(
|
||||
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
|
||||
BitPackage.SHIFT_MASK_4BITS,
|
||||
BitPackage.BIT_SHIFT_4BITS,
|
||||
BitPackage.UNIT_MASK_4BITS, GB18030_cls),
|
||||
7,
|
||||
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
|
||||
BitPackage.SHIFT_MASK_4BITS,
|
||||
BitPackage.BIT_SHIFT_4BITS,
|
||||
BitPackage.UNIT_MASK_4BITS, GB18030_st),
|
||||
GB18030CharLenTable, "GB18030")
|
||||
{
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// State-machine model data for "Big5". The constructor hands the SMModel
/// base class a packed byte-class table (BIG5_cls, eight byte values per
/// Pack4bits call), the value 5 (presumably the number of classes; it
/// matches the length of BIG5CharLenTable - confirm against SMModel), a
/// packed state table (BIG5_st) and the per-class character lengths.
/// </summary>
public class BIG5SMModel : SMModel
|
||||
{
|
||||
private readonly static int[] BIG5_cls = {
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 00 - 07
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,0,0), // 08 - 0f
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 10 - 17
|
||||
BitPackage.Pack4bits(1,1,1,0,1,1,1,1), // 18 - 1f
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 20 - 27
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 28 - 2f
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 30 - 37
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 38 - 3f
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 40 - 47
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 48 - 4f
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 50 - 57
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 58 - 5f
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 60 - 67
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 68 - 6f
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 70 - 77
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,1), // 78 - 7f
|
||||
BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 80 - 87
|
||||
BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 88 - 8f
|
||||
BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 90 - 97
|
||||
BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 98 - 9f
|
||||
BitPackage.Pack4bits(4,3,3,3,3,3,3,3), // a0 - a7
|
||||
BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // a8 - af
|
||||
BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // b0 - b7
|
||||
BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // b8 - bf
|
||||
BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // c0 - c7
|
||||
BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // c8 - cf
|
||||
BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // d0 - d7
|
||||
BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // d8 - df
|
||||
BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // e0 - e7
|
||||
BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // e8 - ef
|
||||
BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // f0 - f7
|
||||
BitPackage.Pack4bits(3,3,3,3,3,3,3,0) // f8 - ff
|
||||
};
|
||||
|
||||
private readonly static int[] BIG5_st = {
|
||||
BitPackage.Pack4bits(ERROR,START,START, 3,ERROR,ERROR,ERROR,ERROR),//00-07
|
||||
BitPackage.Pack4bits(ERROR,ERROR,ITSME,ITSME,ITSME,ITSME,ITSME,ERROR),//08-0f
|
||||
BitPackage.Pack4bits(ERROR,START,START,START,START,START,START,START) //10-17
|
||||
};
|
||||
|
||||
private readonly static int[] BIG5CharLenTable = {0, 1, 1, 2, 0};
|
||||
|
||||
public BIG5SMModel() : base(
|
||||
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
|
||||
BitPackage.SHIFT_MASK_4BITS,
|
||||
BitPackage.BIT_SHIFT_4BITS,
|
||||
BitPackage.UNIT_MASK_4BITS, BIG5_cls),
|
||||
5,
|
||||
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
|
||||
BitPackage.SHIFT_MASK_4BITS,
|
||||
BitPackage.BIT_SHIFT_4BITS,
|
||||
BitPackage.UNIT_MASK_4BITS, BIG5_st),
|
||||
BIG5CharLenTable, "Big5")
|
||||
{
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// State-machine model data for "EUC-JP". The constructor hands the SMModel
/// base class a packed byte-class table (EUCJP_cls, eight byte values per
/// Pack4bits call), the value 6 (presumably the number of classes; it
/// matches the length of EUCJPCharLenTable - confirm against SMModel), a
/// packed state table (EUCJP_st) and the per-class character lengths.
/// </summary>
public class EUCJPSMModel : SMModel
|
||||
{
|
||||
private readonly static int[] EUCJP_cls = {
|
||||
//BitPacket.Pack4bits(5,4,4,4,4,4,4,4), // 00 - 07
|
||||
BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 00 - 07
|
||||
BitPackage.Pack4bits(4,4,4,4,4,4,5,5), // 08 - 0f
|
||||
BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 10 - 17
|
||||
BitPackage.Pack4bits(4,4,4,5,4,4,4,4), // 18 - 1f
|
||||
BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 20 - 27
|
||||
BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 28 - 2f
|
||||
BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 30 - 37
|
||||
BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 38 - 3f
|
||||
BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 40 - 47
|
||||
BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 48 - 4f
|
||||
BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 50 - 57
|
||||
BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 58 - 5f
|
||||
BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 60 - 67
|
||||
BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 68 - 6f
|
||||
BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 70 - 77
|
||||
BitPackage.Pack4bits(4,4,4,4,4,4,4,4), // 78 - 7f
|
||||
BitPackage.Pack4bits(5,5,5,5,5,5,5,5), // 80 - 87
|
||||
BitPackage.Pack4bits(5,5,5,5,5,5,1,3), // 88 - 8f
|
||||
BitPackage.Pack4bits(5,5,5,5,5,5,5,5), // 90 - 97
|
||||
BitPackage.Pack4bits(5,5,5,5,5,5,5,5), // 98 - 9f
|
||||
BitPackage.Pack4bits(5,2,2,2,2,2,2,2), // a0 - a7
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // a8 - af
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b0 - b7
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b8 - bf
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c0 - c7
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c8 - cf
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d0 - d7
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d8 - df
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // e0 - e7
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // e8 - ef
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // f0 - f7
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,5) // f8 - ff
|
||||
};
|
||||
|
||||
private readonly static int[] EUCJP_st = {
|
||||
BitPackage.Pack4bits( 3, 4, 3, 5,START,ERROR,ERROR,ERROR),//00-07
|
||||
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ITSME,ITSME,ITSME,ITSME),//08-0f
|
||||
BitPackage.Pack4bits(ITSME,ITSME,START,ERROR,START,ERROR,ERROR,ERROR),//10-17
|
||||
BitPackage.Pack4bits(ERROR,ERROR,START,ERROR,ERROR,ERROR, 3,ERROR),//18-1f
|
||||
BitPackage.Pack4bits( 3,ERROR,ERROR,ERROR,START,START,START,START) //20-27
|
||||
};
|
||||
|
||||
private readonly static int[] EUCJPCharLenTable = { 2, 2, 2, 3, 1, 0 };
|
||||
|
||||
public EUCJPSMModel() : base(
|
||||
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
|
||||
BitPackage.SHIFT_MASK_4BITS,
|
||||
BitPackage.BIT_SHIFT_4BITS,
|
||||
BitPackage.UNIT_MASK_4BITS, EUCJP_cls),
|
||||
6,
|
||||
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
|
||||
BitPackage.SHIFT_MASK_4BITS,
|
||||
BitPackage.BIT_SHIFT_4BITS,
|
||||
BitPackage.UNIT_MASK_4BITS, EUCJP_st),
|
||||
EUCJPCharLenTable, "EUC-JP")
|
||||
{
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// State-machine model data for "EUC-KR". The constructor hands the SMModel
/// base class a packed byte-class table (EUCKR_cls, eight byte values per
/// Pack4bits call), the value 4 (presumably the number of classes; it
/// matches the length of EUCKRCharLenTable - confirm against SMModel), a
/// packed state table (EUCKR_st) and the per-class character lengths.
/// </summary>
public class EUCKRSMModel : SMModel
|
||||
{
|
||||
private readonly static int[] EUCKR_cls = {
|
||||
//BitPacket.Pack4bits(0,1,1,1,1,1,1,1), // 00 - 07
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 00 - 07
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,0,0), // 08 - 0f
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 10 - 17
|
||||
BitPackage.Pack4bits(1,1,1,0,1,1,1,1), // 18 - 1f
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 20 - 27
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 28 - 2f
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 30 - 37
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 38 - 3f
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 40 - 47
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 48 - 4f
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 50 - 57
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 58 - 5f
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 60 - 67
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 68 - 6f
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 70 - 77
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 78 - 7f
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 80 - 87
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 88 - 8f
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 90 - 97
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 98 - 9f
|
||||
BitPackage.Pack4bits(0,2,2,2,2,2,2,2), // a0 - a7
|
||||
BitPackage.Pack4bits(2,2,2,2,2,3,3,3), // a8 - af
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b0 - b7
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b8 - bf
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c0 - c7
|
||||
BitPackage.Pack4bits(2,3,2,2,2,2,2,2), // c8 - cf
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d0 - d7
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d8 - df
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // e0 - e7
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // e8 - ef
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // f0 - f7
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,0) // f8 - ff
|
||||
};
|
||||
|
||||
private readonly static int[] EUCKR_st = {
|
||||
BitPackage.Pack4bits(ERROR,START, 3,ERROR,ERROR,ERROR,ERROR,ERROR),//00-07
|
||||
BitPackage.Pack4bits(ITSME,ITSME,ITSME,ITSME,ERROR,ERROR,START,START) //08-0f
|
||||
};
|
||||
|
||||
private readonly static int[] EUCKRCharLenTable = { 0, 1, 2, 0 };
|
||||
|
||||
public EUCKRSMModel() : base(
|
||||
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
|
||||
BitPackage.SHIFT_MASK_4BITS,
|
||||
BitPackage.BIT_SHIFT_4BITS,
|
||||
BitPackage.UNIT_MASK_4BITS, EUCKR_cls),
|
||||
4,
|
||||
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
|
||||
BitPackage.SHIFT_MASK_4BITS,
|
||||
BitPackage.BIT_SHIFT_4BITS,
|
||||
BitPackage.UNIT_MASK_4BITS, EUCKR_st),
|
||||
EUCKRCharLenTable, "EUC-KR")
|
||||
{
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// State-machine model data for "EUC-TW". The constructor hands the SMModel
/// base class a packed byte-class table (EUCTW_cls, eight byte values per
/// Pack4bits call), the value 7 (presumably the number of classes; it
/// matches the length of EUCTWCharLenTable - confirm against SMModel), a
/// packed state table (EUCTW_st) and the per-class character lengths.
/// </summary>
public class EUCTWSMModel : SMModel
|
||||
{
|
||||
private readonly static int[] EUCTW_cls = {
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 00 - 07
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,0,0), // 08 - 0f
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 10 - 17
|
||||
BitPackage.Pack4bits(2,2,2,0,2,2,2,2), // 18 - 1f
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 20 - 27
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 28 - 2f
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 30 - 37
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 38 - 3f
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 40 - 47
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 48 - 4f
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 50 - 57
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 58 - 5f
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 60 - 67
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 68 - 6f
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 70 - 77
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 78 - 7f
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 80 - 87
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,6,0), // 88 - 8f
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 90 - 97
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 98 - 9f
|
||||
BitPackage.Pack4bits(0,3,4,4,4,4,4,4), // a0 - a7
|
||||
BitPackage.Pack4bits(5,5,1,1,1,1,1,1), // a8 - af
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // b0 - b7
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // b8 - bf
|
||||
BitPackage.Pack4bits(1,1,3,1,3,3,3,3), // c0 - c7
|
||||
BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // c8 - cf
|
||||
BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // d0 - d7
|
||||
BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // d8 - df
|
||||
BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // e0 - e7
|
||||
BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // e8 - ef
|
||||
BitPackage.Pack4bits(3,3,3,3,3,3,3,3), // f0 - f7
|
||||
BitPackage.Pack4bits(3,3,3,3,3,3,3,0) // f8 - ff
|
||||
};
|
||||
|
||||
private readonly static int[] EUCTW_st = {
|
||||
BitPackage.Pack4bits(ERROR,ERROR,START, 3, 3, 3, 4,ERROR),//00-07
|
||||
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ITSME,ITSME),//08-0f
|
||||
BitPackage.Pack4bits(ITSME,ITSME,ITSME,ITSME,ITSME,ERROR,START,ERROR),//10-17
|
||||
BitPackage.Pack4bits(START,START,START,ERROR,ERROR,ERROR,ERROR,ERROR),//18-1f
|
||||
BitPackage.Pack4bits( 5,ERROR,ERROR,ERROR,START,ERROR,START,START),//20-27
|
||||
BitPackage.Pack4bits(START,ERROR,START,START,START,START,START,START) //28-2f
|
||||
};
|
||||
|
||||
private readonly static int[] EUCTWCharLenTable = { 0, 0, 1, 2, 2, 2, 3 };
|
||||
|
||||
public EUCTWSMModel() : base(
|
||||
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
|
||||
BitPackage.SHIFT_MASK_4BITS,
|
||||
BitPackage.BIT_SHIFT_4BITS,
|
||||
BitPackage.UNIT_MASK_4BITS, EUCTW_cls),
|
||||
7,
|
||||
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
|
||||
BitPackage.SHIFT_MASK_4BITS,
|
||||
BitPackage.BIT_SHIFT_4BITS,
|
||||
BitPackage.UNIT_MASK_4BITS, EUCTW_st),
|
||||
EUCTWCharLenTable, "EUC-TW")
|
||||
{
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// State machine model for Shift_JIS. Supplies the byte-class table, the
/// state-transition table and the per-class character lengths to
/// <see cref="SMModel"/>; both tables are packed eight 4-bit entries per int.
/// </summary>
public class SJISSMModel : SMModel
{
    // Class factor handed to the base class.
    private const int CLASS_FACTOR = 6;

    // Byte value -> class id, one packed int per run of eight byte values.
    private static readonly int[] byteClasses = {
        // An earlier variant of the first row classified byte 0x00 as 0;
        // the row in use classifies it as 1 like its neighbours.
        BitPackage.Pack4bits(1,1,1,1,1,1,1,1),  // 00 - 07
        BitPackage.Pack4bits(1,1,1,1,1,1,0,0),  // 08 - 0f
        BitPackage.Pack4bits(1,1,1,1,1,1,1,1),  // 10 - 17
        BitPackage.Pack4bits(1,1,1,0,1,1,1,1),  // 18 - 1f
        BitPackage.Pack4bits(1,1,1,1,1,1,1,1),  // 20 - 27
        BitPackage.Pack4bits(1,1,1,1,1,1,1,1),  // 28 - 2f
        BitPackage.Pack4bits(1,1,1,1,1,1,1,1),  // 30 - 37
        BitPackage.Pack4bits(1,1,1,1,1,1,1,1),  // 38 - 3f
        BitPackage.Pack4bits(2,2,2,2,2,2,2,2),  // 40 - 47
        BitPackage.Pack4bits(2,2,2,2,2,2,2,2),  // 48 - 4f
        BitPackage.Pack4bits(2,2,2,2,2,2,2,2),  // 50 - 57
        BitPackage.Pack4bits(2,2,2,2,2,2,2,2),  // 58 - 5f
        BitPackage.Pack4bits(2,2,2,2,2,2,2,2),  // 60 - 67
        BitPackage.Pack4bits(2,2,2,2,2,2,2,2),  // 68 - 6f
        BitPackage.Pack4bits(2,2,2,2,2,2,2,2),  // 70 - 77
        BitPackage.Pack4bits(2,2,2,2,2,2,2,1),  // 78 - 7f
        BitPackage.Pack4bits(3,3,3,3,3,3,3,3),  // 80 - 87
        BitPackage.Pack4bits(3,3,3,3,3,3,3,3),  // 88 - 8f
        BitPackage.Pack4bits(3,3,3,3,3,3,3,3),  // 90 - 97
        BitPackage.Pack4bits(3,3,3,3,3,3,3,3),  // 98 - 9f
        // 0xa0 is illegal in Shift_JIS, but some pages do contain that byte,
        // so the table is deliberately forgiving and gives it class 2.
        BitPackage.Pack4bits(2,2,2,2,2,2,2,2),  // a0 - a7
        BitPackage.Pack4bits(2,2,2,2,2,2,2,2),  // a8 - af
        BitPackage.Pack4bits(2,2,2,2,2,2,2,2),  // b0 - b7
        BitPackage.Pack4bits(2,2,2,2,2,2,2,2),  // b8 - bf
        BitPackage.Pack4bits(2,2,2,2,2,2,2,2),  // c0 - c7
        BitPackage.Pack4bits(2,2,2,2,2,2,2,2),  // c8 - cf
        BitPackage.Pack4bits(2,2,2,2,2,2,2,2),  // d0 - d7
        BitPackage.Pack4bits(2,2,2,2,2,2,2,2),  // d8 - df
        BitPackage.Pack4bits(3,3,3,3,3,3,3,3),  // e0 - e7
        BitPackage.Pack4bits(3,3,3,3,3,4,4,4),  // e8 - ef
        BitPackage.Pack4bits(4,4,4,4,4,4,4,4),  // f0 - f7
        BitPackage.Pack4bits(4,4,4,4,4,0,0,0)   // f8 - ff
    };

    // Packed state-transition entries; START, ERROR and ITSME are the
    // constants inherited from SMModel.
    private static readonly int[] transitions = {
        BitPackage.Pack4bits(ERROR,START,START,    3,ERROR,ERROR,ERROR,ERROR), // 00-07
        BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ITSME,ITSME,ITSME,ITSME), // 08-0f
        BitPackage.Pack4bits(ITSME,ITSME,ERROR,ERROR,START,START,START,START)  // 10-17
    };

    // Character length associated with each class id.
    private static readonly int[] charLengths = { 0, 1, 1, 2, 0, 0 };

    /// <summary>
    /// Builds the Shift_JIS model from the static tables above.
    /// </summary>
    public SJISSMModel()
        : base(FourBitPackage(byteClasses),
               CLASS_FACTOR,
               FourBitPackage(transitions),
               charLengths,
               "Shift_JIS")
    {
    }

    // Wraps packed data in a BitPackage configured with the 4-bit constants.
    private static BitPackage FourBitPackage(int[] packed)
    {
        return new BitPackage(
            BitPackage.INDEX_SHIFT_4BITS,
            BitPackage.SHIFT_MASK_4BITS,
            BitPackage.BIT_SHIFT_4BITS,
            BitPackage.UNIT_MASK_4BITS,
            packed);
    }
}
|
||||
|
||||
/// <summary>
/// State machine model registered under the name "UTF-16BE". Supplies the
/// byte-class table, the state-transition table and the per-class character
/// lengths to <see cref="SMModel"/>; both tables are packed eight 4-bit
/// entries per int.
/// </summary>
public class UCS2BESMModel : SMModel
{
    // Class factor handed to the base class.
    private const int CLASS_FACTOR = 6;

    // Byte value -> class id, one packed int per run of eight byte values.
    private static readonly int[] byteClasses = {
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 00 - 07
        BitPackage.Pack4bits(0,0,1,0,0,2,0,0),  // 08 - 0f
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 10 - 17
        BitPackage.Pack4bits(0,0,0,3,0,0,0,0),  // 18 - 1f
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 20 - 27
        BitPackage.Pack4bits(0,3,3,3,3,3,0,0),  // 28 - 2f
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 30 - 37
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 38 - 3f
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 40 - 47
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 48 - 4f
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 50 - 57
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 58 - 5f
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 60 - 67
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 68 - 6f
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 70 - 77
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 78 - 7f
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 80 - 87
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 88 - 8f
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 90 - 97
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 98 - 9f
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // a0 - a7
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // a8 - af
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // b0 - b7
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // b8 - bf
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // c0 - c7
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // c8 - cf
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // d0 - d7
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // d8 - df
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // e0 - e7
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // e8 - ef
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // f0 - f7
        BitPackage.Pack4bits(0,0,0,0,0,0,4,5)   // f8 - ff
    };

    // Packed state-transition entries; START, ERROR and ITSME are the
    // constants inherited from SMModel.
    private static readonly int[] transitions = {
        BitPackage.Pack4bits(    5,    7,    7,ERROR,    4,    3,ERROR,ERROR), // 00-07
        BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ITSME,ITSME,ITSME,ITSME), // 08-0f
        BitPackage.Pack4bits(ITSME,ITSME,    6,    6,    6,    6,ERROR,ERROR), // 10-17
        BitPackage.Pack4bits(    6,    6,    6,    6,    6,ITSME,    6,    6), // 18-1f
        BitPackage.Pack4bits(    6,    6,    6,    6,    5,    7,    7,ERROR), // 20-27
        BitPackage.Pack4bits(    5,    8,    6,    6,ERROR,    6,    6,    6), // 28-2f
        BitPackage.Pack4bits(    6,    6,    6,    6,ERROR,ERROR,START,START)  // 30-37
    };

    // Character length associated with each class id.
    private static readonly int[] charLengths = { 2, 2, 2, 0, 2, 2 };

    /// <summary>
    /// Builds the UTF-16BE model from the static tables above.
    /// </summary>
    public UCS2BESMModel()
        : base(FourBitPackage(byteClasses),
               CLASS_FACTOR,
               FourBitPackage(transitions),
               charLengths,
               "UTF-16BE")
    {
    }

    // Wraps packed data in a BitPackage configured with the 4-bit constants.
    private static BitPackage FourBitPackage(int[] packed)
    {
        return new BitPackage(
            BitPackage.INDEX_SHIFT_4BITS,
            BitPackage.SHIFT_MASK_4BITS,
            BitPackage.BIT_SHIFT_4BITS,
            BitPackage.UNIT_MASK_4BITS,
            packed);
    }
}
|
||||
|
||||
/// <summary>
/// State machine model registered under the name "UTF-16LE". Supplies the
/// byte-class table, the state-transition table and the per-class character
/// lengths to <see cref="SMModel"/>; both tables are packed eight 4-bit
/// entries per int.
/// </summary>
public class UCS2LESMModel : SMModel
{
    // Class factor handed to the base class.
    private const int CLASS_FACTOR = 6;

    // Byte value -> class id, one packed int per run of eight byte values.
    private static readonly int[] byteClasses = {
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 00 - 07
        BitPackage.Pack4bits(0,0,1,0,0,2,0,0),  // 08 - 0f
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 10 - 17
        BitPackage.Pack4bits(0,0,0,3,0,0,0,0),  // 18 - 1f
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 20 - 27
        BitPackage.Pack4bits(0,3,3,3,3,3,0,0),  // 28 - 2f
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 30 - 37
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 38 - 3f
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 40 - 47
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 48 - 4f
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 50 - 57
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 58 - 5f
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 60 - 67
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 68 - 6f
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 70 - 77
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 78 - 7f
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 80 - 87
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 88 - 8f
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 90 - 97
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // 98 - 9f
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // a0 - a7
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // a8 - af
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // b0 - b7
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // b8 - bf
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // c0 - c7
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // c8 - cf
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // d0 - d7
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // d8 - df
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // e0 - e7
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // e8 - ef
        BitPackage.Pack4bits(0,0,0,0,0,0,0,0),  // f0 - f7
        BitPackage.Pack4bits(0,0,0,0,0,0,4,5)   // f8 - ff
    };

    // Packed state-transition entries; START, ERROR and ITSME are the
    // constants inherited from SMModel.
    private static readonly int[] transitions = {
        BitPackage.Pack4bits(    6,    6,    7,    6,    4,    3,ERROR,ERROR), // 00-07
        BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ITSME,ITSME,ITSME,ITSME), // 08-0f
        BitPackage.Pack4bits(ITSME,ITSME,    5,    5,    5,ERROR,ITSME,ERROR), // 10-17
        BitPackage.Pack4bits(    5,    5,    5,ERROR,    5,ERROR,    6,    6), // 18-1f
        BitPackage.Pack4bits(    7,    6,    8,    8,    5,    5,    5,ERROR), // 20-27
        BitPackage.Pack4bits(    5,    5,    5,ERROR,ERROR,ERROR,    5,    5), // 28-2f
        BitPackage.Pack4bits(    5,    5,    5,ERROR,    5,ERROR,START,START)  // 30-37
    };

    // Character length associated with each class id.
    private static readonly int[] charLengths = { 2, 2, 2, 2, 2, 2 };

    /// <summary>
    /// Builds the UTF-16LE model from the static tables above.
    /// </summary>
    public UCS2LESMModel()
        : base(FourBitPackage(byteClasses),
               CLASS_FACTOR,
               FourBitPackage(transitions),
               charLengths,
               "UTF-16LE")
    {
    }

    // Wraps packed data in a BitPackage configured with the 4-bit constants.
    private static BitPackage FourBitPackage(int[] packed)
    {
        return new BitPackage(
            BitPackage.INDEX_SHIFT_4BITS,
            BitPackage.SHIFT_MASK_4BITS,
            BitPackage.BIT_SHIFT_4BITS,
            BitPackage.UNIT_MASK_4BITS,
            packed);
    }
}
|
||||
|
||||
|
||||
}
|
@ -0,0 +1,180 @@
|
||||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Mozilla Public License Version
|
||||
* 1.1 (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/MPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is Mozilla Universal charset detector code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is
|
||||
* Netscape Communications Corporation.
|
||||
* Portions created by the Initial Developer are Copyright (C) 2001
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the MPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
using System;
|
||||
|
||||
namespace UniversalDetector.Core
|
||||
{
|
||||
/// <summary>
/// Group prober that feeds the same (filtered) input to a fixed set of
/// single-byte charset probers and reports the most confident one.
/// </summary>
public class SBCSGroupProber : CharsetProber
{
    private const int PROBERS_NUM = 13;

    private CharsetProber[] probers = new CharsetProber[PROBERS_NUM];

    // isActive[i] is cleared once probers[i] has answered NotMe.
    private bool[] isActive = new bool[PROBERS_NUM];

    // Index into probers of the current best candidate, or -1 when unknown.
    private int bestGuess;

    // Number of entries in isActive that are still true.
    private int activeNum;

    /// <summary>
    /// Creates the child probers and puts the group into its initial state.
    /// </summary>
    public SBCSGroupProber()
    {
        probers[0] = new SingleByteCharSetProber(new Win1251Model());
        probers[1] = new SingleByteCharSetProber(new Koi8rModel());
        probers[2] = new SingleByteCharSetProber(new Latin5Model());
        probers[3] = new SingleByteCharSetProber(new MacCyrillicModel());
        probers[4] = new SingleByteCharSetProber(new Ibm866Model());
        probers[5] = new SingleByteCharSetProber(new Ibm855Model());
        probers[6] = new SingleByteCharSetProber(new Latin7Model());
        probers[7] = new SingleByteCharSetProber(new Win1253Model());
        probers[8] = new SingleByteCharSetProber(new Latin5BulgarianModel());
        probers[9] = new SingleByteCharSetProber(new Win1251BulgarianModel());

        // The Hebrew prober is shared as the name prober of the two
        // Win1255 probers below and is told about both of them.
        HebrewProber hebprober = new HebrewProber();
        probers[10] = hebprober;
        // Logical
        probers[11] = new SingleByteCharSetProber(new Win1255Model(), false, hebprober);
        // Visual
        probers[12] = new SingleByteCharSetProber(new Win1255Model(), true, hebprober);
        hebprober.SetModelProbers(probers[11], probers[12]);

        // Latin2 stays disabled until latin1 is available, otherwise all latin1
        // would be detected as latin2 because of their similarity.
        //probers[13] = new SingleByteCharSetProber(new Latin2HungarianModel());
        //probers[14] = new SingleByteCharSetProber(new Win1250HungarianModel());
        Reset();
    }

    /// <summary>
    /// Filters the input and passes it to every active child prober.
    /// </summary>
    /// <param name="buf">Input buffer.</param>
    /// <param name="offset">Index of the first byte to examine.</param>
    /// <param name="len">Number of bytes to examine.</param>
    /// <returns>
    /// FoundIt as soon as one child reports FoundIt, NotMe once every child
    /// has been deactivated, otherwise the current state.
    /// </returns>
    public override ProbingState HandleData(byte[] buf, int offset, int len)
    {
        ProbingState st = ProbingState.NotMe;

        // The buffer is filtered once for all children, without regard to
        // each model's KeepEnglishLetter setting: as of now no prober here
        // recognizes a language written with English characters.
        byte[] newBuf = FilterWithoutEnglishLetters(buf, offset, len);
        if (newBuf.Length == 0)
            return state; // Nothing to see here, move on.

        for (int i = 0; i < PROBERS_NUM; i++) {
            if (!isActive[i])
                continue;
            st = probers[i].HandleData(newBuf, 0, newBuf.Length);

            if (st == ProbingState.FoundIt) {
                bestGuess = i;
                state = ProbingState.FoundIt;
                break;
            } else if (st == ProbingState.NotMe) {
                isActive[i] = false;
                activeNum--;
                // Only give up once the last active prober has dropped out.
                if (activeNum <= 0) {
                    state = ProbingState.NotMe;
                    break;
                }
            }
        }
        return state;
    }

    /// <summary>
    /// Returns 0.99 when found, 0.01 when ruled out, otherwise the highest
    /// confidence among the active children (also recorded in bestGuess).
    /// </summary>
    public override float GetConfidence()
    {
        float bestConf = 0.0f, cf;
        switch (state) {
        case ProbingState.FoundIt:
            return 0.99f; //sure yes
        case ProbingState.NotMe:
            return 0.01f; //sure no
        default:
            for (int i = 0; i < PROBERS_NUM; i++)
            {
                if (!isActive[i])
                    continue;
                cf = probers[i].GetConfidence();
                if (bestConf < cf)
                {
                    bestConf = cf;
                    bestGuess = i;
                }
            }
            break;
        }
        return bestConf;
    }

    /// <summary>
    /// Asks every active child prober to dump its status.
    /// </summary>
    public override void DumpStatus()
    {
        // Called for its side effect: it refreshes bestGuess.
        GetConfidence();
        // Console.WriteLine(" SBCS Group Prober --------begin status");
        for (int i = 0; i < PROBERS_NUM; i++) {
            if (isActive[i])
                probers[i].DumpStatus();
            //else
            //Console.WriteLine(" inactive: [{0}] (i.e. confidence is too low).", probers[i].GetCharsetName());
        }
        //Console.WriteLine(" SBCS Group found best match [{0}] confidence {1}.", probers[bestGuess].GetCharsetName(), cf);
    }

    /// <summary>
    /// Resets every child prober, marks all existing probers active and
    /// returns the group to the Detecting state.
    /// </summary>
    public override void Reset()
    {
        // Assign the FIELD. A local declaration here would shadow it and
        // leave the field at 0, so the first child answering NotMe would
        // drive the count negative and rule out the whole group.
        activeNum = 0;
        for (int i = 0; i < PROBERS_NUM; i++) {
            if (probers[i] != null) {
                probers[i].Reset();
                isActive[i] = true;
                activeNum++;
            } else {
                isActive[i] = false;
            }
        }
        bestGuess = -1;
        state = ProbingState.Detecting;
    }

    /// <summary>
    /// Returns the charset name of the best candidate, falling back to the
    /// first prober when no child shows a positive confidence.
    /// </summary>
    public override string GetCharsetName()
    {
        //if we have no answer yet
        if (bestGuess == -1) {
            GetConfidence();
            //no charset seems positive
            if (bestGuess == -1)
                bestGuess = 0;
        }
        return probers[bestGuess].GetCharsetName();
    }
}
|
||||
}
|
@ -0,0 +1,170 @@
|
||||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Mozilla Public License Version
|
||||
* 1.1 (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/MPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is Mozilla Universal charset detector code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is
|
||||
* Netscape Communications Corporation.
|
||||
* Portions created by the Initial Developer are Copyright (C) 2001
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the MPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
using System;
|
||||
|
||||
namespace UniversalDetector.Core
|
||||
{
|
||||
|
||||
/// <summary>
/// Prober for one single-byte charset. It maps each input byte to an order
/// through a <see cref="SequenceModel"/>, counts how often adjacent pairs of
/// sampled orders fall into each precedence category, and derives a
/// confidence from those counters.
/// </summary>
public class SingleByteCharSetProber : CharsetProber
{
    private const int SAMPLE_SIZE = 64;
    private const int SB_ENOUGH_REL_THRESHOLD = 1024;
    private const float POSITIVE_SHORTCUT_THRESHOLD = 0.95f;
    private const float NEGATIVE_SHORTCUT_THRESHOLD = 0.05f;
    private const int SYMBOL_CAT_ORDER = 250;
    private const int NUMBER_OF_SEQ_CAT = 4;
    private const int POSITIVE_CAT = NUMBER_OF_SEQ_CAT-1;
    private const int NEGATIVE_CAT = 0;

    protected SequenceModel model;

    // When true, each pair is looked up in the model in reverse order.
    private bool reversed;

    // Order of the previously seen byte.
    private byte lastOrder;

    // Number of pairs counted, and number of bytes below SYMBOL_CAT_ORDER.
    private int totalSeqs;
    private int totalChar;

    // One counter per precedence category returned by the model.
    private int[] seqCounters = new int[NUMBER_OF_SEQ_CAT];

    // Number of bytes whose order falls inside the sampling range.
    private int freqChar;

    // Optional auxiliary prober that decides the reported name; it is
    // supplied by whoever constructs this prober.
    private CharsetProber nameProber;

    /// <summary>
    /// Creates a non-reversed prober that reports the model's own name.
    /// </summary>
    public SingleByteCharSetProber(SequenceModel model)
        : this(model, false, null)
    {
    }

    /// <summary>
    /// Creates a prober for <paramref name="model"/>.
    /// </summary>
    /// <param name="model">Sequence model used for all lookups.</param>
    /// <param name="reversed">Whether pairs are looked up reversed.</param>
    /// <param name="nameProber">Prober to ask for the charset name, or null.</param>
    public SingleByteCharSetProber(SequenceModel model, bool reversed,
                                   CharsetProber nameProber)
    {
        this.model = model;
        this.reversed = reversed;
        this.nameProber = nameProber;
        Reset();
    }

    /// <summary>
    /// Updates the counters from <paramref name="len"/> bytes starting at
    /// <paramref name="offset"/> and, once more than SB_ENOUGH_REL_THRESHOLD
    /// pairs were counted, settles the state when the confidence crosses
    /// one of the shortcut thresholds.
    /// </summary>
    public override ProbingState HandleData(byte[] buf, int offset, int len)
    {
        int end = offset + len;

        for (int pos = offset; pos < end; pos++)
        {
            byte current = model.GetOrder(buf[pos]);

            if (current < SYMBOL_CAT_ORDER)
            {
                totalChar++;
            }

            if (current < SAMPLE_SIZE)
            {
                freqChar++;

                if (lastOrder < SAMPLE_SIZE)
                {
                    totalSeqs++;
                    // The reversed variant swaps the roles of the two orders.
                    int pairIndex = reversed
                        ? current * SAMPLE_SIZE + lastOrder
                        : lastOrder * SAMPLE_SIZE + current;
                    seqCounters[model.GetPrecedence(pairIndex)]++;
                }
            }

            lastOrder = current;
        }

        if (state != ProbingState.Detecting || totalSeqs <= SB_ENOUGH_REL_THRESHOLD)
        {
            return state;
        }

        float confidence = GetConfidence();
        if (confidence > POSITIVE_SHORTCUT_THRESHOLD)
        {
            state = ProbingState.FoundIt;
        }
        else if (confidence < NEGATIVE_SHORTCUT_THRESHOLD)
        {
            state = ProbingState.NotMe;
        }
        return state;
    }

    /// <summary>
    /// Diagnostic hook; the output statement is currently disabled.
    /// </summary>
    public override void DumpStatus()
    {
        //Console.WriteLine(" SBCS: {0} [{1}]", GetConfidence(), GetCharsetName());
    }

    /// <summary>
    /// Returns 0.01 when no pair has been counted. Otherwise returns the
    /// share of pairs in the positive category, divided by the model's
    /// typical positive ratio and scaled by freqChar / totalChar; results
    /// of 1.0 or more are reported as 0.99.
    /// </summary>
    public override float GetConfidence()
    {
        if (totalSeqs <= 0)
        {
            return 0.01f;
        }

        float ratio = 1.0f * seqCounters[POSITIVE_CAT] / totalSeqs / model.TypicalPositiveRatio;
        ratio = ratio * freqChar / totalChar;
        if (ratio >= 1.0f)
        {
            ratio = 0.99f;
        }
        return ratio;
    }

    /// <summary>
    /// Returns to the Detecting state and clears every counter.
    /// </summary>
    public override void Reset()
    {
        state = ProbingState.Detecting;
        // 255 is outside the sampling range, so the first byte never
        // forms a pair with a stale predecessor.
        lastOrder = 255;
        Array.Clear(seqCounters, 0, NUMBER_OF_SEQ_CAT);
        totalSeqs = 0;
        totalChar = 0;
        freqChar = 0;
    }

    /// <summary>
    /// Returns the name prober's charset name when one was supplied,
    /// otherwise the model's charset name.
    /// </summary>
    public override string GetCharsetName()
    {
        if (nameProber == null)
        {
            return model.CharsetName;
        }
        return nameProber.GetCharsetName();
    }
}
|
||||
}
|
@ -0,0 +1,116 @@
|
||||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Mozilla Public License Version
|
||||
* 1.1 (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/MPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is Mozilla Universal charset detector code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is
|
||||
* Netscape Communications Corporation.
|
||||
* Portions created by the Initial Developer are Copyright (C) 2001
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the MPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
namespace UniversalDetector.Core
|
||||
{
|
||||
/// <summary>
/// For the S-JIS encoding, observe these characteristics:
/// 1, kana characters (or hankaku?) often have a high frequency of appearance
/// 2, kana characters often exist in groups
/// 3, certain combinations of kana are never used in the Japanese language
/// </summary>
public class SJISProber : CharsetProber
{
    private CodingStateMachine codingSM;
    private SJISContextAnalyser contextAnalyser;
    private SJISDistributionAnalyser distributionAnalyser;

    // lastChar[0] holds the final byte of the previous HandleData call;
    // lastChar[1] receives the first byte of the current call, so a
    // character completed by that first byte can be read from this buffer.
    private byte[] lastChar = new byte[2];

    /// <summary>
    /// Creates the state machine and both analysers, then resets them.
    /// </summary>
    public SJISProber()
    {
        codingSM = new CodingStateMachine(new SJISSMModel());
        distributionAnalyser = new SJISDistributionAnalyser();
        contextAnalyser = new SJISContextAnalyser();
        Reset();
    }

    /// <summary>
    /// Returns the name this prober reports.
    /// </summary>
    public override string GetCharsetName()
    {
        return "Shift-JIS";
    }

    /// <summary>
    /// Runs <paramref name="len"/> bytes starting at <paramref name="offset"/>
    /// through the coding state machine and hands every completed character
    /// to the context and distribution analysers.
    /// </summary>
    /// <param name="buf">Input buffer.</param>
    /// <param name="offset">Index of the first byte to examine.</param>
    /// <param name="len">Number of bytes to examine.</param>
    /// <returns>
    /// NotMe when the state machine reports an error, FoundIt when it
    /// reports a definite match or the confidence passes the shortcut
    /// threshold with enough data, otherwise the current state.
    /// </returns>
    public override ProbingState HandleData(byte[] buf, int offset, int len)
    {
        // Nothing to examine. Without this guard the trailing
        // buf[max-1] read would index before the requested window
        // (and throw when offset is 0).
        if (len <= 0)
            return state;

        int codingState;
        int max = offset + len;

        for (int i = offset; i < max; i++) {
            codingState = codingSM.NextState(buf[i]);
            if (codingState == SMModel.ERROR) {
                state = ProbingState.NotMe;
                break;
            }
            if (codingState == SMModel.ITSME) {
                state = ProbingState.FoundIt;
                break;
            }
            if (codingState == SMModel.START) {
                // The state machine is back at START: a character just ended.
                int charLen = codingSM.CurrentCharLen;
                if (i == offset) {
                    // The character may have begun in the previous call,
                    // so read it from lastChar instead of buf.
                    lastChar[1] = buf[offset];
                    contextAnalyser.HandleOneChar(lastChar, 2-charLen, charLen);
                    distributionAnalyser.HandleOneChar(lastChar, 0, charLen);
                } else {
                    contextAnalyser.HandleOneChar(buf, i+1-charLen, charLen);
                    distributionAnalyser.HandleOneChar(buf, i-1, charLen);
                }
            }
        }
        // Remember the last byte for the next call.
        lastChar[0] = buf[max-1];
        if (state == ProbingState.Detecting)
            if (contextAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
                state = ProbingState.FoundIt;
        return state;
    }

    /// <summary>
    /// Resets the state machine and both analysers and returns to the
    /// Detecting state.
    /// </summary>
    public override void Reset()
    {
        codingSM.Reset();
        state = ProbingState.Detecting;
        contextAnalyser.Reset();
        distributionAnalyser.Reset();
    }

    /// <summary>
    /// Returns the larger of the context and distribution confidences.
    /// </summary>
    public override float GetConfidence()
    {
        float contxtCf = contextAnalyser.GetConfidence();
        float distribCf = distributionAnalyser.GetConfidence();
        return (contxtCf > distribCf ? contxtCf : distribCf);
    }
}
|
||||
}
|
@ -0,0 +1,83 @@
|
||||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Mozilla Public License Version
|
||||
* 1.1 (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/MPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is Mozilla Universal charset detector code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is
|
||||
* Netscape Communications Corporation.
|
||||
* Portions created by the Initial Developer are Copyright (C) 2001
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
* Kohei TAKETA <k-tak@void.in> (Java port)
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the MPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
using System;
|
||||
|
||||
namespace UniversalDetector.Core
|
||||
{
|
||||
/// <summary>
/// State machine model: bundles a class table, a state table, a character
/// length table, a class factor and a name for one encoding.
/// </summary>
public abstract class SMModel
{
    // State values used by the state tables of the concrete models.
    public const int START = 0;
    public const int ERROR = 1;
    public const int ITSME = 2;

    public BitPackage classTable;
    public BitPackage stateTable;
    public int[] charLenTable;

    private string modelName;
    private int factor;

    /// <summary>
    /// Stores the supplied tables and values without copying them.
    /// </summary>
    /// <param name="classTable">Packed class table.</param>
    /// <param name="classFactor">Value exposed through <see cref="ClassFactor"/>.</param>
    /// <param name="stateTable">Packed state table.</param>
    /// <param name="charLenTable">Character length table.</param>
    /// <param name="name">Value exposed through <see cref="Name"/>.</param>
    public SMModel(BitPackage classTable, int classFactor,
                   BitPackage stateTable, int[] charLenTable, string name)
    {
        this.classTable = classTable;
        this.stateTable = stateTable;
        this.charLenTable = charLenTable;
        factor = classFactor;
        modelName = name;
    }

    /// <summary>
    /// Name given at construction time.
    /// </summary>
    public string Name
    {
        get { return modelName; }
    }

    /// <summary>
    /// Class factor given at construction time.
    /// </summary>
    public int ClassFactor
    {
        get { return factor; }
    }

    /// <summary>
    /// Looks up the class of a byte by unpacking the class table entry at
    /// the byte's value.
    /// </summary>
    public int GetClass(byte b)
    {
        int index = (int)b;
        return classTable.Unpack(index);
    }
}
|
||||
}
|
@ -0,0 +1,97 @@
|
||||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Mozilla Public License Version
|
||||
* 1.1 (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/MPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is Mozilla Universal charset detector code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is
|
||||
* Netscape Communications Corporation.
|
||||
* Portions created by the Initial Developer are Copyright (C) 2001
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the MPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
using System;
|
||||
|
||||
namespace UniversalDetector.Core
|
||||
{
|
||||
/// <summary>
/// Base type for a single-byte language model: it bundles the two lookup
/// tables, the typical positive ratio and the charset name that are handed
/// to the constructor, and exposes them through read-only accessors.
/// </summary>
public abstract class SequenceModel
{
    /// <summary>
    /// [256] table used to find a character's order.
    /// </summary>
    protected byte[] charToOrderMap;

    /// <summary>
    /// [SAMPLE_SIZE][SAMPLE_SIZE] table, stored as a flat array, used to
    /// find the frequency of a two-character sequence.
    /// </summary>
    protected byte[] precedenceMatrix;

    /// <summary>
    /// freqSeqs / totalSeqs.
    /// </summary>
    protected float typicalPositiveRatio;

    /// <summary>
    /// Not used.
    /// </summary>
    protected bool keepEnglishLetter;

    /// <summary>
    /// Name of the charset this model describes.
    /// </summary>
    protected string charsetName;

    /// <summary>
    /// Stores the supplied tables and settings. The arrays are kept by
    /// reference; no copy and no validation is performed.
    /// </summary>
    /// <param name="charToOrderMap">byte-to-order lookup table</param>
    /// <param name="precedenceMatrix">flattened sequence-frequency table</param>
    /// <param name="typicalPositiveRatio">freqSeqs / totalSeqs for this model</param>
    /// <param name="keepEnglishLetter">stored but marked "not used" in this class</param>
    /// <param name="charsetName">name reported through <see cref="CharsetName"/></param>
    public SequenceModel(
        byte[] charToOrderMap,
        byte[] precedenceMatrix,
        float typicalPositiveRatio,
        bool keepEnglishLetter,
        string charsetName)
    {
        this.charsetName = charsetName;
        this.keepEnglishLetter = keepEnglishLetter;
        this.typicalPositiveRatio = typicalPositiveRatio;
        this.precedenceMatrix = precedenceMatrix;
        this.charToOrderMap = charToOrderMap;
    }

    /// <summary>
    /// Gets the value of <see cref="typicalPositiveRatio"/>.
    /// </summary>
    public float TypicalPositiveRatio
    {
        get { return typicalPositiveRatio; }
    }

    /// <summary>
    /// Gets the value of <see cref="keepEnglishLetter"/>.
    /// </summary>
    public bool KeepEnglishLetter
    {
        get { return keepEnglishLetter; }
    }

    /// <summary>
    /// Gets the value of <see cref="charsetName"/>.
    /// </summary>
    public string CharsetName
    {
        get { return charsetName; }
    }

    /// <summary>
    /// Looks up the order of a byte in <see cref="charToOrderMap"/>.
    /// </summary>
    /// <param name="b">byte value used as the table index</param>
    /// <returns>the table entry at index <paramref name="b"/></returns>
    public byte GetOrder(byte b)
    {
        int index = b;
        return charToOrderMap[index];
    }

    /// <summary>
    /// Reads one entry of <see cref="precedenceMatrix"/>.
    /// </summary>
    /// <param name="pos">index into the flat matrix array</param>
    /// <returns>the table entry at <paramref name="pos"/></returns>
    public byte GetPrecedence(int pos)
    {
        byte entry = precedenceMatrix[pos];
        return entry;
    }
}
|
||||
}
|
@ -0,0 +1,112 @@
|
||||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Mozilla Public License Version
|
||||
* 1.1 (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/MPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is Mozilla Universal charset detector code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is
|
||||
* Netscape Communications Corporation.
|
||||
* Portions created by the Initial Developer are Copyright (C) 2001
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the MPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
namespace UniversalDetector.Core
|
||||
{
|
||||
/// <summary>
/// Prober that feeds bytes through a UTF-8 coding state machine and counts
/// the multi-byte characters it completes; the confidence grows with that
/// count.
/// </summary>
/// <remarks>
/// <c>state</c> and <c>SHORTCUT_THRESHOLD</c> are not declared in this class;
/// presumably they are inherited from <c>CharsetProber</c> (confirm there).
/// </remarks>
public class UTF8Prober : CharsetProber
{
    // Factor by which the "unlikely to be UTF-8" estimate shrinks for every
    // multi-byte character that has been seen.
    private const float ONE_CHAR_PROB = 0.50f;

    // Number of completed characters that were at least two bytes long.
    private int numOfMBChar;

    private CodingStateMachine codingSM;

    /// <summary>
    /// Creates the state machine for the UTF-8 model and resets the prober.
    /// </summary>
    public UTF8Prober()
    {
        codingSM = new CodingStateMachine(new UTF8SMModel());
        Reset();
    }

    /// <summary>
    /// Name of the charset this prober recognises.
    /// </summary>
    /// <returns>always "UTF-8"</returns>
    public override string GetCharsetName()
    {
        return "UTF-8";
    }

    /// <summary>
    /// Resets the state machine, clears the multi-byte counter and puts the
    /// prober back into the Detecting state.
    /// </summary>
    public override void Reset()
    {
        codingSM.Reset();
        numOfMBChar = 0;
        state = ProbingState.Detecting;
    }

    /// <summary>
    /// Runs <paramref name="len"/> bytes, starting at <paramref name="offset"/>,
    /// through the state machine.
    /// </summary>
    /// <param name="buf">input buffer</param>
    /// <param name="offset">index of the first byte to examine</param>
    /// <param name="len">number of bytes to examine</param>
    /// <returns>the prober state after the bytes have been processed</returns>
    public override ProbingState HandleData(byte[] buf, int offset, int len)
    {
        int end = offset + len;

        for (int pos = offset; pos < end; pos++)
        {
            int smState = codingSM.NextState(buf[pos]);

            if (smState == SMModel.ERROR)
            {
                // The machine reported an error: stop scanning.
                state = ProbingState.NotMe;
                break;
            }
            else if (smState == SMModel.ITSME)
            {
                state = ProbingState.FoundIt;
                break;
            }
            else if (smState == SMModel.START && codingSM.CurrentCharLen >= 2)
            {
                // Back at START after a character of two or more bytes.
                numOfMBChar++;
            }
        }

        // Still undecided: promote to FoundIt once the confidence is high enough.
        if (state == ProbingState.Detecting && GetConfidence() > SHORTCUT_THRESHOLD)
        {
            state = ProbingState.FoundIt;
        }

        return state;
    }

    /// <summary>
    /// Confidence derived from the number of multi-byte characters seen:
    /// 0.99 once six or more were counted, otherwise
    /// 1 - 0.99 * ONE_CHAR_PROB^count.
    /// </summary>
    /// <returns>the current confidence value</returns>
    public override float GetConfidence()
    {
        if (numOfMBChar >= 6)
        {
            return 0.99f;
        }

        float unlike = 0.99f;
        for (int remaining = numOfMBChar; remaining > 0; remaining--)
        {
            unlike *= ONE_CHAR_PROB;
        }

        float confidence = 1.0f - unlike;
        return confidence;
    }
}
|
||||
}
|
@ -0,0 +1,257 @@
|
||||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Mozilla Public License Version
|
||||
* 1.1 (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/MPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is Mozilla Universal charset detector code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is
|
||||
* Netscape Communications Corporation.
|
||||
* Portions created by the Initial Developer are Copyright (C) 2001
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the MPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
namespace UniversalDetector.Core
|
||||
{
|
||||
|
||||
/// <summary>
/// Classification of the bytes seen so far by the detector.
/// </summary>
enum InputState
{
    /// <summary>Initial state of the detector.</summary>
    PureASCII = 0,

    /// <summary>An escape character or the HZ "~{" sequence was found.</summary>
    EscASCII = 1,

    /// <summary>A non-ASCII (high-bit) byte was found.</summary>
    Highbyte = 2
}
|
||||
|
||||
/// <summary>
/// Core of the charset detector. Bytes are pushed in through
/// <see cref="Feed"/>; the detector first looks for a byte order mark, then
/// classifies the input (pure ASCII, ASCII with escape sequences, or
/// high-byte data) and forwards the data to the matching probers.
/// <see cref="DataEnd"/> reports the final decision through
/// <see cref="Report"/>.
/// </summary>
public abstract class UniversalDetector
{
    protected const int FILTER_CHINESE_SIMPLIFIED = 1;
    protected const int FILTER_CHINESE_TRADITIONAL = 2;
    protected const int FILTER_JAPANESE = 4;
    protected const int FILTER_KOREAN = 8;
    protected const int FILTER_NON_CJK = 16;
    protected const int FILTER_ALL = 31;

    protected static int FILTER_CHINESE =
        FILTER_CHINESE_SIMPLIFIED | FILTER_CHINESE_TRADITIONAL;

    protected static int FILTER_CJK =
        FILTER_JAPANESE | FILTER_KOREAN | FILTER_CHINESE_SIMPLIFIED
        | FILTER_CHINESE_TRADITIONAL;

    protected const float SHORTCUT_THRESHOLD = 0.95f;

    // DataEnd only reports a prober whose confidence exceeds this value.
    protected const float MINIMUM_THRESHOLD = 0.20f;

    // ESC control character (octal 033) that introduces escape sequences.
    private const byte EscapeByte = 0x1B;

    internal InputState inputState;

    // True until the first call to Feed after construction or Reset.
    protected bool start;

    // True once Feed has been called with len > 0.
    protected bool gotData;

    // True once a charset has been determined; further Feed calls are ignored.
    protected bool done;

    // Last ASCII-range byte seen by Feed; used to spot the HZ "~{" pair.
    protected byte lastChar;

    protected int bestGuess;
    protected const int PROBERS_NUM = 3;

    // Stored by the constructor; not read anywhere else in this class.
    protected int languageFilter;

    protected CharsetProber[] charsetProbers = new CharsetProber[PROBERS_NUM];
    protected CharsetProber escCharsetProber;
    protected string detectedCharset;

    /// <summary>
    /// Initialises the detector in its start state.
    /// </summary>
    /// <param name="languageFilter">language filter value to store (see the FILTER_* constants)</param>
    public UniversalDetector(int languageFilter)
    {
        this.start = true;
        this.inputState = InputState.PureASCII;
        this.lastChar = 0x00;
        this.bestGuess = -1;
        this.languageFilter = languageFilter;
    }

    /// <summary>
    /// Feeds a block of bytes to the detector. Does nothing once the
    /// detector is done.
    /// </summary>
    /// <param name="buf">input buffer</param>
    /// <param name="offset">index of the first byte to examine</param>
    /// <param name="len">number of bytes to examine</param>
    public virtual void Feed(byte[] buf, int offset, int len)
    {
        if (done)
        {
            return;
        }

        if (len > 0)
        {
            gotData = true;
        }

        // If the data starts with a BOM, we know it is UTF. This is only
        // attempted on the first block and only when it holds more than
        // three bytes.
        if (start)
        {
            start = false;
            if (len > 3)
            {
                string bomCharset = DetectBom(buf, offset);
                if (bomCharset != null)
                {
                    detectedCharset = bomCharset;
                }
            }

            if (detectedCharset != null)
            {
                done = true;
                return;
            }
        }

        // Scan exactly the window [offset, offset + len) that is later
        // handed to the probers.
        int end = offset + len;
        for (int i = offset; i < end; i++)
        {
            byte current = buf[i];

            // other than 0xA0, if every other character is ascii, the page is ascii
            if ((current & 0x80) != 0 && current != 0xA0)
            {
                // we got a non-ascii byte (high-byte)
                if (inputState != InputState.Highbyte)
                {
                    inputState = InputState.Highbyte;

                    // kill EscCharsetProber if it is active
                    if (escCharsetProber != null)
                    {
                        escCharsetProber = null;
                    }

                    // start multibyte and singlebyte charset probers
                    if (charsetProbers[0] == null)
                    {
                        charsetProbers[0] = new MBCSGroupProber();
                    }
                    if (charsetProbers[1] == null)
                    {
                        charsetProbers[1] = new SBCSGroupProber();
                    }
                    if (charsetProbers[2] == null)
                    {
                        charsetProbers[2] = new Latin1Prober();
                    }
                }
            }
            else
            {
                if (inputState == InputState.PureASCII &&
                    (current == EscapeByte || (current == 0x7B && lastChar == 0x7E)))
                {
                    // found escape character or HZ "~{"
                    inputState = InputState.EscASCII;
                }
                lastChar = current;
            }
        }

        ProbingState st = ProbingState.NotMe;

        switch (inputState)
        {
            case InputState.EscASCII:
                if (escCharsetProber == null)
                {
                    escCharsetProber = new EscCharsetProber();
                }
                st = escCharsetProber.HandleData(buf, offset, len);
                if (st == ProbingState.FoundIt)
                {
                    done = true;
                    detectedCharset = escCharsetProber.GetCharsetName();
                }
                break;

            case InputState.Highbyte:
                for (int i = 0; i < PROBERS_NUM; i++)
                {
                    if (charsetProbers[i] != null)
                    {
                        st = charsetProbers[i].HandleData(buf, offset, len);
#if DEBUG
                        charsetProbers[i].DumpStatus();
#endif
                        if (st == ProbingState.FoundIt)
                        {
                            done = true;
                            detectedCharset = charsetProbers[i].GetCharsetName();
                            return;
                        }
                    }
                }
                break;

            default:
                // pure ascii
                break;
        }
    }

    /// <summary>
    /// Matches the four bytes starting at <paramref name="offset"/> against
    /// the known byte order marks. The caller must guarantee that at least
    /// four bytes are available.
    /// </summary>
    /// <param name="buf">input buffer</param>
    /// <param name="offset">index of the first byte of the data</param>
    /// <returns>the charset name implied by the BOM, or null if none matched</returns>
    private static string DetectBom(byte[] buf, int offset)
    {
        byte b0 = buf[offset];
        byte b1 = buf[offset + 1];
        byte b2 = buf[offset + 2];
        byte b3 = buf[offset + 3];

        switch (b0)
        {
            case 0xEF:
                if (b1 == 0xBB && b2 == 0xBF)
                {
                    return "UTF-8";
                }
                break;

            case 0xFE:
                if (b1 == 0xFF && b2 == 0x00 && b3 == 0x00)
                {
                    // FE FF 00 00 UCS-4, unusual octet order BOM (3412)
                    return "X-ISO-10646-UCS-4-3412";
                }
                if (b1 == 0xFF)
                {
                    return "UTF-16BE";
                }
                break;

            case 0x00:
                if (b1 == 0x00 && b2 == 0xFE && b3 == 0xFF)
                {
                    return "UTF-32BE";
                }
                if (b1 == 0x00 && b2 == 0xFF && b3 == 0xFE)
                {
                    // 00 00 FF FE UCS-4, unusual octet order BOM (2143)
                    return "X-ISO-10646-UCS-4-2143";
                }
                break;

            case 0xFF:
                if (b1 == 0xFE && b2 == 0x00 && b3 == 0x00)
                {
                    return "UTF-32LE";
                }
                if (b1 == 0xFE)
                {
                    return "UTF-16LE";
                }
                break;
        }

        return null;
    }

    /// <summary>
    /// Notify detector that no further data is available.
    /// </summary>
    public virtual void DataEnd()
    {
        if (!gotData)
        {
            // we haven't got any data yet, return immediately
            // caller program sometimes call DataEnd before anything has
            // been sent to detector
            return;
        }

        if (detectedCharset != null)
        {
            done = true;
            Report(detectedCharset, 1.0f);
            return;
        }

        if (inputState == InputState.Highbyte)
        {
            // Pick the prober with the highest confidence.
            float proberConfidence = 0.0f;
            float maxProberConfidence = 0.0f;
            int maxProber = 0;
            for (int i = 0; i < PROBERS_NUM; i++)
            {
                if (charsetProbers[i] != null)
                {
                    proberConfidence = charsetProbers[i].GetConfidence();
                    if (proberConfidence > maxProberConfidence)
                    {
                        maxProberConfidence = proberConfidence;
                        maxProber = i;
                    }
                }
            }

            // Nothing is reported when even the best prober is not confident enough.
            if (maxProberConfidence > MINIMUM_THRESHOLD)
            {
                Report(charsetProbers[maxProber].GetCharsetName(), maxProberConfidence);
            }
        }
        else if (inputState == InputState.PureASCII)
        {
            Report("ASCII", 1.0f);
        }
    }

    /// <summary>
    /// Clear internal state of charset detector.
    /// In the original interface this method is protected.
    /// </summary>
    public virtual void Reset()
    {
        done = false;
        start = true;
        detectedCharset = null;
        gotData = false;
        bestGuess = -1;
        inputState = InputState.PureASCII;
        lastChar = 0x00;

        if (escCharsetProber != null)
        {
            escCharsetProber.Reset();
        }

        for (int i = 0; i < PROBERS_NUM; i++)
        {
            if (charsetProbers[i] != null)
            {
                charsetProbers[i].Reset();
            }
        }
    }

    /// <summary>
    /// Called by <see cref="DataEnd"/> with the chosen charset name and its
    /// confidence.
    /// </summary>
    /// <param name="charset">name of the detected charset</param>
    /// <param name="confidence">confidence of the detection</param>
    protected abstract void Report(string charset, float confidence);
}
|
||||
}
|
@ -0,0 +1,75 @@
|
||||
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
||||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Mozilla Public License Version
|
||||
* 1.1 (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/MPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is mozilla.org code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is
|
||||
* Netscape Communications Corporation.
|
||||
* Portions created by the Initial Developer are Copyright (C) 1998
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either of the GNU General Public License Version 2 or later (the "GPL"),
|
||||
* or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the MPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
namespace UniversalDetector
|
||||
{
|
||||
/// <summary>
/// Indicates how confident the detection module is about the result it
/// returned.
/// </summary>
public enum DetectionConfidence
{
    /// <summary>
    /// The detector has not found an answer yet based on the data it has
    /// received.
    /// </summary>
    NoAnswerYet = 0,

    /// <summary>
    /// The answer the detector returned is the best one within the knowledge
    /// of the detector. In other words, the tests for all other candidates
    /// failed.
    /// For example, the (Shift_JIS/EUC-JP/ISO-2022-JP) detection module may
    /// return this with the answer "Shift_JIS" if it receives bytes &gt; 0x80
    /// (which make the ISO-2022-JP test fail) and byte 0x82 (which makes the
    /// EUC-JP test fail).
    /// </summary>
    BestAnswer = 1,

    /// <summary>
    /// The detector is 100% sure about the answer.
    ///
    /// Example 1: the Shift_JIS/ISO-2022-JP/EUC-JP detector returns this
    /// with ISO-2022-JP when it hits one of the following ESC sequences:
    ///     ESC ( J
    ///     ESC $ @
    ///     ESC $ B
    ///
    /// Example 2: a detector which can detect UCS2 returns this with UCS2
    /// when the first 2 bytes are a BOM mark.
    ///
    /// Example 3: the Korean detector returns ISO-2022-KR when it hits
    /// ESC $ ) C
    /// </summary>
    SureAnswer = 2,

    /// <summary>
    /// Not described in the original notes; presumably none of the
    /// candidates matched (to be confirmed).
    /// </summary>
    NoAnswerMatch = 3
}
|
||||
}
|
@ -0,0 +1,88 @@
|
||||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Mozilla Public License Version
|
||||
* 1.1 (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/MPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is Mozilla Universal charset detector code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is
|
||||
* Netscape Communications Corporation.
|
||||
* Portions created by the Initial Developer are Copyright (C) 2001
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the MPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
using System.IO;
|
||||
|
||||
namespace UniversalDetector
|
||||
{
|
||||
/// <summary>
/// Contract of a charset detector: bytes are fed in, the end of the data is
/// signalled, and the detected charset and its confidence are read back.
/// </summary>
public interface ICharsetDetector
{
    // ---- result ----

    /// <summary>
    /// Gets the detected charset. May be null.
    /// </summary>
    string Charset { get; }

    /// <summary>
    /// Gets the confidence of the detected charset, if there is one.
    /// </summary>
    float Confidence { get; }

    /// <summary>
    /// Tells whether the detector has found a result and is sure about it.
    /// </summary>
    /// <returns>true if the detector has detected the encoding</returns>
    bool IsDone();

    // ---- input ----

    /// <summary>
    /// Passes a block of bytes to the detector.
    /// </summary>
    /// <param name="buf">input buffer</param>
    /// <param name="offset">offset into buffer</param>
    /// <param name="len">number of available bytes</param>
    void Feed(byte[] buf, int offset, int len);

    /// <summary>
    /// Passes a stream of bytes to the detector.
    /// </summary>
    /// <param name="stream">an input stream</param>
    void Feed(Stream stream);

    // ---- lifecycle ----

    /// <summary>
    /// Tells the detector that there is no more data and that it must make
    /// its decision.
    /// </summary>
    void DataEnd();

    /// <summary>
    /// Resets the state of the detector.
    /// </summary>
    void Reset();
}
|
||||
}
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in new issue