|
|
@ -38,12 +38,11 @@
|
|
|
|
namespace UniversalDetector.Core
|
|
|
|
namespace UniversalDetector.Core
|
|
|
|
{
|
|
|
|
{
|
|
|
|
/// <summary>
|
|
|
|
/// <summary>
|
|
|
|
/// Base class for the Character Distribution Method, used for
|
|
|
|
/// Base class for the Character Distribution Method, used for
|
|
|
|
/// the CJK encodings
|
|
|
|
/// the CJK encodings
|
|
|
|
/// </summary>
|
|
|
|
/// </summary>
|
|
|
|
public abstract class CharDistributionAnalyser
|
|
|
|
public abstract class CharDistributionAnalyser
|
|
|
|
{
|
|
|
|
{
|
|
|
|
|
|
|
|
|
|
|
|
protected const float SURE_YES = 0.99f;
|
|
|
|
protected const float SURE_YES = 0.99f;
|
|
|
|
protected const float SURE_NO = 0.01f;
|
|
|
|
protected const float SURE_NO = 0.01f;
|
|
|
|
protected const int MINIMUM_DATA_THRESHOLD = 4;
|
|
|
|
protected const int MINIMUM_DATA_THRESHOLD = 4;
|
|
|
@ -57,19 +56,19 @@ namespace UniversalDetector.Core
|
|
|
|
|
|
|
|
|
|
|
|
// Total characters encountered.
|
|
|
|
// Total characters encountered.
|
|
|
|
protected int totalChars;
|
|
|
|
protected int totalChars;
|
|
|
|
|
|
|
|
|
|
|
|
// Mapping table to get frequency order from char order (get from GetOrder())
|
|
|
|
// Mapping table to get frequency order from char order (get from GetOrder())
|
|
|
|
protected int[] charToFreqOrder;
|
|
|
|
protected int[] charToFreqOrder;
|
|
|
|
|
|
|
|
|
|
|
|
// Size of above table
|
|
|
|
// Size of above table
|
|
|
|
protected int tableSize;
|
|
|
|
protected int tableSize;
|
|
|
|
|
|
|
|
|
|
|
|
// This constant value varies from language to language; it is used
|
|
|
|
// This constant value varies from language to language; it is used
|
|
|
|
// in calculating confidence.
|
|
|
|
// in calculating confidence.
|
|
|
|
protected float typicalDistributionRatio;
|
|
|
|
protected float typicalDistributionRatio;
|
|
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Initialises a new analyser by delegating to <see cref="Reset"/>.
/// </summary>
/// <remarks>
/// Reset is virtual, so an override in a derived class is what runs here.
/// </remarks>
public CharDistributionAnalyser()
{
    this.Reset();
}
|
|
|
|
|
|
|
|
|
|
|
@ -77,10 +76,10 @@ namespace UniversalDetector.Core
|
|
|
|
/// Feed a block of data and do distribution analysis
|
|
|
|
/// Feed a block of data and do distribution analysis
|
|
|
|
/// </summary>
|
|
|
|
/// </summary>
|
|
|
|
/// </param>
|
|
|
|
/// </param>
|
|
|
|
//public abstract void HandleData(byte[] buf, int offset, int len);
|
|
|
|
//public abstract void HandleData(byte[] buf, int offset, int len);
|
|
|
|
|
|
|
|
|
|
|
|
/// <summary>
|
|
|
|
/// <summary>
|
|
|
|
/// We do not handle a character based on its original encoding string, but
|
|
|
|
/// We do not handle a character based on its original encoding string, but
|
|
|
|
/// convert this encoding string to a number, here called order.
|
|
|
|
/// convert this encoding string to a number, here called order.
|
|
|
|
/// This allows multiple encodings of a language to share one frequency table
|
|
|
|
/// This allows multiple encodings of a language to share one frequency table
|
|
|
|
/// </summary>
|
|
|
|
/// </summary>
|
|
|
@ -88,9 +87,9 @@ namespace UniversalDetector.Core
|
|
|
|
/// <param name="offset"></param>
|
|
|
|
/// <param name="offset"></param>
|
|
|
|
/// <returns></returns>
|
|
|
|
/// <returns></returns>
|
|
|
|
public abstract int GetOrder(byte[] buf, int offset);
|
|
|
|
public abstract int GetOrder(byte[] buf, int offset);
|
|
|
|
|
|
|
|
|
|
|
|
/// <summary>
|
|
|
|
/// <summary>
|
|
|
|
/// Feed a character with known length
|
|
|
|
/// Feed a character with known length
|
|
|
|
/// </summary>
|
|
|
|
/// </summary>
|
|
|
|
/// <param name="buf">A <see cref="System.Byte"/></param>
|
|
|
|
/// <param name="buf">A <see cref="System.Byte"/></param>
|
|
|
|
/// <param name="offset">buf offset</param>
|
|
|
|
/// <param name="offset">buf offset</param>
|
|
|
@ -107,13 +106,13 @@ namespace UniversalDetector.Core
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Returns the analyser to its starting state: both character counters
/// are zeroed and the done flag is cleared.
/// </summary>
public virtual void Reset()
{
    freqChars = 0;
    totalChars = 0;
    done = false;
}
|
|
|
|
|
|
|
|
|
|
|
|
/// <summary>
|
|
|
|
/// <summary>
|
|
|
|
/// Return confidence based on received data
|
|
|
|
/// Return confidence based on received data
|
|
|
|
/// </summary>
|
|
|
|
/// </summary>
|
|
|
@ -133,16 +132,16 @@ namespace UniversalDetector.Core
|
|
|
|
//normalize confidence, (we don't want to be 100% sure)
|
|
|
|
//normalize confidence, (we don't want to be 100% sure)
|
|
|
|
return SURE_YES;
|
|
|
|
return SURE_YES;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Reports whether enough characters have been seen to draw a conclusion.
/// Charset detection does not need the whole input; a certain amount of
/// data is enough.
/// </summary>
/// <returns>
/// true when totalChars is strictly greater than ENOUGH_DATA_THRESHOLD.
/// </returns>
public bool GotEnoughData()
{
    bool enough = ENOUGH_DATA_THRESHOLD < totalChars;
    return enough;
}
|
|
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
public class GB18030DistributionAnalyser : CharDistributionAnalyser
|
|
|
|
public class GB18030DistributionAnalyser : CharDistributionAnalyser
|
|
|
|
{
|
|
|
|
{
|
|
|
|
// GB2312 most frequently used character table
|
|
|
|
// GB2312 most frequently used character table
|
|
|
@ -155,7 +154,7 @@ namespace UniversalDetector.Core
|
|
|
|
*
|
|
|
|
*
|
|
|
|
* Ideal Distribution Ratio = 0.79135/(1-0.79135) = 3.79
|
|
|
|
* Ideal Distribution Ratio = 0.79135/(1-0.79135) = 3.79
|
|
|
|
* Random Distribution Ratio = 512 / (3755 - 512) = 0.157
|
|
|
|
* Random Distribution Ratio = 512 / (3755 - 512) = 0.157
|
|
|
|
*
|
|
|
|
*
|
|
|
|
* Typical Distribution Ratio about 25% of Ideal one, still much higher than RDR
|
|
|
|
* Typical Distribution Ratio about 25% of Ideal one, still much higher than RDR
|
|
|
|
*****************************************************************************/
|
|
|
|
*****************************************************************************/
|
|
|
|
|
|
|
|
|
|
|
@ -400,8 +399,8 @@ namespace UniversalDetector.Core
|
|
|
|
381,1638,4592,1020, 516,3214, 458, 947,4575,1432, 211,1514,2926,1865,2142, 189,
|
|
|
|
381,1638,4592,1020, 516,3214, 458, 947,4575,1432, 211,1514,2926,1865,2142, 189,
|
|
|
|
852,1221,1400,1486, 882,2299,4036, 351, 28,1122, 700,6479,6480,6481,6482,6483, //last 512
|
|
|
|
852,1221,1400,1486, 882,2299,4036, 351, 28,1122, 700,6479,6480,6481,6482,6483, //last 512
|
|
|
|
|
|
|
|
|
|
|
|
/***************************************************************************************
|
|
|
|
/***************************************************************************************
|
|
|
|
*Everything below is of no interest for detection purpose *
|
|
|
|
*Everything below is of no interest for detection purpose *
|
|
|
|
***************************************************************************************
|
|
|
|
***************************************************************************************
|
|
|
|
|
|
|
|
|
|
|
|
5508,6484,3900,3414,3974,4441,4024,3537,4037,5628,5099,3633,6485,3148,6486,3636,
|
|
|
|
5508,6484,3900,3414,3974,4441,4024,3537,4037,5628,5099,3633,6485,3148,6486,3636,
|
|
|
@ -601,7 +600,7 @@ namespace UniversalDetector.Core
|
|
|
|
tableSize = GB2312_TABLE_SIZE;
|
|
|
|
tableSize = GB2312_TABLE_SIZE;
|
|
|
|
typicalDistributionRatio = GB2312_TYPICAL_DISTRIBUTION_RATIO;
|
|
|
|
typicalDistributionRatio = GB2312_TYPICAL_DISTRIBUTION_RATIO;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/// <summary>
|
|
|
|
/// <summary>
|
|
|
|
/// for GB2312 encoding, we are interested
|
|
|
|
/// for GB2312 encoding, we are interested
|
|
|
|
/// first byte range: 0xb0 -- 0xfe
|
|
|
|
/// first byte range: 0xb0 -- 0xfe
|
|
|
@ -609,20 +608,20 @@ namespace UniversalDetector.Core
|
|
|
|
/// no validation needed here. State machine has done that
|
|
|
|
/// no validation needed here. State machine has done that
|
|
|
|
/// </summary>
|
|
|
|
/// </summary>
|
|
|
|
/// <returns></returns>
|
|
|
|
/// <returns></returns>
|
|
|
|
public override int GetOrder(byte[] buf, int offset)
{
    // Only a first byte of 0xB0 or above paired with a second byte of
    // 0xA1 or above is mapped; anything else is reported as -1. The
    // second byte is not read when the first byte already fails.
    if (buf[offset] < 0xB0 || buf[offset + 1] < 0xA1)
    {
        return -1;
    }

    // 94 slots per first-byte row, counted from 0xB0 / 0xA1.
    int row = buf[offset] - 0xB0;
    int column = buf[offset + 1] - 0xA1;
    return 94 * row + column;
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
public class EUCTWDistributionAnalyser : CharDistributionAnalyser
|
|
|
|
public class EUCTWDistributionAnalyser : CharDistributionAnalyser
|
|
|
|
{
|
|
|
|
{
|
|
|
|
// EUCTW frequency table
|
|
|
|
// EUCTW frequency table
|
|
|
|
// Converted from big5 work
|
|
|
|
// Converted from big5 work
|
|
|
|
// by Taiwan's Mandarin Promotion Council
|
|
|
|
// by Taiwan's Mandarin Promotion Council
|
|
|
|
// <http://www.edu.tw:81/mandr/>
|
|
|
|
// <http://www.edu.tw:81/mandr/>
|
|
|
|
/******************************************************************************
|
|
|
|
/******************************************************************************
|
|
|
|
* 128 --> 0.42261
|
|
|
|
* 128 --> 0.42261
|
|
|
@ -633,7 +632,7 @@ namespace UniversalDetector.Core
|
|
|
|
*
|
|
|
|
*
|
|
|
|
* Ideal Distribution Ratio = 0.74851/(1-0.74851) =2.98
|
|
|
|
* Ideal Distribution Ratio = 0.74851/(1-0.74851) =2.98
|
|
|
|
* Random Distribution Ratio = 512/(5401-512)=0.105
|
|
|
|
* Random Distribution Ratio = 512/(5401-512)=0.105
|
|
|
|
*
|
|
|
|
*
|
|
|
|
* Typical Distribution Ratio about 25% of Ideal one, still much higher than RDR
|
|
|
|
* Typical Distribution Ratio about 25% of Ideal one, still much higher than RDR
|
|
|
|
*****************************************************************************/
|
|
|
|
*****************************************************************************/
|
|
|
|
|
|
|
|
|
|
|
@ -979,8 +978,8 @@ namespace UniversalDetector.Core
|
|
|
|
890,3614,3864,8110,1877,3732,3402,8111,2183,2353,3403,1652,8112,8113,8114, 941, // 8086
|
|
|
|
890,3614,3864,8110,1877,3732,3402,8111,2183,2353,3403,1652,8112,8113,8114, 941, // 8086
|
|
|
|
2294, 208,3499,4057,2019, 330,4294,3865,2892,2492,3733,4295,8115,8116,8117,8118, // 8102
|
|
|
|
2294, 208,3499,4057,2019, 330,4294,3865,2892,2492,3733,4295,8115,8116,8117,8118, // 8102
|
|
|
|
|
|
|
|
|
|
|
|
/***************************************************************************************
|
|
|
|
/***************************************************************************************
|
|
|
|
*Everything below is of no interest for detection purpose *
|
|
|
|
*Everything below is of no interest for detection purpose *
|
|
|
|
***************************************************************************************
|
|
|
|
***************************************************************************************
|
|
|
|
|
|
|
|
|
|
|
|
2515,1613,4582,8119,3312,3866,2516,8120,4058,8121,1637,4059,2466,4583,3867,8122, // 8118
|
|
|
|
2515,1613,4582,8119,3312,3866,2516,8120,4058,8121,1637,4059,2466,4583,3867,8122, // 8118
|
|
|
@ -1022,7 +1021,7 @@ namespace UniversalDetector.Core
|
|
|
|
8678,8679,8680,8681,8682,8683,8684,8685,8686,8687,8688,8689,8690,8691,8692,8693, // 8694
|
|
|
|
8678,8679,8680,8681,8682,8683,8684,8685,8686,8687,8688,8689,8690,8691,8692,8693, // 8694
|
|
|
|
8694,8695,8696,8697,8698,8699,8700,8701,8702,8703,8704,8705,8706,8707,8708,8709, // 8710
|
|
|
|
8694,8695,8696,8697,8698,8699,8700,8701,8702,8703,8704,8705,8706,8707,8708,8709, // 8710
|
|
|
|
8710,8711,8712,8713,8714,8715,8716,8717,8718,8719,8720,8721,8722,8723,8724,8725, // 8726
|
|
|
|
8710,8711,8712,8713,8714,8715,8716,8717,8718,8719,8720,8721,8722,8723,8724,8725, // 8726
|
|
|
|
8726,8727,8728,8729,8730,8731,8732,8733,8734,8735,8736,8737,8738,8739,8740,8741, // 8742 //13973
|
|
|
|
8726,8727,8728,8729,8730,8731,8732,8733,8734,8735,8736,8737,8738,8739,8740,8741, // 8742 //13973
|
|
|
|
****************************************************************************************/
|
|
|
|
****************************************************************************************/
|
|
|
|
};
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
@ -1038,15 +1037,15 @@ namespace UniversalDetector.Core
|
|
|
|
/// second byte range: 0xa1 -- 0xfe
|
|
|
|
/// second byte range: 0xa1 -- 0xfe
|
|
|
|
/// no validation needed here. State machine has done that
|
|
|
|
/// no validation needed here. State machine has done that
|
|
|
|
/// </summary>
|
|
|
|
/// </summary>
|
|
|
|
public override int GetOrder(byte[] buf, int offset)
{
    int lead = buf[offset];

    // First bytes below 0xC4 are not mapped; the second byte is not read.
    if (lead < 0xC4)
    {
        return -1;
    }

    // 94 slots per first-byte row, counted from 0xC4 / 0xA1.
    int trail = buf[offset + 1];
    return 94 * (lead - 0xC4) + (trail - 0xA1);
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
public class EUCKRDistributionAnalyser : CharDistributionAnalyser
|
|
|
|
public class EUCKRDistributionAnalyser : CharDistributionAnalyser
|
|
|
|
{
|
|
|
|
{
|
|
|
|
// Sampling from about 20M text materials include literature and computer technology
|
|
|
|
// Sampling from about 20M text materials include literature and computer technology
|
|
|
@ -1215,8 +1214,8 @@ namespace UniversalDetector.Core
|
|
|
|
2629,2630,2631, 924, 648, 863, 603,2632,2633, 934,1540, 864, 865,2634, 642,1042,
|
|
|
|
2629,2630,2631, 924, 648, 863, 603,2632,2633, 934,1540, 864, 865,2634, 642,1042,
|
|
|
|
670,1190,2635,2636,2637,2638, 168,2639, 652, 873, 542,1054,1541,2640,2641,2642, //512, 256
|
|
|
|
670,1190,2635,2636,2637,2638, 168,2639, 652, 873, 542,1054,1541,2640,2641,2642, //512, 256
|
|
|
|
|
|
|
|
|
|
|
|
/***************************************************************************************
|
|
|
|
/***************************************************************************************
|
|
|
|
* Everything below is of no interest for detection purpose
|
|
|
|
* Everything below is of no interest for detection purpose *
|
|
|
|
***************************************************************************************
|
|
|
|
***************************************************************************************
|
|
|
|
|
|
|
|
|
|
|
|
2643,2644,2645,2646,2647,2648,2649,2650,2651,2652,2653,2654,2655,2656,2657,2658,
|
|
|
|
2643,2644,2645,2646,2647,2648,2649,2650,2651,2652,2653,2654,2655,2656,2657,2658,
|
|
|
@ -1619,32 +1618,32 @@ namespace UniversalDetector.Core
|
|
|
|
8704,8705,8706,8707,8708,8709,8710,8711,8712,8713,8714,8715,8716,8717,8718,8719,
|
|
|
|
8704,8705,8706,8707,8708,8709,8710,8711,8712,8713,8714,8715,8716,8717,8718,8719,
|
|
|
|
8720,8721,8722,8723,8724,8725,8726,8727,8728,8729,8730,8731,8732,8733,8734,8735,
|
|
|
|
8720,8721,8722,8723,8724,8725,8726,8727,8728,8729,8730,8731,8732,8733,8734,8735,
|
|
|
|
8736,8737,8738,8739,8740,8741 */ };
|
|
|
|
8736,8737,8738,8739,8740,8741 */ };
|
|
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Points the analyser at the EUCKR frequency-order table, its table
/// size and its typical distribution ratio.
/// </summary>
public EUCKRDistributionAnalyser()
{
    this.typicalDistributionRatio = EUCKR_TYPICAL_DISTRIBUTION_RATIO;
    this.tableSize = EUCKR_TABLE_SIZE;
    this.charToFreqOrder = EUCKR_CHAR2FREQ_ORDER;
}
|
|
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Converts a two-byte character into its order number.
/// first byte range: 0xb0 -- 0xfe
/// second byte range: 0xa1 -- 0xfe
/// No validation is needed here; the state machine has done that.
/// </summary>
/// <param name="buf">buffer holding the character</param>
/// <param name="offset">index in buf of the character's first byte</param>
/// <returns>
/// 94 * (first - 0xB0) + (second - 0xA1), or -1 when the first byte is
/// below 0xB0
/// </returns>
public override int GetOrder(byte[] buf, int offset)
{
    int lead = buf[offset];

    // The second byte is only read once the first byte qualifies.
    if (lead < 0xB0)
    {
        return -1;
    }

    int trail = buf[offset + 1];
    return 94 * (lead - 0xB0) + (trail - 0xA1);
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
public class BIG5DistributionAnalyser : CharDistributionAnalyser
|
|
|
|
public class BIG5DistributionAnalyser : CharDistributionAnalyser
|
|
|
|
{
|
|
|
|
{
|
|
|
|
// Big5 frequency table
|
|
|
|
// Big5 frequency table
|
|
|
|
// by Taiwan's Mandarin Promotion Council
|
|
|
|
// by Taiwan's Mandarin Promotion Council
|
|
|
|
// <http://www.edu.tw:81/mandr/>
|
|
|
|
// <http://www.edu.tw:81/mandr/>
|
|
|
|
/******************************************************************************
|
|
|
|
/******************************************************************************
|
|
|
|
* 128 --> 0.42261
|
|
|
|
* 128 --> 0.42261
|
|
|
@ -1655,7 +1654,7 @@ namespace UniversalDetector.Core
|
|
|
|
*
|
|
|
|
*
|
|
|
|
* Ideal Distribution Ratio = 0.74851/(1-0.74851) =2.98
|
|
|
|
* Ideal Distribution Ratio = 0.74851/(1-0.74851) =2.98
|
|
|
|
* Random Distribution Ratio = 512/(5401-512)=0.105
|
|
|
|
* Random Distribution Ratio = 512/(5401-512)=0.105
|
|
|
|
*
|
|
|
|
*
|
|
|
|
* Typical Distribution Ratio about 25% of Ideal one, still much higher than RDR
|
|
|
|
* Typical Distribution Ratio about 25% of Ideal one, still much higher than RDR
|
|
|
|
*****************************************************************************/
|
|
|
|
*****************************************************************************/
|
|
|
|
|
|
|
|
|
|
|
@ -2001,8 +2000,8 @@ namespace UniversalDetector.Core
|
|
|
|
890,3669,3943,5791,1878,3798,3439,5792,2186,2358,3440,1652,5793,5794,5795, 941, // 5360
|
|
|
|
890,3669,3943,5791,1878,3798,3439,5792,2186,2358,3440,1652,5793,5794,5795, 941, // 5360
|
|
|
|
2299, 208,3546,4161,2020, 330,4438,3944,2906,2499,3799,4439,4811,5796,5797,5798, // 5376 //last 512
|
|
|
|
2299, 208,3546,4161,2020, 330,4438,3944,2906,2499,3799,4439,4811,5796,5797,5798, // 5376 //last 512
|
|
|
|
|
|
|
|
|
|
|
|
/***************************************************************************************
|
|
|
|
/***************************************************************************************
|
|
|
|
*Everything below is of no interest for detection purpose *
|
|
|
|
*Everything below is of no interest for detection purpose *
|
|
|
|
***************************************************************************************
|
|
|
|
***************************************************************************************
|
|
|
|
|
|
|
|
|
|
|
|
2522,1613,4812,5799,3345,3945,2523,5800,4162,5801,1637,4163,2471,4813,3946,5802, // 5392
|
|
|
|
2522,1613,4812,5799,3345,3945,2523,5800,4162,5801,1637,4163,2471,4813,3946,5802, // 5392
|
|
|
@ -2545,29 +2544,29 @@ namespace UniversalDetector.Core
|
|
|
|
13968,13969,13970,13971,13972, //13973
|
|
|
|
13968,13969,13970,13971,13972, //13973
|
|
|
|
****************************************************************************************/
|
|
|
|
****************************************************************************************/
|
|
|
|
};
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Points the analyser at the BIG5 frequency-order table, its table size
/// and its typical distribution ratio.
/// </summary>
public BIG5DistributionAnalyser()
{
    this.typicalDistributionRatio = BIG5_TYPICAL_DISTRIBUTION_RATIO;
    this.tableSize = BIG5_TABLE_SIZE;
    this.charToFreqOrder = BIG5_CHAR2FREQ_ORDER;
}
|
|
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Converts a two-byte character into its order number.
/// first byte range: 0xa4 -- 0xfe
/// second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
/// No validation is needed here; the state machine has done that.
/// </summary>
/// <param name="buf">buffer holding the character</param>
/// <param name="offset">index in buf of the character's first byte</param>
/// <returns>
/// the order number, or -1 when the first byte is below 0xA4
/// </returns>
public override int GetOrder(byte[] buf, int offset)
{
    int lead = buf[offset];

    // The second byte is only read once the first byte qualifies.
    if (lead < 0xA4)
    {
        return -1;
    }

    int trail = buf[offset + 1];

    // Each first byte owns a row of 157 slots.
    int rowStart = 157 * (lead - 0xA4);

    // Second bytes 0x40..0x7e fill the first 63 slots of a row; second
    // bytes from 0xa1 upward continue after them, hence the +63.
    return trail >= 0xA1
        ? rowStart + trail - 0xA1 + 63
        : rowStart + trail - 0x40;
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
@ -2575,7 +2574,7 @@ namespace UniversalDetector.Core
|
|
|
|
{
|
|
|
|
{
|
|
|
|
//Sampling from about 20M text materials include literature and computer technology
|
|
|
|
//Sampling from about 20M text materials include literature and computer technology
|
|
|
|
// Japanese frequency table, applied to both S-JIS and EUC-JP
|
|
|
|
// Japanese frequency table, applied to both S-JIS and EUC-JP
|
|
|
|
//They are sorted in order.
|
|
|
|
//They are sorted in order.
|
|
|
|
|
|
|
|
|
|
|
|
/******************************************************************************
|
|
|
|
/******************************************************************************
|
|
|
|
* 128 --> 0.77094
|
|
|
|
* 128 --> 0.77094
|
|
|
@ -2586,8 +2585,8 @@ namespace UniversalDetector.Core
|
|
|
|
*
|
|
|
|
*
|
|
|
|
* Ideal Distribution Ratio = 0.92635 / (1-0.92635) = 12.58
|
|
|
|
* Ideal Distribution Ratio = 0.92635 / (1-0.92635) = 12.58
|
|
|
|
* Random Distribution Ratio = 512 / (2965+62+83+86-512) = 0.191
|
|
|
|
* Random Distribution Ratio = 512 / (2965+62+83+86-512) = 0.191
|
|
|
|
*
|
|
|
|
*
|
|
|
|
* Typical Distribution Ratio, 25% of IDR
|
|
|
|
* Typical Distribution Ratio, 25% of IDR
|
|
|
|
*****************************************************************************/
|
|
|
|
*****************************************************************************/
|
|
|
|
|
|
|
|
|
|
|
|
protected static float SJIS_TYPICAL_DISTRIBUTION_RATIO = 3.0f;
|
|
|
|
protected static float SJIS_TYPICAL_DISTRIBUTION_RATIO = 3.0f;
|
|
|
@ -2869,8 +2868,8 @@ namespace UniversalDetector.Core
|
|
|
|
1444,1698,2385,2251,3729,1365,2281,2235,1717,6188, 864,3841,2515, 444, 527,2767, // 4352
|
|
|
|
1444,1698,2385,2251,3729,1365,2281,2235,1717,6188, 864,3841,2515, 444, 527,2767, // 4352
|
|
|
|
2922,3625, 544, 461,6189, 566, 209,2437,3398,2098,1065,2068,3331,3626,3257,2137, // 4368 //last 512
|
|
|
|
2922,3625, 544, 461,6189, 566, 209,2437,3398,2098,1065,2068,3331,3626,3257,2137, // 4368 //last 512
|
|
|
|
|
|
|
|
|
|
|
|
/***************************************************************************************
|
|
|
|
/***************************************************************************************
|
|
|
|
*Everything below is of no interest for detection purpose *
|
|
|
|
*Everything below is of no interest for detection purpose *
|
|
|
|
***************************************************************************************
|
|
|
|
***************************************************************************************
|
|
|
|
|
|
|
|
|
|
|
|
2138,2122,3730,2888,1995,1820,1044,6190,6191,6192,6193,6194,6195,6196,6197,6198, // 4384
|
|
|
|
2138,2122,3730,2888,1995,1820,1044,6190,6191,6192,6193,6194,6195,6196,6197,6198, // 4384
|
|
|
@ -3118,31 +3117,31 @@ namespace UniversalDetector.Core
|
|
|
|
8240,8241,8242,8243,8244,8245,8246,8247,8248,8249,8250,8251,8252,8253,8254,8255, // 8256
|
|
|
|
8240,8241,8242,8243,8244,8245,8246,8247,8248,8249,8250,8251,8252,8253,8254,8255, // 8256
|
|
|
|
8256,8257,8258,8259,8260,8261,8262,8263,8264,8265,8266,8267,8268,8269,8270,8271, // 8272
|
|
|
|
8256,8257,8258,8259,8260,8261,8262,8263,8264,8265,8266,8267,8268,8269,8270,8271, // 8272
|
|
|
|
****************************************************************************************/
|
|
|
|
****************************************************************************************/
|
|
|
|
};
|
|
|
|
};
|
|
|
|
/// <summary>
/// Points the analyser at the SJIS frequency-order table, its table size
/// and its typical distribution ratio.
/// </summary>
public SJISDistributionAnalyser()
{
    this.typicalDistributionRatio = SJIS_TYPICAL_DISTRIBUTION_RATIO;
    this.tableSize = SJIS_TABLE_SIZE;
    this.charToFreqOrder = SJIS_CHAR2FREQ_ORDER;
}
|
|
|
|
|
|
|
|
|
|
|
|
/// <summary>
|
|
|
|
/// <summary>
|
|
|
|
/// first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
|
|
|
|
/// first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
|
|
|
|
/// second byte range: 0x40 -- 0x7e, 0x81 -- 0xfe
|
|
|
|
/// second byte range: 0x40 -- 0x7e, 0x81 -- 0xfe
|
|
|
|
/// no validation needed here. State machine has done that
|
|
|
|
/// no validation needed here. State machine has done that
|
|
|
|
/// </summary>
|
|
|
|
/// </summary>
|
|
|
|
public override int GetOrder(byte[] buf, int offset)
|
|
|
|
public override int GetOrder(byte[] buf, int offset)
|
|
|
|
{
|
|
|
|
{
|
|
|
|
int order = 0;
|
|
|
|
int order = 0;
|
|
|
|
|
|
|
|
|
|
|
|
if (buf[offset] >= 0x81 && buf[offset] <= 0x9F)
|
|
|
|
if (buf[offset] >= 0x81 && buf[offset] <= 0x9F)
|
|
|
|
order = 188 * (buf[offset] - 0x81);
|
|
|
|
order = 188 * (buf[offset] - 0x81);
|
|
|
|
else if (buf[offset] >= 0xE0 && buf[offset] <= 0xEF)
|
|
|
|
else if (buf[offset] >= 0xE0 && buf[offset] <= 0xEF)
|
|
|
|
order = 188 * (buf[offset] - 0xE0 + 31);
|
|
|
|
order = 188 * (buf[offset] - 0xE0 + 31);
|
|
|
|
else
|
|
|
|
else
|
|
|
|
return -1;
|
|
|
|
return -1;
|
|
|
|
order += buf[offset+1] - 0x40;
|
|
|
|
order += buf[offset+1] - 0x40;
|
|
|
|
|
|
|
|
|
|
|
|
if (buf[offset+1] > 0x7F)
|
|
|
|
if (buf[offset+1] > 0x7F)
|
|
|
|
order--;
|
|
|
|
order--;
|
|
|
|
return order;
|
|
|
|
return order;
|
|
|
@ -3154,20 +3153,18 @@ namespace UniversalDetector.Core
|
|
|
|
public EUCJPDistributionAnalyser()
{
    // Nothing to set up here: the parameterless base-class constructor is
    // invoked implicitly and this class adds no initialisation of its own.
}
|
|
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Converts a two-byte character into its order number.
/// first byte range: 0xa0 -- 0xfe
/// second byte range: 0xa1 -- 0xfe
/// No validation is needed here; the state machine has done that.
/// </summary>
/// <param name="buf">buffer holding the character</param>
/// <param name="offset">index in buf of the character's first byte</param>
/// <returns>
/// 94 * (first - 0xA1) + (second - 0xA1), or -1 when the first byte is
/// below 0xA0
/// </returns>
public override int GetOrder(byte[] buf, int offset)
{
    int lead = buf[offset];

    // The second byte is only read once the first byte qualifies.
    if (lead < 0xA0)
    {
        return -1;
    }

    // Rows are counted from 0xA1 although 0xA0 passes the check above,
    // so a first byte of exactly 0xA0 contributes a negative row term.
    int trail = buf[offset + 1];
    return 94 * (lead - 0xA1) + (trail - 0xA1);
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|