| 1 |
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ |
| 2 |
/* ***** BEGIN LICENSE BLOCK ***** |
| 3 |
* Version: MPL 1.1/GPL 2.0/LGPL 2.1 |
| 4 |
* |
| 5 |
* The contents of this file are subject to the Mozilla Public License Version |
| 6 |
* 1.1 (the "License"); you may not use this file except in compliance with |
| 7 |
* the License. You may obtain a copy of the License at |
| 8 |
* http://www.mozilla.org/MPL/ |
| 9 |
* |
| 10 |
* Software distributed under the License is distributed on an "AS IS" basis, |
| 11 |
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License |
| 12 |
* for the specific language governing rights and limitations under the |
| 13 |
* License. |
| 14 |
* |
| 15 |
* The Original Code is Mozilla Communicator client code. |
| 16 |
* |
| 17 |
* The Initial Developer of the Original Code is |
| 18 |
* Netscape Communications Corporation. |
| 19 |
* Portions created by the Initial Developer are Copyright (C) 1998 |
| 20 |
* the Initial Developer. All Rights Reserved. |
| 21 |
* |
| 22 |
* Contributor(s): |
| 23 |
* |
| 24 |
* Alternatively, the contents of this file may be used under the terms of |
| 25 |
* either the GNU General Public License Version 2 or later (the "GPL"), or |
| 26 |
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), |
| 27 |
* in which case the provisions of the GPL or the LGPL are applicable instead |
| 28 |
* of those above. If you wish to allow use of your version of this file only |
| 29 |
* under the terms of either the GPL or the LGPL, and not to allow others to |
| 30 |
* use your version of this file under the terms of the MPL, indicate your |
| 31 |
* decision by deleting the provisions above and replace them with the notice |
| 32 |
* and other provisions required by the GPL or the LGPL. If you do not delete |
| 33 |
* the provisions above, a recipient may use your version of this file under |
| 34 |
* the terms of any one of the MPL, the GPL or the LGPL. |
| 35 |
* |
| 36 |
* ***** END LICENSE BLOCK ***** */ |
| 37 |
|
| 38 |
#ifndef CharDistribution_h__ |
| 39 |
#define CharDistribution_h__ |
| 40 |
|
| 41 |
#include "nscore.h" |
| 42 |
|
| 43 |
// Threshold compared against mTotalChars by
// CharDistributionAnalysis::GotEnoughData(): that method reports true once
// strictly more than this many characters have been counted.
#define ENOUGH_DATA_THRESHOLD 1024 |
| 44 |
|
| 45 |
class CharDistributionAnalysis |
| 46 |
{ |
| 47 |
public: |
| 48 |
CharDistributionAnalysis() {Reset();} |
| 49 |
|
| 50 |
//feed a block of data and do distribution analysis |
| 51 |
void HandleData(const char* aBuf, PRUint32 aLen) {} |
| 52 |
|
| 53 |
//Feed a character with known length |
| 54 |
void HandleOneChar(const char* aStr, PRUint32 aCharLen) |
| 55 |
{ |
| 56 |
PRInt32 order; |
| 57 |
|
| 58 |
//we only care about 2-bytes character in our distribution analysis |
| 59 |
order = (aCharLen == 2) ? GetOrder(aStr) : -1; |
| 60 |
|
| 61 |
if (order >= 0) |
| 62 |
{ |
| 63 |
mTotalChars++; |
| 64 |
//order is valid |
| 65 |
if ((PRUint32)order < mTableSize) |
| 66 |
{ |
| 67 |
if (512 > mCharToFreqOrder[order]) |
| 68 |
mFreqChars++; |
| 69 |
} |
| 70 |
} |
| 71 |
} |
| 72 |
|
| 73 |
//return confidence base on existing data |
| 74 |
float GetConfidence(PRBool aIsPreferredLanguage); |
| 75 |
|
| 76 |
//Reset analyser, clear any state |
| 77 |
void Reset(void) |
| 78 |
{ |
| 79 |
mDone = PR_FALSE; |
| 80 |
mTotalChars = 0; |
| 81 |
mFreqChars = 0; |
| 82 |
} |
| 83 |
|
| 84 |
//This function is for future extension. Caller can use this function to control |
| 85 |
//analyser's behavior |
| 86 |
void SetOpion(){} |
| 87 |
|
| 88 |
//It is not necessary to receive all data to draw conclusion. For charset detection, |
| 89 |
// certain amount of data is enough |
| 90 |
PRBool GotEnoughData() {return mTotalChars > ENOUGH_DATA_THRESHOLD;} |
| 91 |
|
| 92 |
protected: |
| 93 |
//we do not handle character base on its original encoding string, but |
| 94 |
//convert this encoding string to a number, here called order. |
| 95 |
//This allow multiple encoding of a language to share one frequency table |
| 96 |
virtual PRInt32 GetOrder(const char* str) {return -1;} |
| 97 |
|
| 98 |
//If this flag is set to PR_TRUE, detection is done and conclusion has been made |
| 99 |
PRBool mDone; |
| 100 |
|
| 101 |
//The number of characters whose frequency order is less than 512 |
| 102 |
PRUint32 mFreqChars; |
| 103 |
|
| 104 |
//Total character encounted. |
| 105 |
PRUint32 mTotalChars; |
| 106 |
|
| 107 |
//Mapping table to get frequency order from char order (get from GetOrder()) |
| 108 |
const PRInt16 *mCharToFreqOrder; |
| 109 |
|
| 110 |
//Size of above table |
| 111 |
PRUint32 mTableSize; |
| 112 |
|
| 113 |
//This is a constant value varies from language to language, it is used in |
| 114 |
//calculating confidence. See my paper for further detail. |
| 115 |
float mTypicalDistributionRatio; |
| 116 |
}; |
| 117 |
|
| 118 |
|
| 119 |
class EUCTWDistributionAnalysis: public CharDistributionAnalysis |
| 120 |
{ |
| 121 |
public: |
| 122 |
EUCTWDistributionAnalysis(); |
| 123 |
protected: |
| 124 |
|
| 125 |
//for euc-TW encoding, we are interested |
| 126 |
// first byte range: 0xc4 -- 0xfe |
| 127 |
// second byte range: 0xa1 -- 0xfe |
| 128 |
//no validation needed here. State machine has done that |
| 129 |
PRInt32 GetOrder(const char* str) |
| 130 |
{ if ((unsigned char)*str >= (unsigned char)0xc4) |
| 131 |
return 94*((unsigned char)str[0]-(unsigned char)0xc4) + (unsigned char)str[1] - (unsigned char)0xa1; |
| 132 |
else |
| 133 |
return -1; |
| 134 |
} |
| 135 |
}; |
| 136 |
|
| 137 |
|
| 138 |
class EUCKRDistributionAnalysis : public CharDistributionAnalysis |
| 139 |
{ |
| 140 |
public: |
| 141 |
EUCKRDistributionAnalysis(); |
| 142 |
protected: |
| 143 |
//for euc-KR encoding, we are interested |
| 144 |
// first byte range: 0xb0 -- 0xfe |
| 145 |
// second byte range: 0xa1 -- 0xfe |
| 146 |
//no validation needed here. State machine has done that |
| 147 |
PRInt32 GetOrder(const char* str) |
| 148 |
{ if ((unsigned char)*str >= (unsigned char)0xb0) |
| 149 |
return 94*((unsigned char)str[0]-(unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1; |
| 150 |
else |
| 151 |
return -1; |
| 152 |
} |
| 153 |
}; |
| 154 |
|
| 155 |
class GB2312DistributionAnalysis : public CharDistributionAnalysis |
| 156 |
{ |
| 157 |
public: |
| 158 |
GB2312DistributionAnalysis(); |
| 159 |
protected: |
| 160 |
//for GB2312 encoding, we are interested |
| 161 |
// first byte range: 0xb0 -- 0xfe |
| 162 |
// second byte range: 0xa1 -- 0xfe |
| 163 |
//no validation needed here. State machine has done that |
| 164 |
PRInt32 GetOrder(const char* str) |
| 165 |
{ if ((unsigned char)*str >= (unsigned char)0xb0 && (unsigned char)str[1] >= (unsigned char)0xa1) |
| 166 |
return 94*((unsigned char)str[0]-(unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1; |
| 167 |
else |
| 168 |
return -1; |
| 169 |
} |
| 170 |
}; |
| 171 |
|
| 172 |
|
| 173 |
class Big5DistributionAnalysis : public CharDistributionAnalysis |
| 174 |
{ |
| 175 |
public: |
| 176 |
Big5DistributionAnalysis(); |
| 177 |
protected: |
| 178 |
//for big5 encoding, we are interested |
| 179 |
// first byte range: 0xa4 -- 0xfe |
| 180 |
// second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe |
| 181 |
//no validation needed here. State machine has done that |
| 182 |
PRInt32 GetOrder(const char* str) |
| 183 |
{ if ((unsigned char)*str >= (unsigned char)0xa4) |
| 184 |
if ((unsigned char)str[1] >= (unsigned char)0xa1) |
| 185 |
return 157*((unsigned char)str[0]-(unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0xa1 +63; |
| 186 |
else |
| 187 |
return 157*((unsigned char)str[0]-(unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0x40; |
| 188 |
else |
| 189 |
return -1; |
| 190 |
} |
| 191 |
}; |
| 192 |
|
| 193 |
class SJISDistributionAnalysis : public CharDistributionAnalysis |
| 194 |
{ |
| 195 |
public: |
| 196 |
SJISDistributionAnalysis(); |
| 197 |
protected: |
| 198 |
//for sjis encoding, we are interested |
| 199 |
// first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe |
| 200 |
// second byte range: 0x40 -- 0x7e, 0x81 -- oxfe |
| 201 |
//no validation needed here. State machine has done that |
| 202 |
PRInt32 GetOrder(const char* str) |
| 203 |
{ |
| 204 |
PRInt32 order; |
| 205 |
if ((unsigned char)*str >= (unsigned char)0x81 && (unsigned char)*str <= (unsigned char)0x9f) |
| 206 |
order = 188 * ((unsigned char)str[0]-(unsigned char)0x81); |
| 207 |
else if ((unsigned char)*str >= (unsigned char)0xe0 && (unsigned char)*str <= (unsigned char)0xef) |
| 208 |
order = 188 * ((unsigned char)str[0]-(unsigned char)0xe0 + 31); |
| 209 |
else |
| 210 |
return -1; |
| 211 |
order += (unsigned char)*(str+1) - 0x40; |
| 212 |
if ((unsigned char)str[1] > (unsigned char)0x7f) |
| 213 |
order--; |
| 214 |
return order; |
| 215 |
} |
| 216 |
}; |
| 217 |
|
| 218 |
class EUCJPDistributionAnalysis : public CharDistributionAnalysis |
| 219 |
{ |
| 220 |
public: |
| 221 |
EUCJPDistributionAnalysis(); |
| 222 |
protected: |
| 223 |
//for euc-JP encoding, we are interested |
| 224 |
// first byte range: 0xa0 -- 0xfe |
| 225 |
// second byte range: 0xa1 -- 0xfe |
| 226 |
//no validation needed here. State machine has done that |
| 227 |
PRInt32 GetOrder(const char* str) |
| 228 |
{ if ((unsigned char)*str >= (unsigned char)0xa0) |
| 229 |
return 94*((unsigned char)str[0]-(unsigned char)0xa1) + (unsigned char)str[1] - (unsigned char)0xa1; |
| 230 |
else |
| 231 |
return -1; |
| 232 |
} |
| 233 |
}; |
| 234 |
|
| 235 |
#endif //CharDistribution_h__ |