/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* ***** BEGIN LICENSE BLOCK *****
 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
 *
 * The contents of this file are subject to the Mozilla Public License Version
 * 1.1 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 *
 * The Original Code is Mozilla Universal charset detector code.
 *
 * The Initial Developer of the Original Code is
 * Netscape Communications Corporation.
 * Portions created by the Initial Developer are Copyright (C) 2001
 * the Initial Developer. All Rights Reserved.
 *
 * Contributor(s):
 *          Shy Shalom <shooshX@gmail.com>
 *
 * Alternatively, the contents of this file may be used under the terms of
 * either the GNU General Public License Version 2 or later (the "GPL"), or
 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 * in which case the provisions of the GPL or the LGPL are applicable instead
 * of those above. If you wish to allow use of your version of this file only
 * under the terms of either the GPL or the LGPL, and not to allow others to
 * use your version of this file under the terms of the MPL, indicate your
 * decision by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL or the LGPL. If you do not delete
 * the provisions above, a recipient may use your version of this file under
 * the terms of any one of the MPL, the GPL or the LGPL.
 *
 * ***** END LICENSE BLOCK ***** */
38
39
#include "nscore.h"
40
41
#include "nsUniversalDetector.h"
42
43
#include "nsMBCSGroupProber.h"
44
#include "nsSBCSGroupProber.h"
45
#include "nsEscCharsetProber.h"
46
#include "nsLatin1Prober.h"
47
48
// Builds a detector in its initial state: nothing detected, no data seen,
// input assumed to be pure ASCII, and no prober objects allocated.
//
//   aLanguageFilter  stored unchanged in mLanguageFilter; HandleData() passes
//                    it to the probers it creates and tests it against
//                    NS_FILTER_NON_CJK.
nsUniversalDetector::nsUniversalDetector(PRUint32 aLanguageFilter)
{
  mLanguageFilter = aLanguageFilter;

  // Detection result / progress flags.
  mDetectedCharset = nsnull;
  mBestGuess = -1;   // illegal value as signal
  mDone = PR_FALSE;
  mGotData = PR_FALSE;

  // Input scanning state.
  mStart = PR_TRUE;
  mInTag = PR_FALSE;
  mInputState = ePureAscii;
  mLastChar = '\0';

  // Probers are created on demand, so every slot starts out empty.
  mEscCharSetProber = nsnull;
  for (PRUint32 slot = 0; slot < NUM_OF_CHARSET_PROBERS; ++slot)
  {
    mCharSetProbers[slot] = nsnull;
  }
}
66
67
// Frees every prober this detector owns: first the slots of mCharSetProbers
// in ascending order, then the escape-sequence prober.
nsUniversalDetector::~nsUniversalDetector()
{
  // Applying delete to a null pointer does nothing, so slots that were
  // never allocated need no explicit guard.
  for (PRInt32 slot = 0; slot < NUM_OF_CHARSET_PROBERS; ++slot)
  {
    delete mCharSetProbers[slot];
  }
  delete mEscCharSetProber;
}
75
76
void 
77
nsUniversalDetector::Reset()
78
{
79
  mDone = PR_FALSE;
80
  mBestGuess = -1;   //illegal value as signal
81
  mInTag = PR_FALSE;
82
83
  mStart = PR_TRUE;
84
  mDetectedCharset = nsnull;
85
  mGotData = PR_FALSE;
86
  mInputState = ePureAscii;
87
  mLastChar = '\0';
88
89
  if (mEscCharSetProber)
90
    mEscCharSetProber->Reset();
91
92
  PRUint32 i;
93
  for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
94
    if (mCharSetProbers[i])
95
      mCharSetProbers[i]->Reset();
96
}
97
98
//---------------------------------------------------------------------
99
// NOTE(review): not referenced in the visible part of this file; presumably
// the confidence above which detection could stop early -- confirm before
// relying on it.
#define SHORTCUT_THRESHOLD      static_cast<float>(0.95)
100
// DataEnd() reports the best prober's charset only when that prober's
// confidence is strictly greater than this value.
#define MINIMUM_THRESHOLD      static_cast<float>(0.20)
101
102
// Feeds one buffer of input to the detector.
//
//   aBuf  bytes to examine; this function itself reads aBuf[i] only for
//         i < aLen (the buffer is also forwarded to the active probers).
//   aLen  number of bytes available at aBuf.
//
// Returns NS_OK, or NS_ERROR_OUT_OF_MEMORY when allocating a prober yields
// null. Once mDone is set the call is a no-op.
//
// Processing order:
//   1. The first non-empty buffer is checked for a byte order mark; a match
//      settles the charset immediately.
//   2. Every byte is scanned to update mInputState: a high byte other than
//      0xA0 switches to eHighbyte (creating the group probers on first use),
//      while ESC or the HZ sequence "~{" switches pure ASCII to eEscAscii.
//   3. The buffer is handed to the prober(s) matching the resulting state.
nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
{
  if (mDone)
    return NS_OK;

  if (aLen > 0)
    mGotData = PR_TRUE;

  //If the data starts with BOM, we know it is UTF.
  //An empty buffer is not the start of the data, so mStart stays set until
  //real bytes arrive. Each signature checks only the length it needs, so a
  //buffer holding nothing but a 3-byte UTF-8 BOM or a 2-byte UTF-16 BOM is
  //still recognised. A BOM split across two buffers is not recognised,
  //because only the first non-empty buffer is inspected.
  if (mStart && aLen > 0)
  {
    mStart = PR_FALSE;
    switch (aBuf[0])
    {
    case '\xEF':
      if (aLen >= 3 && ('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2]))
        // EF BB BF  UTF-8 encoded BOM
        mDetectedCharset = "UTF-8";
      break;
    case '\xFE':
      if (aLen >= 2 && ('\xFF' == aBuf[1]))
      {
        if (aLen >= 4 && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
          // FE FF 00 00  UCS-4, unusual octet order BOM (3412)
          mDetectedCharset = "X-ISO-10646-UCS-4-3412";
        else
          // FE FF  UTF-16, big endian BOM
          mDetectedCharset = "UTF-16";
      }
      break;
    case '\x00':
      if (aLen >= 4 && ('\x00' == aBuf[1]))
      {
        if (('\xFE' == aBuf[2]) && ('\xFF' == aBuf[3]))
          // 00 00 FE FF  UTF-32, big-endian BOM
          mDetectedCharset = "UTF-32";
        else if (('\xFF' == aBuf[2]) && ('\xFE' == aBuf[3]))
          // 00 00 FF FE  UCS-4, unusual octet order BOM (2143)
          mDetectedCharset = "X-ISO-10646-UCS-4-2143";
      }
      break;
    case '\xFF':
      if (aLen >= 2 && ('\xFE' == aBuf[1]))
      {
        if (aLen >= 4 && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
          // FF FE 00 00  UTF-32, little-endian BOM
          mDetectedCharset = "UTF-32";
        else
          // FF FE  UTF-16, little endian BOM
          mDetectedCharset = "UTF-16";
      }
      break;
    }  // switch

    if (mDetectedCharset)
    {
      mDone = PR_TRUE;
      return NS_OK;
    }
  }

  PRUint32 i;
  for (i = 0; i < aLen; i++)
  {
    //other than 0xa0, if every other character is ascii, the page is ascii
    if (aBuf[i] & '\x80' && aBuf[i] != '\xA0')  //Since many Ascii only page contains NBSP
    {
      //we got a non-ascii byte (high-byte)
      if (mInputState != eHighbyte)
      {
        //adjust state
        mInputState = eHighbyte;

        //kill mEscCharSetProber if it is active
        if (mEscCharSetProber) {
          delete mEscCharSetProber;
          mEscCharSetProber = nsnull;
        }

        //start multibyte and singlebyte charset prober
        if (nsnull == mCharSetProbers[0])
        {
          mCharSetProbers[0] = new nsMBCSGroupProber(mLanguageFilter);
          if (nsnull == mCharSetProbers[0])
            return NS_ERROR_OUT_OF_MEMORY;
        }
        //the single-byte group prober is only wanted for non-CJK filters
        if (nsnull == mCharSetProbers[1] &&
            (mLanguageFilter & NS_FILTER_NON_CJK))
        {
          mCharSetProbers[1] = new nsSBCSGroupProber;
          if (nsnull == mCharSetProbers[1])
            return NS_ERROR_OUT_OF_MEMORY;
        }
        if (nsnull == mCharSetProbers[2])
        {
          mCharSetProbers[2] = new nsLatin1Prober;
          if (nsnull == mCharSetProbers[2])
            return NS_ERROR_OUT_OF_MEMORY;
        }
      }
    }
    else
    {
      //ok, just pure ascii so far
      if ( ePureAscii == mInputState &&
        (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~')) )
      {
        //found escape character or HZ "~{"
        mInputState = eEscAscii;
      }
      //remembered so a "~{" pair is still seen when '{' arrives later
      mLastChar = aBuf[i];
    }
  }

  nsProbingState st;
  switch (mInputState)
  {
  case eEscAscii:
    //the escape-sequence prober is created lazily on first use
    if (nsnull == mEscCharSetProber) {
      mEscCharSetProber = new nsEscCharSetProber(mLanguageFilter);
      if (nsnull == mEscCharSetProber)
        return NS_ERROR_OUT_OF_MEMORY;
    }
    st = mEscCharSetProber->HandleData(aBuf, aLen);
    if (st == eFoundIt)
    {
      mDone = PR_TRUE;
      mDetectedCharset = mEscCharSetProber->GetCharSetName();
    }
    break;
  case eHighbyte:
    //the first prober that reports eFoundIt wins; later ones are not fed
    for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
    {
      if (mCharSetProbers[i])
      {
        st = mCharSetProbers[i]->HandleData(aBuf, aLen);
        if (st == eFoundIt)
        {
          mDone = PR_TRUE;
          mDetectedCharset = mCharSetProbers[i]->GetCharSetName();
          return NS_OK;
        }
      }
    }
    break;

  default:  //pure ascii
    ;//do nothing here
  }
  return NS_OK;
}
245
246
247
//---------------------------------------------------------------------
248
void nsUniversalDetector::DataEnd()
249
{
250
  if (!mGotData)
251
  {
252
    // we haven't got any data yet, return immediately 
253
    // caller program sometimes call DataEnd before anything has been sent to detector
254
    return;
255
  }
256
257
  if (mDetectedCharset)
258
  {
259
    mDone = PR_TRUE;
260
    Report(mDetectedCharset);
261
    return;
262
  }
263
  
264
  switch (mInputState)
265
  {
266
  case eHighbyte:
267
    {
268
      float proberConfidence;
269
      float maxProberConfidence = (float)0.0;
270
      PRInt32 maxProber = 0;
271
272
      for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
273
      {
274
        if (mCharSetProbers[i])
275
        {
276
          proberConfidence = mCharSetProbers[i]->GetConfidence();
277
          if (proberConfidence > maxProberConfidence)
278
          {
279
            maxProberConfidence = proberConfidence;
280
            maxProber = i;
281
          }
282
        }
283
      }
284
      //do not report anything because we are not confident of it, that's in fact a negative answer
285
      if (maxProberConfidence > MINIMUM_THRESHOLD)
286
        Report(mCharSetProbers[maxProber]->GetCharSetName());
287
    }
288
    break;
289
  case eEscAscii:
290
    break;
291
  default:
292
    ;
293
  }
294
  return;
295
}