1
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2
/* ***** BEGIN LICENSE BLOCK *****
3
 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
4
 *
5
 * The contents of this file are subject to the Mozilla Public License Version
6
 * 1.1 (the "License"); you may not use this file except in compliance with
7
 * the License. You may obtain a copy of the License at
8
 * http://www.mozilla.org/MPL/
9
 *
10
 * Software distributed under the License is distributed on an "AS IS" basis,
11
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12
 * for the specific language governing rights and limitations under the
13
 * License.
14
 *
15
 * The Original Code is Mozilla Universal charset detector code.
16
 *
17
 * The Initial Developer of the Original Code is
18
 * Netscape Communications Corporation.
19
 * Portions created by the Initial Developer are Copyright (C) 2001
20
 * the Initial Developer. All Rights Reserved.
21
 *
22
 * Contributor(s):
23
 *          Shy Shalom <shooshX@gmail.com>
24
 *
25
 * Alternatively, the contents of this file may be used under the terms of
26
 * either the GNU General Public License Version 2 or later (the "GPL"), or
27
 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
28
 * in which case the provisions of the GPL or the LGPL are applicable instead
29
 * of those above. If you wish to allow use of your version of this file only
30
 * under the terms of either the GPL or the LGPL, and not to allow others to
31
 * use your version of this file under the terms of the MPL, indicate your
32
 * decision by deleting the provisions above and replace them with the notice
33
 * and other provisions required by the GPL or the LGPL. If you do not delete
34
 * the provisions above, a recipient may use your version of this file under
35
 * the terms of any one of the MPL, the GPL or the LGPL.
36
 *
37
 * ***** END LICENSE BLOCK ***** */
38
39
#include <stdio.h>
40
#include "prmem.h"
41
42
#include "nsSBCharSetProber.h"
43
#include "nsSBCSGroupProber.h"
44
45
#include "nsHebrewProber.h"
46
47
// Populates the prober table: eleven plain single-byte model probers in
// slots 0-10, followed by the Hebrew prober and the two Win1255 model
// probers it arbitrates between. Ends by calling Reset().
nsSBCSGroupProber::nsSBCSGroupProber()
{
  // Slots occupied by the Hebrew trio. They are named here so that the
  // allocation, the wiring and the failure clean-up below cannot drift apart.
  enum
  {
    kHebrewSlot = 11,
    kLogicalHebrewSlot = 12,
    kVisualHebrewSlot = 13
  };

  mProbers[0] = new nsSingleByteCharSetProber(&Win1251Model);
  mProbers[1] = new nsSingleByteCharSetProber(&Koi8rModel);
  mProbers[2] = new nsSingleByteCharSetProber(&Latin5Model);
  mProbers[3] = new nsSingleByteCharSetProber(&MacCyrillicModel);
  mProbers[4] = new nsSingleByteCharSetProber(&Ibm866Model);
  mProbers[5] = new nsSingleByteCharSetProber(&Ibm855Model);
  mProbers[6] = new nsSingleByteCharSetProber(&Latin7Model);
  mProbers[7] = new nsSingleByteCharSetProber(&Win1253Model);
  mProbers[8] = new nsSingleByteCharSetProber(&Latin5BulgarianModel);
  mProbers[9] = new nsSingleByteCharSetProber(&Win1251BulgarianModel);
  mProbers[10] = new nsSingleByteCharSetProber(&TIS620ThaiModel);

  // The Hebrew prober is handed to both Win1255 model probers at
  // construction time, and is then told about them in return.
  nsHebrewProber *hebrew = new nsHebrewProber();
  mProbers[kHebrewSlot] = hebrew;
  mProbers[kLogicalHebrewSlot] =
    new nsSingleByteCharSetProber(&Win1255Model, PR_FALSE, hebrew); // Logical Hebrew
  mProbers[kVisualHebrewSlot] =
    new nsSingleByteCharSetProber(&Win1255Model, PR_TRUE, hebrew); // Visual Hebrew

  if (!mProbers[kHebrewSlot] ||
      !mProbers[kLogicalHebrewSlot] ||
      !mProbers[kVisualHebrewSlot])
  {
    // At least one of the three is missing: give up on Hebrew probing
    // entirely by freeing and nulling all three slots.
    for (PRUint32 slot = kHebrewSlot; slot <= kVisualHebrewSlot; ++slot)
    {
      delete mProbers[slot];
      mProbers[slot] = 0;
    }
  }
  else
  {
    // All three exist: let the Hebrew prober see the logical and visual ones.
    hebrew->SetModelProbers(mProbers[kLogicalHebrewSlot],
                            mProbers[kVisualHebrewSlot]);
  }

  // The latin2 probers (Latin2HungarianModel, Win1250HungarianModel) stay
  // disabled until a latin1 prober is available; otherwise all latin1 text
  // would be detected as latin2 because of their similarity.

  Reset();
}
88
89
// Frees every prober held in the table, walking the slots in ascending
// order. Slots that hold a null pointer are harmless: deleting a null
// pointer does nothing.
nsSBCSGroupProber::~nsSBCSGroupProber()
{
  PRUint32 slot = 0;
  while (slot < NUM_OF_SBCS_PROBERS)
  {
    delete mProbers[slot];
    ++slot;
  }
}
96
97
98
const char* nsSBCSGroupProber::GetCharSetName()
99
{
100
  //if we have no answer yet
101
  if (mBestGuess == -1)
102
  {
103
    GetConfidence();
104
    //no charset seems positive
105
    if (mBestGuess == -1)
106
      //we will use default.
107
      mBestGuess = 0;
108
  }
109
  return mProbers[mBestGuess]->GetCharSetName();
110
}
111
112
void  nsSBCSGroupProber::Reset(void)
113
{
114
  mActiveNum = 0;
115
  for (PRUint32 i = 0; i < NUM_OF_SBCS_PROBERS; i++)
116
  {
117
    if (mProbers[i]) // not null
118
    {
119
      mProbers[i]->Reset();
120
      mIsActive[i] = PR_TRUE;
121
      ++mActiveNum;
122
    }
123
    else
124
      mIsActive[i] = PR_FALSE;
125
  }
126
  mBestGuess = -1;
127
  mState = eDetecting;
128
}
129
130
131
// Feeds a chunk of input to every active prober and returns the group's
// resulting state (mState).
//
//   aBuf, aLen - the raw input chunk.
//
// The chunk is first passed through FilterWithoutEnglishLetters(); the
// probers only ever see the filtered copy. The filter is applied without
// regard to each prober's KeepEnglishLetters setting because, as of now,
// no prober in this group recognizes a language that uses English letters.
//
// The state becomes eFoundIt as soon as one prober reports eFoundIt (that
// prober becomes the best guess), and eNotMe once the last active prober
// has reported eNotMe. Otherwise the state is left as it was.
nsProbingState nsSBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen)
{
  char *filtered = 0;
  PRUint32 filteredLen = 0;

  // Only probe when filtering succeeded and left something to look at.
  if (FilterWithoutEnglishLetters(aBuf, aLen, &filtered, filteredLen) &&
      filteredLen != 0)
  {
    for (PRUint32 slot = 0; slot < NUM_OF_SBCS_PROBERS; ++slot)
    {
      if (!mIsActive[slot])
        continue;

      const nsProbingState verdict =
        mProbers[slot]->HandleData(filtered, filteredLen);

      if (verdict == eFoundIt)
      {
        // A prober is certain: record it and stop asking the others.
        mBestGuess = slot;
        mState = eFoundIt;
        break;
      }

      if (verdict != eNotMe)
        continue;

      // This prober has ruled itself out; retire it.
      mIsActive[slot] = PR_FALSE;
      if (--mActiveNum <= 0)
      {
        // Nobody is left to claim the input.
        mState = eNotMe;
        break;
      }
    }
  }

  // Release the filtered buffer, if one was handed back.
  PR_FREEIF(filtered);

  return mState;
}
178
179
float nsSBCSGroupProber::GetConfidence(void)
180
{
181
  PRUint32 i;
182
  float bestConf = 0.0, cf;
183
184
  switch (mState)
185
  {
186
  case eFoundIt:
187
    return (float)0.99; //sure yes
188
  case eNotMe:
189
    return (float)0.01;  //sure no
190
  default:
191
    for (i = 0; i < NUM_OF_SBCS_PROBERS; i++)
192
    {
193
      if (!mIsActive[i])
194
        continue;
195
      cf = mProbers[i]->GetConfidence();
196
      if (bestConf < cf)
197
      {
198
        bestConf = cf;
199
        mBestGuess = i;
200
      }
201
    }
202
  }
203
  return bestConf;
204
}
205
206
#ifdef DEBUG_chardet
207
void nsSBCSGroupProber::DumpStatus()
208
{
209
  PRUint32 i;
210
  float cf;
211
  
212
  cf = GetConfidence();
213
  printf(" SBCS Group Prober --------begin status \r\n");
214
  for (i = 0; i < NUM_OF_SBCS_PROBERS; i++)
215
  {
216
    if (!mIsActive[i])
217
      printf("  inactive: [%s] (i.e. confidence is too low).\r\n", mProbers[i]->GetCharSetName());
218
    else
219
      mProbers[i]->DumpStatus();
220
  }
221
  printf(" SBCS Group found best match [%s] confidence %f.\r\n",  
222
         mProbers[mBestGuess]->GetCharSetName(), cf);
223
}
224
#endif