/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* ***** BEGIN LICENSE BLOCK *****
 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
 *
 * The contents of this file are subject to the Mozilla Public License Version
 * 1.1 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 *
 * The Original Code is Mozilla Universal charset detector code.
 *
 * The Initial Developer of the Original Code is
 * Netscape Communications Corporation.
 * Portions created by the Initial Developer are Copyright (C) 2001
 * the Initial Developer. All Rights Reserved.
 *
 * Contributor(s):
 *          Shy Shalom <shooshX@gmail.com>
 *
 * Alternatively, the contents of this file may be used under the terms of
 * either the GNU General Public License Version 2 or later (the "GPL"), or
 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 * in which case the provisions of the GPL or the LGPL are applicable instead
 * of those above. If you wish to allow use of your version of this file only
 * under the terms of either the GPL or the LGPL, and not to allow others to
 * use your version of this file under the terms of the MPL, indicate your
 * decision by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL or the LGPL. If you do not delete
 * the provisions above, a recipient may use your version of this file under
 * the terms of any one of the MPL, the GPL or the LGPL.
 *
 * ***** END LICENSE BLOCK ***** */
38
39
#include "nsLatin1Prober.h"
40
#include "prmem.h"
41
#include <stdio.h>
42
43
// Character classes assigned to every byte value by Latin1_CharToClass.
// The numeric values double as row/column indices into Latin1ClassModel
// (row = previous class, column = current class), so they must stay
// contiguous, start at 0, and CLASS_NUM must remain the class count.
//
// These are enumerators rather than #defines so that the very short names
// (OTH, ASS, UDF, ...) follow normal C++ scoping rules instead of textually
// rewriting every later token with the same spelling.
enum
{
  UDF = 0,        // undefined
  OTH = 1,        // other
  ASC = 2,        // ascii capital letter
  ASS = 3,        // ascii small letter
  ACV = 4,        // accent capital vowel
  ACO = 5,        // accent capital other
  ASV = 6,        // accent small vowel
  ASO = 7,        // accent small other
  CLASS_NUM = 8   // total classes
};
52
53
static const unsigned char Latin1_CharToClass[] = 
54
{
55
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 00 - 07
56
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 08 - 0F
57
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 10 - 17
58
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 18 - 1F
59
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 20 - 27
60
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 28 - 2F
61
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 30 - 37
62
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 38 - 3F
63
  OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC,   // 40 - 47
64
  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,   // 48 - 4F
65
  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,   // 50 - 57
66
  ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH,   // 58 - 5F
67
  OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS,   // 60 - 67
68
  ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,   // 68 - 6F
69
  ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,   // 70 - 77
70
  ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH,   // 78 - 7F
71
  OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH,   // 80 - 87
72
  OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF,   // 88 - 8F
73
  UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 90 - 97
74
  OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO,   // 98 - 9F
75
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // A0 - A7
76
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // A8 - AF
77
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // B0 - B7
78
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // B8 - BF
79
  ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO,   // C0 - C7
80
  ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV,   // C8 - CF
81
  ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH,   // D0 - D7
82
  ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO,   // D8 - DF
83
  ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO,   // E0 - E7
84
  ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV,   // E8 - EF
85
  ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH,   // F0 - F7
86
  ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO,   // F8 - FF
87
};
88
89
90
/* Likelihood of seeing one character class directly after another.
 *
 * Stored as a flat 8 x 8 matrix: the row is the class of the previous
 * character, the column is the class of the current one, so an entry is
 * addressed as  previousClass * CLASS_NUM + currentClass.
 *
 * Entry values:
 *   0 : illegal
 *   1 : very unlikely
 *   2 : normal
 *   3 : very likely
 */
static const unsigned char Latin1ClassModel[] =
{
  /* prev \ cur   UDF OTH ASC ASS ACV ACO ASV ASO */
  /* UDF */        0,  0,  0,  0,  0,  0,  0,  0,
  /* OTH */        0,  3,  3,  3,  3,  3,  3,  3,
  /* ASC */        0,  3,  3,  3,  3,  3,  3,  3,
  /* ASS */        0,  3,  3,  3,  1,  1,  3,  3,
  /* ACV */        0,  3,  3,  3,  1,  2,  1,  2,
  /* ACO */        0,  3,  3,  3,  3,  3,  3,  3,
  /* ASV */        0,  3,  1,  3,  1,  1,  1,  3,
  /* ASO */        0,  3,  1,  3,  1,  1,  3,  3,
};
107
108
void  nsLatin1Prober::Reset(void)
109
{
110
  mState = eDetecting;
111
  mLastCharClass = OTH;
112
  for (int i = 0; i < FREQ_CAT_NUM; i++)
113
    mFreqCounter[i] = 0;
114
}
115
116
117
/* Feed one chunk of input to the Latin-1 prober.
 *
 * aBuf / aLen describe the bytes to examine. The chunk is first handed to
 * FilterWithEnglishLetters(); if that call reports failure, the caller's
 * bytes are scanned as they are. Each scanned byte is mapped to a character
 * class and the (previous class, current class) pair is looked up in
 * Latin1ClassModel:
 *   - a likelihood of 0 marks the pair as illegal: the state becomes eNotMe
 *     and scanning stops;
 *   - any other likelihood increments the matching mFreqCounter bucket.
 * mLastCharClass is not reset here, so a pair that straddles two calls is
 * still evaluated.
 *
 * Returns mState as it stands after the chunk has been processed. */
nsProbingState nsLatin1Prober::HandleData(const char* aBuf, PRUint32 aLen)
{
  char *scanBuf = 0;
  PRUint32 scanLen = 0;

  const bool filtered = FilterWithEnglishLetters(aBuf, aLen, &scanBuf, scanLen);
  if (!filtered) {
    // Fall back to the raw input. The cast only lets both cases share one
    // pointer variable; the bytes are read, never written, below.
    scanBuf = (char*)aBuf;
    scanLen = aLen;
  }

  for (PRUint32 pos = 0; pos < scanLen; ++pos)
  {
    const unsigned char curClass =
      Latin1_CharToClass[(unsigned char)scanBuf[pos]];
    const unsigned char likelihood =
      Latin1ClassModel[mLastCharClass*CLASS_NUM + curClass];

    if (!likelihood) {
      // Illegal class sequence: this cannot be our charset.
      mState = eNotMe;
      break;
    }

    ++mFreqCounter[likelihood];
    mLastCharClass = curClass;
  }

  // Release the buffer handed back by the filter, never the caller's own.
  if (scanBuf != aBuf) {
    PR_FREEIF(scanBuf);
  }

  return mState;
}
146
147
float nsLatin1Prober::GetConfidence(void)
148
{
149
  if (mState == eNotMe)
150
    return 0.01f;
151
  
152
  float confidence;
153
  PRUint32 total = 0;
154
  for (PRInt32 i = 0; i < FREQ_CAT_NUM; i++)
155
    total += mFreqCounter[i];
156
157
  if(!total)
158
    confidence = 0.0f;
159
  else
160
  {
161
    confidence = mFreqCounter[3]*1.0f / total;
162
    confidence -= mFreqCounter[1]*20.0f/total;
163
  }
164
165
  if (confidence < 0.0f)
166
    confidence = 0.0f;
167
  
168
  // lower the confidence of latin1 so that other more accurate detector 
169
  // can take priority.
170
  confidence *= 0.50f;
171
172
  return confidence;
173
}
174
175
#ifdef DEBUG_chardet
/* Debug helper (only built when DEBUG_chardet is defined): prints the current
 * confidence and the charset name to stdout. */
void  nsLatin1Prober::DumpStatus()
{
  const float confidence = GetConfidence();
  const char *charsetName = GetCharSetName();
  printf(" Latin1Prober: %1.3f [%s]\r\n", confidence, charsetName);
}
#endif