/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* ***** BEGIN LICENSE BLOCK *****
 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
 *
 * The contents of this file are subject to the Mozilla Public License Version
 * 1.1 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 *
 * The Original Code is Mozilla Universal charset detector code.
 *
 * The Initial Developer of the Original Code is
 * Shy Shalom <shooshX@gmail.com>
 * Portions created by the Initial Developer are Copyright (C) 2005
 * the Initial Developer. All Rights Reserved.
 *
 * Contributor(s):
 *
 * Alternatively, the contents of this file may be used under the terms of
 * either the GNU General Public License Version 2 or later (the "GPL"), or
 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 * in which case the provisions of the GPL or the LGPL are applicable instead
 * of those above. If you wish to allow use of your version of this file only
 * under the terms of either the GPL or the LGPL, and not to allow others to
 * use your version of this file under the terms of the MPL, indicate your
 * decision by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL or the LGPL. If you do not delete
 * the provisions above, a recipient may use your version of this file under
 * the terms of any one of the MPL, the GPL or the LGPL.
 *
 * ***** END LICENSE BLOCK ***** */
| 37 |
|
| 38 |
#include "nsHebrewProber.h" |
| 39 |
#include <stdio.h> |
| 40 |
|
| 41 |
// windows-1255 / ISO-8859-8 code points of interest |
| 42 |
#define FINAL_KAF ('\xea') |
| 43 |
#define NORMAL_KAF ('\xeb') |
| 44 |
#define FINAL_MEM ('\xed') |
| 45 |
#define NORMAL_MEM ('\xee') |
| 46 |
#define FINAL_NUN ('\xef') |
| 47 |
#define NORMAL_NUN ('\xf0') |
| 48 |
#define FINAL_PE ('\xf3') |
| 49 |
#define NORMAL_PE ('\xf4') |
| 50 |
#define FINAL_TSADI ('\xf5') |
| 51 |
#define NORMAL_TSADI ('\xf6') |
| 52 |
|
| 53 |
// Minimum Visual vs Logical final letter score difference. |
| 54 |
// If the difference is below this, don't rely solely on the final letter score distance. |
| 55 |
#define MIN_FINAL_CHAR_DISTANCE (5) |
| 56 |
|
| 57 |
// Minimum Visual vs Logical model score difference. |
| 58 |
// If the difference is below this, don't rely at all on the model score distance. |
| 59 |
#define MIN_MODEL_DISTANCE (0.01) |
| 60 |
|
| 61 |
#define VISUAL_HEBREW_NAME ("ISO-8859-8") |
| 62 |
#define LOGICAL_HEBREW_NAME ("windows-1255") |
| 63 |
|
| 64 |
PRBool nsHebrewProber::isFinal(char c) |
| 65 |
{ |
| 66 |
return ((c == FINAL_KAF) || (c == FINAL_MEM) || (c == FINAL_NUN) || (c == FINAL_PE) || (c == FINAL_TSADI)); |
| 67 |
} |
| 68 |
|
| 69 |
PRBool nsHebrewProber::isNonFinal(char c) |
| 70 |
{ |
| 71 |
return ((c == NORMAL_KAF) || (c == NORMAL_MEM) || (c == NORMAL_NUN) || (c == NORMAL_PE)); |
| 72 |
// The normal Tsadi is not a good Non-Final letter due to words like |
| 73 |
// 'lechotet' (to chat) containing an apostrophe after the tsadi. This |
| 74 |
// apostrophe is converted to a space in FilterWithoutEnglishLetters causing |
| 75 |
// the Non-Final tsadi to appear at an end of a word even though this is not |
| 76 |
// the case in the original text. |
| 77 |
// The letters Pe and Kaf rarely display a related behavior of not being a |
| 78 |
// good Non-Final letter. Words like 'Pop', 'Winamp' and 'Mubarak' for |
| 79 |
// example legally end with a Non-Final Pe or Kaf. However, the benefit of |
| 80 |
// these letters as Non-Final letters outweighs the damage since these words |
| 81 |
// are quite rare. |
| 82 |
} |
| 83 |
|
| 84 |
/** HandleData |
| 85 |
* Final letter analysis for logical-visual decision. |
| 86 |
* Look for evidence that the received buffer is either logical Hebrew or |
| 87 |
* visual Hebrew. |
| 88 |
* The following cases are checked: |
| 89 |
* 1) A word longer than 1 letter, ending with a final letter. This is an |
| 90 |
* indication that the text is laid out "naturally" since the final letter |
| 91 |
* really appears at the end. +1 for logical score. |
| 92 |
* 2) A word longer than 1 letter, ending with a Non-Final letter. In normal |
| 93 |
* Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi, should not end with |
| 94 |
* the Non-Final form of that letter. Exceptions to this rule are mentioned |
| 95 |
* above in isNonFinal(). This is an indication that the text is laid out |
| 96 |
* backwards. +1 for visual score |
| 97 |
* 3) A word longer than 1 letter, starting with a final letter. Final letters |
| 98 |
* should not appear at the beginning of a word. This is an indication that |
| 99 |
* the text is laid out backwards. +1 for visual score. |
| 100 |
* |
| 101 |
* The visual score and logical score are accumulated throughout the text and |
| 102 |
* are finally checked against each other in GetCharSetName(). |
| 103 |
* No checking for final letters in the middle of words is done since that case |
| 104 |
* is not an indication for either Logical or Visual text. |
| 105 |
* |
| 106 |
* The input buffer should not contain any white spaces that are not (' ') |
| 107 |
* or any low-ascii punctuation marks. |
| 108 |
*/ |
| 109 |
nsProbingState nsHebrewProber::HandleData(const char* aBuf, PRUint32 aLen) |
| 110 |
{ |
| 111 |
// Both model probers say it's not them. No reason to continue. |
| 112 |
if (GetState() == eNotMe) |
| 113 |
return eNotMe; |
| 114 |
|
| 115 |
const char *curPtr, *endPtr = aBuf+aLen; |
| 116 |
char cur; |
| 117 |
|
| 118 |
for (curPtr = (char*)aBuf; curPtr < endPtr; ++curPtr) |
| 119 |
{ |
| 120 |
cur = *curPtr; |
| 121 |
if (cur == ' ') // We stand on a space - a word just ended |
| 122 |
{ |
| 123 |
if (mBeforePrev != ' ') // *(curPtr-2) was not a space so prev is not a 1 letter word |
| 124 |
{ |
| 125 |
if (isFinal(mPrev)) // case (1) [-2:not space][-1:final letter][cur:space] |
| 126 |
++mFinalCharLogicalScore; |
| 127 |
else if (isNonFinal(mPrev)) // case (2) [-2:not space][-1:Non-Final letter][cur:space] |
| 128 |
++mFinalCharVisualScore; |
| 129 |
} |
| 130 |
} |
| 131 |
else // Not standing on a space |
| 132 |
{ |
| 133 |
if ((mBeforePrev == ' ') && (isFinal(mPrev)) && (cur != ' ')) // case (3) [-2:space][-1:final letter][cur:not space] |
| 134 |
++mFinalCharVisualScore; |
| 135 |
} |
| 136 |
mBeforePrev = mPrev; |
| 137 |
mPrev = cur; |
| 138 |
} |
| 139 |
|
| 140 |
// Forever detecting, till the end or until both model probers return eNotMe (handled above). |
| 141 |
return eDetecting; |
| 142 |
} |
| 143 |
|
| 144 |
// Make the decision: is it Logical or Visual? |
| 145 |
const char* nsHebrewProber::GetCharSetName() |
| 146 |
{ |
| 147 |
// If the final letter score distance is dominant enough, rely on it. |
| 148 |
PRInt32 finalsub = mFinalCharLogicalScore - mFinalCharVisualScore; |
| 149 |
if (finalsub >= MIN_FINAL_CHAR_DISTANCE) |
| 150 |
return LOGICAL_HEBREW_NAME; |
| 151 |
if (finalsub <= -(MIN_FINAL_CHAR_DISTANCE)) |
| 152 |
return VISUAL_HEBREW_NAME; |
| 153 |
|
| 154 |
// It's not dominant enough, try to rely on the model scores instead. |
| 155 |
float modelsub = mLogicalProb->GetConfidence() - mVisualProb->GetConfidence(); |
| 156 |
if (modelsub > MIN_MODEL_DISTANCE) |
| 157 |
return LOGICAL_HEBREW_NAME; |
| 158 |
if (modelsub < -(MIN_MODEL_DISTANCE)) |
| 159 |
return VISUAL_HEBREW_NAME; |
| 160 |
|
| 161 |
// Still no good, back to final letter distance, maybe it'll save the day. |
| 162 |
if (finalsub < 0) |
| 163 |
return VISUAL_HEBREW_NAME; |
| 164 |
|
| 165 |
// (finalsub > 0 - Logical) or (don't know what to do) default to Logical. |
| 166 |
return LOGICAL_HEBREW_NAME; |
| 167 |
} |
| 168 |
|
| 169 |
|
| 170 |
void nsHebrewProber::Reset(void) |
| 171 |
{ |
| 172 |
mFinalCharLogicalScore = 0; |
| 173 |
mFinalCharVisualScore = 0; |
| 174 |
|
| 175 |
// mPrev and mBeforePrev are initialized to space in order to simulate a word |
| 176 |
// delimiter at the beginning of the data |
| 177 |
mPrev = ' '; |
| 178 |
mBeforePrev = ' '; |
| 179 |
} |
| 180 |
|
| 181 |
nsProbingState nsHebrewProber::GetState(void) |
| 182 |
{ |
| 183 |
// Remain active as long as any of the model probers are active. |
| 184 |
if ((mLogicalProb->GetState() == eNotMe) && (mVisualProb->GetState() == eNotMe)) |
| 185 |
return eNotMe; |
| 186 |
return eDetecting; |
| 187 |
} |
| 188 |
|
| 189 |
#ifdef DEBUG_chardet |
| 190 |
void nsHebrewProber::DumpStatus() |
| 191 |
{ |
| 192 |
printf(" HEB: %d - %d [Logical-Visual score]\r\n", mFinalCharLogicalScore, mFinalCharVisualScore); |
| 193 |
} |
| 194 |
#endif |