• Skip to content
  • Skip to link menu
  • KDE API Reference
  • kdelibs-4.14.38 API Reference
  • KDE Home
  • Contact Us
 

KDECore

  • kdecore
  • localization
  • probers
nsUniversalDetector.cpp
Go to the documentation of this file.
1/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2/* -*- C++ -*-
3* Copyright (C) 1998 <developer@mozilla.org>
4* Copyright (C) 2008 <wkai@gmail.com>
5*
6* Permission is hereby granted, free of charge, to any person obtaining
7* a copy of this software and associated documentation files (the
8* "Software"), to deal in the Software without restriction, including
9* without limitation the rights to use, copy, modify, merge, publish,
10* distribute, sublicense, and/or sell copies of the Software, and to
11* permit persons to whom the Software is furnished to do so, subject to
12* the following conditions:
13*
14* The above copyright notice and this permission notice shall be included
15* in all copies or substantial portions of the Software.
16*
17* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
21* LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
22* OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
23* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24*/
25
26#include "nsUniversalDetector.h"
27
28#include "nsMBCSGroupProber.h"
29#include "nsSBCSGroupProber.h"
30#include "nsEscCharsetProber.h"
31#include "nsLatin1Prober.h"
32
33namespace kencodingprober {
34nsUniversalDetector::nsUniversalDetector()
35{
36 mDone = false;
37 mBestGuess = -1; //illegal value as signal
38 mInTag = false;
39 mEscCharSetProber = 0;
40
41 mStart = true;
42 mDetectedCharset = 0;
43 mGotData = false;
44 mInputState = ePureAscii;
45 mLastChar = '\0';
46
47 unsigned int i;
48 for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
49 mCharSetProbers[i] = 0;
50}
51
52nsUniversalDetector::~nsUniversalDetector()
53{
54 for (int i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
55 delete mCharSetProbers[i];
56 delete mEscCharSetProber;
57}
58
59void
60nsUniversalDetector::Reset()
61{
62 mDone = false;
63 mBestGuess = -1; //illegal value as signal
64 mInTag = false;
65
66 mStart = true;
67 mDetectedCharset = 0;
68 mGotData = false;
69 mInputState = ePureAscii;
70 mLastChar = '\0';
71
72 if (mEscCharSetProber)
73 mEscCharSetProber->Reset();
74
75 unsigned int i;
76 for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
77 if (mCharSetProbers[i])
78 mCharSetProbers[i]->Reset();
79}
80
81//---------------------------------------------------------------------
// NOTE(review): SHORTCUT_THRESHOLD is not referenced by the code visible in
// this file; presumably a confidence level for accepting a guess early -
// confirm before relying on it.
#define SHORTCUT_THRESHOLD static_cast<float>(0.95)
// A prober's guess is reported only when its confidence exceeds this value;
// GetConfidence() also returns it when there is nothing better to report.
#define MINIMUM_THRESHOLD static_cast<float>(0.20)
84
85nsProbingState nsUniversalDetector::HandleData(const char* aBuf, unsigned int aLen)
86{
87 if(mDone)
88 return eFoundIt;
89
90 if (aLen > 0)
91 mGotData = true;
92
93 unsigned int i;
94 for (i = 0; i < aLen; i++)
95 {
96 //other than 0xa0, if every othe character is ascii, the page is ascii
97 if (aBuf[i] & '\x80' && aBuf[i] != '\xA0') //Since many Ascii only page contains NBSP
98 {
99 //we got a non-ascii byte (high-byte)
100 if (mInputState != eHighbyte)
101 {
102 //adjust state
103 mInputState = eHighbyte;
104
105 //kill mEscCharSetProber if it is active
106 delete mEscCharSetProber;
107 mEscCharSetProber = 0;
108
109 //start multibyte and singlebyte charset prober
110 if (0 == mCharSetProbers[0])
111 mCharSetProbers[0] = new nsMBCSGroupProber;
112 if (0 == mCharSetProbers[1])
113 mCharSetProbers[1] = new nsSBCSGroupProber;
114 if (0 == mCharSetProbers[2])
115 mCharSetProbers[2] = new nsLatin1Prober;
116 }
117 }
118 else
119 {
120 //ok, just pure ascii so far
121 if ( ePureAscii == mInputState &&
122 (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~')) )
123 {
124 //found escape character or HZ "~{"
125 mInputState = eEscAscii;
126 }
127
128 mLastChar = aBuf[i];
129 }
130 }
131
132 nsProbingState st = eDetecting;
133 switch (mInputState)
134 {
135 case eEscAscii:
136 if (0 == mEscCharSetProber) {
137 mEscCharSetProber = new nsEscCharSetProber;
138 }
139 st = mEscCharSetProber->HandleData(aBuf, aLen);
140 if (st == eFoundIt)
141 {
142 mDone = true;
143 mDetectedCharset = mEscCharSetProber->GetCharSetName();
144 }
145 break;
146 case eHighbyte:
147 for (i = 0; i < NUM_OF_CHARSET_PROBERS; ++i)
148 {
149 st = mCharSetProbers[i]->HandleData(aBuf, aLen);
150 if (st == eFoundIt)
151 {
152 mDone = true;
153 mDetectedCharset = mCharSetProbers[i]->GetCharSetName();
154 }
155 }
156 break;
157
158 default: //pure ascii
159 mDetectedCharset = "UTF-8";
160 }
161 return st;
162}
163
164
165//---------------------------------------------------------------------
166const char* nsUniversalDetector::GetCharSetName()
167{
168 if (mDetectedCharset)
169 return mDetectedCharset;
170 switch (mInputState)
171 {
172 case eHighbyte:
173 {
174 float proberConfidence;
175 float maxProberConfidence = (float)0.0;
176 int maxProber = 0;
177
178 for (int i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
179 {
180 proberConfidence = mCharSetProbers[i]->GetConfidence();
181 if (proberConfidence > maxProberConfidence)
182 {
183 maxProberConfidence = proberConfidence;
184 maxProber = i;
185 }
186 }
187 //do not report anything because we are not confident of it, that's in fact a negative answer
188 if (maxProberConfidence > MINIMUM_THRESHOLD)
189 return mCharSetProbers[maxProber]->GetCharSetName();
190 }
191 case eEscAscii:
192 break;
193 default: // pure ascii
194 ;
195 }
196 return "UTF-8";
197
198}
199
200//---------------------------------------------------------------------
201float nsUniversalDetector::GetConfidence()
202{
203 if (!mGotData)
204 {
205 // we haven't got any data yet, return immediately
206 // caller program sometimes call DataEnd before anything has been sent to detector
207 return MINIMUM_THRESHOLD;
208 }
209 if (mDetectedCharset)
210 return 0.99f;
211 switch (mInputState)
212 {
213 case eHighbyte:
214 {
215 float proberConfidence;
216 float maxProberConfidence = (float)0.0;
217 int maxProber = 0;
218
219 for (int i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
220 {
221 proberConfidence = mCharSetProbers[i]->GetConfidence();
222 if (proberConfidence > maxProberConfidence)
223 {
224 maxProberConfidence = proberConfidence;
225 maxProber = i;
226 }
227 }
228 //do not report anything because we are not confident of it, that's in fact a negative answer
229 if (maxProberConfidence > MINIMUM_THRESHOLD)
230 return mCharSetProbers[maxProber]->GetConfidence();
231 }
232 case eEscAscii:
233 break;
234 default: // pure ascii
235 ;
236 }
237 return MINIMUM_THRESHOLD;
238}
239
240nsProbingState nsUniversalDetector::GetState()
241{
242 if (mDone)
243 return eFoundIt;
244 else
245 return eDetecting;
246}
247}
248
249
kencodingprober::nsCharSetProber::Reset
virtual void Reset(void)=0
kencodingprober::nsCharSetProber::HandleData
virtual nsProbingState HandleData(const char *aBuf, unsigned int aLen)=0
kencodingprober::nsCharSetProber::GetConfidence
virtual float GetConfidence(void)=0
kencodingprober::nsCharSetProber::GetCharSetName
virtual const char * GetCharSetName()=0
kencodingprober::nsEscCharSetProber
Definition: nsEscCharsetProber.h:34
kencodingprober::nsLatin1Prober
Definition: nsLatin1Prober.h:33
kencodingprober::nsMBCSGroupProber
Definition: nsMBCSGroupProber.h:38
kencodingprober::nsSBCSGroupProber
Definition: nsSBCSGroupProber.h:47
kencodingprober::nsUniversalDetector::GetState
nsProbingState GetState()
Definition: nsUniversalDetector.cpp:240
kencodingprober::nsUniversalDetector::mCharSetProbers
nsCharSetProber * mCharSetProbers[NUM_OF_CHARSET_PROBERS]
Definition: nsUniversalDetector.h:61
kencodingprober::nsUniversalDetector::mLastChar
char mLastChar
Definition: nsUniversalDetector.h:57
kencodingprober::nsUniversalDetector::mDetectedCharset
const char * mDetectedCharset
Definition: nsUniversalDetector.h:58
kencodingprober::nsUniversalDetector::mStart
bool mStart
Definition: nsUniversalDetector.h:55
kencodingprober::nsUniversalDetector::HandleData
nsProbingState HandleData(const char *aBuf, unsigned int aLen)
Definition: nsUniversalDetector.cpp:85
kencodingprober::nsUniversalDetector::GetCharSetName
const char * GetCharSetName()
Definition: nsUniversalDetector.cpp:166
kencodingprober::nsUniversalDetector::mDone
bool mDone
Definition: nsUniversalDetector.h:53
kencodingprober::nsUniversalDetector::mEscCharSetProber
nsCharSetProber * mEscCharSetProber
Definition: nsUniversalDetector.h:62
kencodingprober::nsUniversalDetector::GetConfidence
float GetConfidence(void)
Definition: nsUniversalDetector.cpp:201
kencodingprober::nsUniversalDetector::mBestGuess
int mBestGuess
Definition: nsUniversalDetector.h:59
kencodingprober::nsUniversalDetector::Reset
void Reset(void)
Definition: nsUniversalDetector.cpp:60
kencodingprober::nsUniversalDetector::~nsUniversalDetector
virtual ~nsUniversalDetector()
Definition: nsUniversalDetector.cpp:52
kencodingprober::nsUniversalDetector::mGotData
bool mGotData
Definition: nsUniversalDetector.h:56
kencodingprober::nsUniversalDetector::mInTag
bool mInTag
Definition: nsUniversalDetector.h:54
kencodingprober::nsUniversalDetector::mInputState
nsInputState mInputState
Definition: nsUniversalDetector.h:52
kencodingprober::nsUniversalDetector::nsUniversalDetector
nsUniversalDetector()
Definition: nsUniversalDetector.cpp:34
kencodingprober
Definition: CharDistribution.cpp:37
kencodingprober::nsProbingState
nsProbingState
Definition: nsCharSetProber.h:34
kencodingprober::eFoundIt
@ eFoundIt
Definition: nsCharSetProber.h:36
kencodingprober::eDetecting
@ eDetecting
Definition: nsCharSetProber.h:35
kencodingprober::eEscAscii
@ eEscAscii
Definition: nsUniversalDetector.h:36
kencodingprober::ePureAscii
@ ePureAscii
Definition: nsUniversalDetector.h:35
kencodingprober::eHighbyte
@ eHighbyte
Definition: nsUniversalDetector.h:37
nsEscCharsetProber.h
nsLatin1Prober.h
nsMBCSGroupProber.h
nsSBCSGroupProber.h
MINIMUM_THRESHOLD
#define MINIMUM_THRESHOLD
Definition: nsUniversalDetector.cpp:83
nsUniversalDetector.h
NUM_OF_CHARSET_PROBERS
#define NUM_OF_CHARSET_PROBERS
Definition: nsUniversalDetector.h:31
This file is part of the KDE documentation.
Documentation copyright © 1996-2023 The KDE developers.
Generated on Mon Feb 20 2023 00:00:00 by doxygen 1.9.6 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.

KDECore

Skip menu "KDECore"
  • Main Page
  • Namespace List
  • Namespace Members
  • Alphabetical List
  • Class List
  • Class Hierarchy
  • Class Members
  • File List
  • File Members
  • Modules
  • Related Pages

kdelibs-4.14.38 API Reference

Skip menu "kdelibs-4.14.38 API Reference"
  • DNSSD
  • Interfaces
  •   KHexEdit
  •   KMediaPlayer
  •   KSpeech
  •   KTextEditor
  • kconf_update
  • KDE3Support
  •   KUnitTest
  • KDECore
  • KDED
  • KDEsu
  • KDEUI
  • KDEWebKit
  • KDocTools
  • KFile
  • KHTML
  • KImgIO
  • KInit
  • kio
  • KIOSlave
  • KJS
  •   KJS-API
  •   WTF
  • kjsembed
  • KNewStuff
  • KParts
  • KPty
  • Kross
  • KUnitConversion
  • KUtils
  • Nepomuk
  • Plasma
  • Solid
  • Sonnet
  • ThreadWeaver
Report problems with this website to our bug tracking system.
Contact the specific authors with questions and comments about the page contents.

KDE® and the K Desktop Environment® logo are registered trademarks of KDE e.V. | Legal