• Skip to content
  • Skip to link menu
  • KDE API Reference
  • kdelibs-4.14.38 API Reference
  • KDE Home
  • Contact Us
 

KDECore

  • kdecore
  • localization
kencodingprober.cpp
Go to the documentation of this file.
1/*
2 This file is part of the KDE libraries
3
4 Copyright (C) 2008 Wang Hoi (zealot.hoi@gmail.com)
5
6 This library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Library General Public
8 License as published by the Free Software Foundation; either
9 version 2 of the License, or (at your option) any later version.
10
11 This library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Library General Public License for more details.
15
16 You should have received a copy of the GNU Library General Public License
17 along with this library; see the file COPYING.LIB. If not, write to
18 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
19 Boston, MA 02110-1301, USA.
20
21*/
22
23#include "kencodingprober.h"
24
25#include "klocale.h"
26
27#include "probers/nsCharSetProber.h"
28#include "probers/nsUniversalDetector.h"
29#include "probers/ChineseGroupProber.h"
30#include "probers/JapaneseGroupProber.h"
31#include "probers/UnicodeGroupProber.h"
32#include "probers/nsSBCSGroupProber.h"
33#include "probers/nsMBCSGroupProber.h"
34
35#include <string.h>
36
37class KEncodingProberPrivate
38{
39public:
40 KEncodingProberPrivate(): prober(NULL), mStart(true) {};
41 ~KEncodingProberPrivate()
42 {
43 delete prober;
44 }
45 void setProberType(KEncodingProber::ProberType pType)
46 {
47 proberType = pType;
48 /* handle multi-byte encodings carefully , because they're hard to detect,
49 * and have to use some Stastics methods.
50 * for single-byte encodings (most western encodings), nsSBCSGroupProber is ok,
51 * because encoding state machine can detect many such encodings.
52 */
53
54 delete prober;
55
56 switch (proberType) {
57 case KEncodingProber::None:
58 prober = NULL;
59 break;
60 case KEncodingProber::Arabic:
61 case KEncodingProber::Baltic:
62 case KEncodingProber::CentralEuropean:
63 case KEncodingProber::Cyrillic:
64 case KEncodingProber::Greek:
65 case KEncodingProber::Hebrew:
66 case KEncodingProber::NorthernSaami:
67 case KEncodingProber::Other:
68 case KEncodingProber::SouthEasternEurope:
69 case KEncodingProber::Thai:
70 case KEncodingProber::Turkish:
71 case KEncodingProber::WesternEuropean:
72 prober = new kencodingprober::nsSBCSGroupProber();
73 break;
74 case KEncodingProber::ChineseSimplified:
75 case KEncodingProber::ChineseTraditional:
76 prober = new kencodingprober::ChineseGroupProber();
77 break;
78 case KEncodingProber::Japanese:
79 prober = new kencodingprober::JapaneseGroupProber();
80 break;
81 case KEncodingProber::Korean:
82 prober = new kencodingprober::nsMBCSGroupProber();
83 break;
84 case KEncodingProber::Unicode:
85 prober = new kencodingprober::UnicodeGroupProber();
86 break;
87 case KEncodingProber::Universal:
88 prober = new kencodingprober::nsUniversalDetector();
89 break;
90 default:
91 prober = NULL;
92 }
93 }
94 void unicodeTest(const char *aBuf, int aLen)
95 {
96 if (mStart)
97 {
98 mStart = false;
99 if (aLen > 3)
100 switch (aBuf[0])
101 {
102 case '\xEF':
103 if (('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2]))
104 // EF BB BF UTF-8 encoded BOM
105 proberState = KEncodingProber::FoundIt;
106 break;
107 case '\xFE':
108 if (('\xFF' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
109 // FE FF 00 00 UCS-4, unusual octet order BOM (3412)
110 proberState = KEncodingProber::FoundIt;
111 else if ('\xFF' == aBuf[1])
112 // FE FF UTF-16, big endian BOM
113 proberState = KEncodingProber::FoundIt;
114 break;
115 case '\x00':
116 if (('\x00' == aBuf[1]) && ('\xFE' == aBuf[2]) && ('\xFF' == aBuf[3]))
117 // 00 00 FE FF UTF-32, big-endian BOM
118 proberState = KEncodingProber::FoundIt;
119 else if (('\x00' == aBuf[1]) && ('\xFF' == aBuf[2]) && ('\xFE' == aBuf[3]))
120 // 00 00 FF FE UCS-4, unusual octet order BOM (2143)
121 proberState = KEncodingProber::FoundIt;
122 break;
123 case '\xFF':
124 if (('\xFE' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
125 // FF FE 00 00 UTF-32, little-endian BOM
126 proberState = KEncodingProber::FoundIt;
127 else if ('\xFE' == aBuf[1])
128 // FF FE UTF-16, little endian BOM
129 proberState = KEncodingProber::FoundIt;
130 break;
131 } // switch
132
133 }
134 }
135 KEncodingProber::ProberType proberType;
136 KEncodingProber::ProberState proberState;
137 kencodingprober::nsCharSetProber *prober;
138 bool mStart;
139};
140
141KEncodingProber::KEncodingProber(KEncodingProber::ProberType proberType): d(new KEncodingProberPrivate())
142{
143 setProberType(proberType);
144}
145
146KEncodingProber::~KEncodingProber()
147{
148 delete d;
149}
150
151void KEncodingProber::reset()
152{
153 d->proberState = KEncodingProber::Probing;
154 d->mStart = true;
155}
156
157KEncodingProber::ProberState KEncodingProber::feed(const QByteArray &data)
158{
159 return feed(data.data(), data.size());
160}
161
162KEncodingProber::ProberState KEncodingProber::feed(const char* data, int len)
163{
164 if (!d->prober)
165 return d->proberState;
166 if (d->proberState == Probing) {
167 if (d->mStart) {
168 d->unicodeTest(data, len);
169 if (d->proberState == FoundIt)
170 return d->proberState;
171 }
172 d->prober->HandleData(data, len);
173 switch (d->prober->GetState())
174 {
175 case kencodingprober::eNotMe:
176 d->proberState = NotMe;
177 break;
178 case kencodingprober::eFoundIt:
179 d->proberState = FoundIt;
180 break;
181 default:
182 d->proberState = Probing;
183 break;
184 }
185 }
186#ifdef DEBUG_PROBE
187 d->prober->DumpStatus();
188#endif
189 return d->proberState;
190}
191
192KEncodingProber::ProberState KEncodingProber::state() const
193{
194 return d->proberState;
195}
196
197//DEPRECATED, do *not* use
198#ifndef KDE_NO_DEPRECATED
199const char* KEncodingProber::encodingName() const
200{
201 return qstrdup(encoding().constData());
202}
203#endif
204
205QByteArray KEncodingProber::encoding() const
206{
207 if (!d->prober)
208 return QByteArray("UTF-8");
209
210 return QByteArray(d->prober->GetCharSetName());
211}
212
213float KEncodingProber::confidence() const
214{
215 if (!d->prober)
216 return 0.0;
217
218 return d->prober->GetConfidence();
219}
220
221KEncodingProber::ProberType KEncodingProber::proberType() const
222{
223 return d->proberType;
224}
225
226void KEncodingProber::setProberType(KEncodingProber::ProberType proberType)
227{
228 d->setProberType(proberType);
229 reset();
230}
231
232KEncodingProber::ProberType KEncodingProber::proberTypeForName(const QString& lang)
233{
234 if (lang.isEmpty())
235 return KEncodingProber::Universal;
236 else if (lang==i18nc("@item Text character set", "Disabled"))
237 return KEncodingProber::None;
238 else if (lang==i18nc("@item Text character set", "Universal"))
239 return KEncodingProber::Universal;
240 else if (lang==i18nc("@item Text character set", "Unicode"))
241 return KEncodingProber::Unicode;
242 else if (lang==i18nc("@item Text character set", "Cyrillic"))
243 return KEncodingProber::Cyrillic;
244 else if (lang==i18nc("@item Text character set", "Western European"))
245 return KEncodingProber::WesternEuropean;
246 else if (lang==i18nc("@item Text character set", "Central European"))
247 return KEncodingProber::CentralEuropean;
248 else if (lang==i18nc("@item Text character set", "Greek"))
249 return KEncodingProber::Greek;
250 else if (lang==i18nc("@item Text character set", "Hebrew"))
251 return KEncodingProber::Hebrew;
252 else if (lang==i18nc("@item Text character set", "Turkish"))
253 return KEncodingProber::Turkish;
254 else if (lang==i18nc("@item Text character set", "Japanese"))
255 return KEncodingProber::Japanese;
256 else if (lang==i18nc("@item Text character set", "Baltic"))
257 return KEncodingProber::Baltic;
258 else if (lang==i18nc("@item Text character set", "Chinese Traditional"))
259 return KEncodingProber::ChineseTraditional;
260 else if (lang==i18nc("@item Text character set", "Chinese Simplified"))
261 return KEncodingProber::ChineseSimplified;
262 else if (lang==i18nc("@item Text character set", "Arabic"))
263 return KEncodingProber::Arabic;
264
265 return KEncodingProber::Universal;
266}
267
268QString KEncodingProber::nameForProberType(KEncodingProber::ProberType proberType)
269{
270 switch (proberType)
271 {
272 case KEncodingProber::None:
273 return i18nc("@item Text character set", "Disabled");
274 break;
275 case KEncodingProber::Universal:
276 return i18nc("@item Text character set", "Universal");
277 break;
278 case KEncodingProber::Arabic:
279 return i18nc("@item Text character set", "Arabic");
280 break;
281 case KEncodingProber::Baltic:
282 return i18nc("@item Text character set", "Baltic");
283 break;
284 case KEncodingProber::CentralEuropean:
285 return i18nc("@item Text character set", "Central European");
286 break;
287 case KEncodingProber::Cyrillic:
288 return i18nc("@item Text character set", "Cyrillic");
289 break;
290 case KEncodingProber::Greek:
291 return i18nc("@item Text character set", "Greek");
292 break;
293 case KEncodingProber::Hebrew:
294 return i18nc("@item Text character set", "Hebrew");
295 break;
296 case KEncodingProber::Japanese:
297 return i18nc("@item Text character set", "Japanese");
298 break;
299 case KEncodingProber::Turkish:
300 return i18nc("@item Text character set", "Turkish");
301 break;
302 case KEncodingProber::WesternEuropean:
303 return i18nc("@item Text character set", "Western European");
304 break;
305 case KEncodingProber::ChineseTraditional:
306 return i18nc("@item Text character set", "Chinese Traditional");
307 break;
308 case KEncodingProber::ChineseSimplified:
309 return i18nc("@item Text character set", "Chinese Simplified");
310 break;
311 case KEncodingProber::Korean:
312 return i18nc("@item Text character set", "Korean");
313 break;
314 case KEncodingProber::Thai:
315 return i18nc("@item Text character set", "Thai");
316 break;
317 case KEncodingProber::Unicode:
318 return i18nc("@item Text character set", "Unicode");
319 break;
320 default:
321 return QString();
322 }
323}
ChineseGroupProber.h
JapaneseGroupProber.h
UnicodeGroupProber.h
KEncodingProber::reset
void reset()
reset the prober's internal state and data.
Definition: kencodingprober.cpp:151
KEncodingProber::proberType
ProberType proberType() const
Definition: kencodingprober.cpp:221
KEncodingProber::KEncodingProber
KEncodingProber(ProberType proberType=Universal)
Default ProberType is Universal (detects all possible encodings)
Definition: kencodingprober.cpp:141
KEncodingProber::encodingName
const char * encodingName() const
Definition: kencodingprober.cpp:199
KEncodingProber::feed
ProberState feed(const QByteArray &data)
The main class method.
Definition: kencodingprober.cpp:157
KEncodingProber::nameForProberType
static QString nameForProberType(ProberType proberType)
map ProberType to language string
Definition: kencodingprober.cpp:268
KEncodingProber::confidence
float confidence() const
Definition: kencodingprober.cpp:213
KEncodingProber::state
ProberState state() const
Definition: kencodingprober.cpp:192
KEncodingProber::setProberType
void setProberType(ProberType proberType)
change current prober's ProberType and reset the prober
Definition: kencodingprober.cpp:226
KEncodingProber::~KEncodingProber
~KEncodingProber()
Definition: kencodingprober.cpp:146
KEncodingProber::proberTypeForName
static ProberType proberTypeForName(const QString &lang)
Definition: kencodingprober.cpp:232
KEncodingProber::encoding
QByteArray encoding() const
Definition: kencodingprober.cpp:205
KEncodingProber::ProberState
ProberState
Definition: kencodingprober.h:71
KEncodingProber::Probing
@ Probing
Need more data to make a decision.
Definition: kencodingprober.h:74
KEncodingProber::NotMe
@ NotMe
Certain that the data is in none of the encodings supported by the current ProberType.
Definition: kencodingprober.h:73
KEncodingProber::FoundIt
@ FoundIt
Certain that the encoding has been found.
Definition: kencodingprober.h:72
KEncodingProber::ProberType
ProberType
Definition: kencodingprober.h:77
KEncodingProber::None
@ None
Definition: kencodingprober.h:78
KEncodingProber::Arabic
@ Arabic
Definition: kencodingprober.h:80
KEncodingProber::Cyrillic
@ Cyrillic
Definition: kencodingprober.h:85
KEncodingProber::Turkish
@ Turkish
Definition: kencodingprober.h:94
KEncodingProber::ChineseSimplified
@ ChineseSimplified
Definition: kencodingprober.h:83
KEncodingProber::Unicode
@ Unicode
Definition: kencodingprober.h:95
KEncodingProber::Universal
@ Universal
Definition: kencodingprober.h:79
KEncodingProber::Korean
@ Korean
Definition: kencodingprober.h:89
KEncodingProber::WesternEuropean
@ WesternEuropean
Definition: kencodingprober.h:96
KEncodingProber::Greek
@ Greek
Definition: kencodingprober.h:86
KEncodingProber::Thai
@ Thai
Definition: kencodingprober.h:93
KEncodingProber::Hebrew
@ Hebrew
Definition: kencodingprober.h:87
KEncodingProber::ChineseTraditional
@ ChineseTraditional
Definition: kencodingprober.h:84
KEncodingProber::CentralEuropean
@ CentralEuropean
Definition: kencodingprober.h:82
KEncodingProber::Japanese
@ Japanese
Definition: kencodingprober.h:88
KEncodingProber::Other
@ Other
Definition: kencodingprober.h:91
KEncodingProber::SouthEasternEurope
@ SouthEasternEurope
Definition: kencodingprober.h:92
KEncodingProber::Baltic
@ Baltic
Definition: kencodingprober.h:81
KEncodingProber::NorthernSaami
@ NorthernSaami
Definition: kencodingprober.h:90
QString
kencodingprober::ChineseGroupProber
Definition: ChineseGroupProber.h:34
kencodingprober::JapaneseGroupProber
Definition: JapaneseGroupProber.h:36
kencodingprober::UnicodeGroupProber
Definition: UnicodeGroupProber.h:34
kencodingprober::nsCharSetProber
Definition: nsCharSetProber.h:42
kencodingprober::nsMBCSGroupProber
Definition: nsMBCSGroupProber.h:38
kencodingprober::nsSBCSGroupProber
Definition: nsSBCSGroupProber.h:47
kencodingprober::nsUniversalDetector
Definition: nsUniversalDetector.h:40
kencodingprober.h
klocale.h
i18nc
QString i18nc(const char *ctxt, const char *text)
Returns a localized version of a string and a context.
Definition: klocalizedstring.h:797
kencodingprober::eNotMe
@ eNotMe
Definition: nsCharSetProber.h:37
kencodingprober::eFoundIt
@ eFoundIt
Definition: nsCharSetProber.h:36
nsCharSetProber.h
nsMBCSGroupProber.h
nsSBCSGroupProber.h
nsUniversalDetector.h
This file is part of the KDE documentation.
Documentation copyright © 1996-2023 The KDE developers.
Generated on Mon Feb 20 2023 00:00:00 by doxygen 1.9.6 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.

KDECore

Skip menu "KDECore"
  • Main Page
  • Namespace List
  • Namespace Members
  • Alphabetical List
  • Class List
  • Class Hierarchy
  • Class Members
  • File List
  • File Members
  • Modules
  • Related Pages

kdelibs-4.14.38 API Reference

Skip menu "kdelibs-4.14.38 API Reference"
  • DNSSD
  • Interfaces
  •   KHexEdit
  •   KMediaPlayer
  •   KSpeech
  •   KTextEditor
  • kconf_update
  • KDE3Support
  •   KUnitTest
  • KDECore
  • KDED
  • KDEsu
  • KDEUI
  • KDEWebKit
  • KDocTools
  • KFile
  • KHTML
  • KImgIO
  • KInit
  • kio
  • KIOSlave
  • KJS
  •   KJS-API
  •   WTF
  • kjsembed
  • KNewStuff
  • KParts
  • KPty
  • Kross
  • KUnitConversion
  • KUtils
  • Nepomuk
  • Plasma
  • Solid
  • Sonnet
  • ThreadWeaver
Report problems with this website to our bug tracking system.
Contact the specific authors with questions and comments about the page contents.

KDE® and the K Desktop Environment® logo are registered trademarks of KDE e.V. | Legal