• Skip to content
  • Skip to link menu
  • KDE API Reference
  • kdelibs-4.14.38 API Reference
  • KDE Home
  • Contact Us
 

KDECore

  • kdecore
  • localization
kencodingdetector.cpp
Go to the documentation of this file.
1/*
2 This file is part of the KDE libraries
3
4 Copyright (C) 1999 Lars Knoll (knoll@kde.org)
5 Copyright (C) 2003 Dirk Mueller (mueller@kde.org)
6 Copyright (C) 2003 Apple Computer, Inc.
7 Copyright (C) 2007 Nick Shaforostoff (shafff@ukr.net)
8
9 This library is free software; you can redistribute it and/or
10 modify it under the terms of the GNU Library General Public
11 License as published by the Free Software Foundation; either
12 version 2 of the License, or (at your option) any later version.
13
14 This library is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 Library General Public License for more details.
18
19 You should have received a copy of the GNU Library General Public License
20 along with this library; see the file COPYING.LIB. If not, write to
21 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
22 Boston, MA 02110-1301, USA.
23*/
24//----------------------------------------------------------------------------
25//
26// decoder for input stream
27
28#include "kencodingdetector.h"
29
30#undef DECODE_DEBUG
31//#define DECODE_DEBUG
32
33#define MAX_BUFFER 16*1024
34
35#include <assert.h>
36
37#include "guess_ja_p.h"
38
39#include <QRegExp>
40#include <QTextCodec>
41
42#include <kglobal.h>
43#include <kcharsets.h>
44#include <kdebug.h>
45#include <klocale.h>
46
47#include <ctype.h>
48
// MIB numbers of the character sets this file needs to single out.
// They are compared against the value returned by QTextCodec::mibEnum().
enum MIB
{
    MibLatin1  = 4,
    Mib8859_8  = 85,
    MibUtf8    = 106,
    MibUcs2    = 1000,
    MibUtf16BE = 1013,
    MibUtf16LE = 1014,
    MibUtf16   = 1015
};
59
60static bool is16Bit(QTextCodec* codec)
61{
62 switch (codec->mibEnum())
63 {
64 case MibUtf16:
65 case MibUtf16BE:
66 case MibUtf16LE:
67 case MibUcs2:
68 return true;
69 default:
70 return false;
71 }
72}
73
74class KEncodingDetectorPrivate
75{
76public:
77 QTextCodec *m_codec;
78 QTextDecoder *m_decoder; // utf16
79 QTextCodec *m_defaultCodec;
80 QByteArray m_storeDecoderName;
81
82 KEncodingDetector::EncodingChoiceSource m_source;
83 KEncodingDetector::AutoDetectScript m_autoDetectLanguage;
84
85 bool m_visualRTL : 1;
86 bool m_seenBody : 1;
87 bool m_writtingHappened : 1;
88 bool m_analyzeCalled : 1; //for decode()
89 int m_multiByte;
90
91 QByteArray m_bufferForDefferedEncDetection;
92
93 KEncodingDetectorPrivate()
94 : m_codec(QTextCodec::codecForMib(MibLatin1))
95 , m_decoder(m_codec->makeDecoder())
96 , m_defaultCodec(m_codec)
97 , m_source(KEncodingDetector::DefaultEncoding)
98 , m_autoDetectLanguage(KEncodingDetector::SemiautomaticDetection)
99 , m_visualRTL(false)
100 , m_seenBody(false)
101 , m_writtingHappened(false)
102 , m_analyzeCalled(false)
103 , m_multiByte(0)
104 {
105 }
106
107 KEncodingDetectorPrivate(QTextCodec* codec,KEncodingDetector::EncodingChoiceSource source, KEncodingDetector::AutoDetectScript script)
108 : m_codec(codec)
109 , m_decoder(m_codec->makeDecoder())
110 , m_defaultCodec(m_codec)
111 , m_source(source)
112 , m_autoDetectLanguage(script)
113 , m_visualRTL(false)
114 , m_seenBody(false)
115 , m_writtingHappened(false)
116 , m_analyzeCalled(false)
117 , m_multiByte(0)
118 {
119 }
120
121 ~KEncodingDetectorPrivate()
122 {
123 delete m_decoder;
124 }
125
126 // Returns true if the encoding was explicitly specified someplace.
127 bool isExplicitlySpecifiedEncoding()
128 {
129 return m_source != KEncodingDetector::DefaultEncoding && m_source != KEncodingDetector::AutoDetectedEncoding;
130 }
131};
132
133
134static QByteArray automaticDetectionForArabic( const unsigned char* ptr, int size )
135{
136 for ( int i = 0; i < size; ++i ) {
137 if ( ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F ) || ptr[ i ] == 0xA1 || ptr[ i ] == 0xA2 || ptr[ i ] == 0xA3
138 || ( ptr[ i ] >= 0xA5 && ptr[ i ] <= 0xAB ) || ( ptr[ i ] >= 0xAE && ptr[ i ] <= 0xBA )
139 || ptr[ i ] == 0xBC || ptr[ i ] == 0xBD || ptr[ i ] == 0xBE || ptr[ i ] == 0xC0
140 || ( ptr[ i ] >= 0xDB && ptr[ i ] <= 0xDF ) || ( ptr[ i ] >= 0xF3 ) ) {
141 return "cp1256";
142 }
143 }
144
145 return "iso-8859-6";
146}
147
148static QByteArray automaticDetectionForBaltic( const unsigned char* ptr, int size )
149{
150 for ( int i = 0; i < size; ++i ) {
151 if ( ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9E ) )
152 return "cp1257";
153
154 if ( ptr[ i ] == 0xA1 || ptr[ i ] == 0xA5 )
155 return "iso-8859-13";
156 }
157
158 return "iso-8859-13";
159}
160
161static QByteArray automaticDetectionForCentralEuropean(const unsigned char* ptr, int size )
162{
163 QByteArray charset = QByteArray();
164 for ( int i = 0; i < size; ++i ) {
165 if ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F ) {
166 if ( ptr[ i ] == 0x81 || ptr[ i ] == 0x83 || ptr[ i ] == 0x90 || ptr[ i ] == 0x98 )
167 return "ibm852";
168
169 if ( i + 1 > size )
170 return "cp1250";
171 else { // maybe ibm852 ?
172 charset = "cp1250";
173 continue;
174 }
175 }
176 if ( ptr[ i ] == 0xA5 || ptr[ i ] == 0xAE || ptr[ i ] == 0xBE || ptr[ i ] == 0xC3 || ptr[ i ] == 0xD0 || ptr[ i ] == 0xE3 || ptr[ i ] == 0xF0 ) {
177 if ( i + 1 > size )
178 return "iso-8859-2";
179 else { // maybe ibm852 ?
180 if ( charset.isNull() )
181 charset = "iso-8859-2";
182 continue;
183 }
184 }
185 }
186
187 if ( charset.isNull() )
188 charset = "iso-8859-3";
189
190 return charset.data();
191}
192
193static QByteArray automaticDetectionForCyrillic( const unsigned char* ptr, int size)
194{
195#ifdef DECODE_DEBUG
196 kWarning() << "KEncodingDetector: Cyr heuristics";
197#endif
198
199// if (ptr[0]==0xef && ptr[1]==0xbb && ptr[2]==0xbf)
200// return "utf8";
201 int utf8_mark=0;
202 int koi_score=0;
203 int cp1251_score=0;
204
205 int koi_st=0;
206 int cp1251_st=0;
207
208// int koi_na=0;
209// int cp1251_na=0;
210
211 int koi_o_capital=0;
212 int koi_o=0;
213 int cp1251_o_capital=0;
214 int cp1251_o=0;
215
216 int koi_a_capital=0;
217 int koi_a=0;
218 int cp1251_a_capital=0;
219 int cp1251_a=0;
220
221 int koi_s_capital=0;
222 int koi_s=0;
223 int cp1251_s_capital=0;
224 int cp1251_s=0;
225
226 int koi_i_capital=0;
227 int koi_i=0;
228 int cp1251_i_capital=0;
229 int cp1251_i=0;
230
231 int cp1251_small_range=0;
232 int koi_small_range=0;
233 int ibm866_small_range=0;
234
235 int i;
236 for (i=1; (i<size) && (cp1251_small_range+koi_small_range<1000) ;++i)
237 {
238 if (ptr[i]>0xdf)
239 {
240 ++cp1251_small_range;
241
242 if (ptr[i]==0xee)//small o
243 ++cp1251_o;
244 else if (ptr[i]==0xe0)//small a
245 ++cp1251_a;
246 else if (ptr[i]==0xe8)//small i
247 ++cp1251_i;
248 else if (ptr[i]==0xf1)//small s
249 ++cp1251_s;
250 else if (ptr[i]==0xf2 && ptr[i-1]==0xf1)//small st
251 ++cp1251_st;
252
253 else if (ptr[i]==0xef)
254 ++koi_o_capital;
255 else if (ptr[i]==0xe1)
256 ++koi_a_capital;
257 else if (ptr[i]==0xe9)
258 ++koi_i_capital;
259 else if (ptr[i]==0xf3)
260 ++koi_s_capital;
261
262 }
263 else if (ptr[i]>0xbf)
264 {
265 ++koi_small_range;
266
267 if (ptr[i]==0xd0||ptr[i]==0xd1)//small o
268 ++utf8_mark;
269 else if (ptr[i]==0xcf)//small o
270 ++koi_o;
271 else if (ptr[i]==0xc1)//small a
272 ++koi_a;
273 else if (ptr[i]==0xc9)//small i
274 ++koi_i;
275 else if (ptr[i]==0xd3)//small s
276 ++koi_s;
277 else if (ptr[i]==0xd4 && ptr[i-1]==0xd3)//small st
278 ++koi_st;
279
280 else if (ptr[i]==0xce)
281 ++cp1251_o_capital;
282 else if (ptr[i]==0xc0)
283 ++cp1251_a_capital;
284 else if (ptr[i]==0xc8)
285 ++cp1251_i_capital;
286 else if (ptr[i]==0xd1)
287 ++cp1251_s_capital;
288 }
289 else if (ptr[i]>0x9f && ptr[i]<0xb0) //first 16 letterz is 60%
290 ++ibm866_small_range;
291
292 }
293
294 //cannot decide?
295 if (cp1251_small_range+koi_small_range+ibm866_small_range<8)
296 {
297 return "";
298 }
299
300 if (3*utf8_mark>cp1251_small_range+koi_small_range+ibm866_small_range)
301 {
302#ifdef DECODE_DEBUG
303 kWarning() << "Cyr Enc Detection: UTF8";
304#endif
305 return "UTF-8";
306 }
307
308 if (ibm866_small_range>cp1251_small_range+koi_small_range)
309 return "ibm866";
310
311// QByteArray koi_string = "koi8-u";
312// QByteArray cp1251_string = "cp1251";
313
314 if (cp1251_st==0 && koi_st>1)
315 koi_score+=10;
316 else if (koi_st==0 && cp1251_st>1)
317 cp1251_score+=10;
318
319 if (cp1251_st && koi_st)
320 {
321 if (cp1251_st/koi_st>2)
322 cp1251_score+=20;
323 else if (koi_st/cp1251_st>2)
324 koi_score+=20;
325 }
326
327 if (cp1251_a>koi_a)
328 cp1251_score+=10;
329 else if (cp1251_a || koi_a)
330 koi_score+=10;
331
332 if (cp1251_o>koi_o)
333 cp1251_score+=10;
334 else if (cp1251_o || koi_o)
335 koi_score+=10;
336
337 if (cp1251_i>koi_i)
338 cp1251_score+=10;
339 else if (cp1251_i || koi_i)
340 koi_score+=10;
341
342 if (cp1251_s>koi_s)
343 cp1251_score+=10;
344 else if (cp1251_s || koi_s)
345 koi_score+=10;
346
347 if (cp1251_a_capital>koi_a_capital)
348 cp1251_score+=9;
349 else if (cp1251_a_capital || koi_a_capital)
350 koi_score+=9;
351
352 if (cp1251_o_capital>koi_o_capital)
353 cp1251_score+=9;
354 else if (cp1251_o_capital || koi_o_capital)
355 koi_score+=9;
356
357 if (cp1251_i_capital>koi_i_capital)
358 cp1251_score+=9;
359 else if (cp1251_i_capital || koi_i_capital)
360 koi_score+=9;
361
362 if (cp1251_s_capital>koi_s_capital)
363 cp1251_score+=9;
364 else if (cp1251_s_capital || koi_s_capital)
365 koi_score+=9;
366#ifdef DECODE_DEBUG
367 kWarning()<<"koi_score " << koi_score << " cp1251_score " << cp1251_score;
368#endif
369 if (abs(koi_score-cp1251_score)<10)
370 {
371 //fallback...
372 cp1251_score=cp1251_small_range;
373 koi_score=koi_small_range;
374 }
375 if (cp1251_score>koi_score)
376 return "cp1251";
377 else
378 return "koi8-u";
379
380
381// if (cp1251_score>koi_score)
382// setEncoding("cp1251",AutoDetectedEncoding);
383// else
384// setEncoding("koi8-u",AutoDetectedEncoding);
385// return true;
386
387}
388
389static QByteArray automaticDetectionForGreek( const unsigned char* ptr, int size )
390{
391 for ( int i = 0; i < size; ++i ) {
392 if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x87 ) || ptr[ i ] == 0x89 || ptr[ i ] == 0x8B
393 || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x97 ) || ptr[ i ] == 0x99 || ptr[ i ] == 0x9B || ptr[ i ] == 0xA4
394 || ptr[ i ] == 0xA5 || ptr[ i ] == 0xAE ) {
395 return "cp1253";
396 }
397 }
398
399 return "iso-8859-7";
400}
401
402static QByteArray automaticDetectionForHebrew( const unsigned char* ptr, int size )
403{
404 for ( int i = 0; i < size; ++i ) {
405 if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x89 ) || ptr[ i ] == 0x8B
406 || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x99 ) || ptr[ i ] == 0x9B || ptr[ i ] == 0xA1 || ( ptr[ i ] >= 0xBF && ptr[ i ] <= 0xC9 )
407 || ( ptr[ i ] >= 0xCB && ptr[ i ] <= 0xD8 ) ) {
408 return "cp1255";
409 }
410
411 if ( ptr[ i ] == 0xDF )
412 return "iso-8859-8-i";
413 }
414
415 return "iso-8859-8-i";
416}
417
418static QByteArray automaticDetectionForJapanese( const unsigned char* ptr, int size )
419{
420 JapaneseCode kc;
421
422 switch ( kc.guess_jp( (const char*)ptr, size ) ) {
423 case JapaneseCode::JIS:
424 return "jis7";
425 case JapaneseCode::EUC:
426 return "eucjp";
427 case JapaneseCode::SJIS:
428 return "sjis";
429 case JapaneseCode::UTF8:
430 return "utf8";
431 default:
432 break;
433 }
434
435 return "";
436}
437
438static QByteArray automaticDetectionForTurkish( const unsigned char* ptr, int size )
439{
440 for ( int i = 0; i < size; ++i ) {
441 if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x8C ) || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x9C ) || ptr[ i ] == 0x9F ) {
442 return "cp1254";
443 }
444 }
445
446 return "iso-8859-9";
447}
448
449static QByteArray automaticDetectionForWesternEuropean( const unsigned char* ptr, int size )
450{
451 --size;
452 uint nonansi_count=0;
453 for (int i=0; i<size; ++i)
454 {
455 if (ptr[i]>0x79)
456 {
457 ++nonansi_count;
458 if ( ptr[i]>0xc1 && ptr[i]<0xf0 && ptr[i+1]>0x7f && ptr[i+1]<0xc0)
459 {
460 return "UTF-8";
461 }
462 if (ptr[i] >= 0x78 && ptr[i]<=0x9F )
463 {
464 return "cp1252";
465 }
466 }
467
468 }
469
470 if (nonansi_count>0)
471 return "iso-8859-15";
472
473 return "";
474}
475
// Other browsers allow comments in the head section, so we need to also.
// It's important not to look for tags inside the comments.
//
// On entry @p ptr points just behind the opening "<!--". On return it points
// just behind the end of the comment, or equals @p pEnd when the comment is
// not terminated inside [ptr, pEnd). Accepted terminators:
//   ">"     directly at @p ptr, i.e. the degenerate comment "<!-->"
//   "-->"   the regular end of a comment
//   "--!>"  the incorrect end of comment that other browsers allow
//
// No byte at or beyond @p pEnd is read, and @p ptr is never moved past
// @p pEnd. If @p ptr is already at (or behind) @p pEnd it is left untouched.
static void skipComment(const char *&ptr, const char *pEnd)
{
    const char *p = ptr;
    if (p >= pEnd)
        return;

    // Allow <!-->; other browsers do.
    if (*p == '>')
    {
        ptr = p + 1;
        return;
    }

    while (p < pEnd)
    {
        if (*p == '-')
        {
            // This is the real end of comment, "-->".
            if (pEnd - p >= 3 && p[1] == '-' && p[2] == '>')
            {
                p += 3;
                break;
            }
            // This is the incorrect end of comment that other browsers allow, "--!>".
            if (pEnd - p >= 4 && p[1] == '-' && p[2] == '!' && p[3] == '>')
            {
                p += 4;
                break;
            }
        }
        ++p;
    }
    ptr = p;
}
510
511// Returns the position of the encoding string.
512static int findXMLEncoding(const QByteArray &str, int &encodingLength)
513{
514 int len = str.length();
515 int pos = str.indexOf("encoding");
516 if (pos == -1)
517 return -1;
518 pos += 8;
519
520 // Skip spaces and stray control characters.
521 while (pos<len && str[pos]<=' ')
522 ++pos;
523
524 //Bail out if nothing after
525 // Skip equals sign.
526 if (pos>=len || str[pos] != '=')
527 return -1;
528 ++pos;
529
530 // Skip spaces and stray control characters.
531 while (pos<len && str[pos]<=' ')
532 ++pos;
533
534 //Bail out if nothing after
535 if (pos >= len)
536 return -1;
537
538 // Skip quotation mark.
539 char quoteMark = str[pos];
540 if (quoteMark != '"' && quoteMark != '\'')
541 return -1;
542 ++pos;
543
544 // Find the trailing quotation mark.
545 int end=pos;
546 while (end<len && str[end]!=quoteMark)
547 ++end;
548
549 if (end>=len)
550 return -1;
551
552 encodingLength = end-pos;
553 return pos;
554}
555
556bool KEncodingDetector::processNull(char *data, int len)
557{
558 bool bin=false;
559 if(is16Bit(d->m_codec))
560 {
561 for (int i=1; i < len; i+=2)
562 {
563 if ((data[i]=='\0') && (data[i-1]=='\0'))
564 {
565 bin=true;
566 data[i]=' ';
567 }
568 }
569 return bin;
570 }
571 // replace '\0' by spaces, for buggy pages
572 int i = len-1;
573 while(--i>=0)
574 {
575 if(data[i]==0)
576 {
577 bin=true;
578 data[i]=' ';
579 }
580 }
581 return bin;
582}
583
584
585bool KEncodingDetector::errorsIfUtf8 (const char* data, int length)
586{
587 if (d->m_codec->mibEnum()!=MibUtf8)
588 return false; //means no errors
589// #define highest1Bits (unsigned char)0x80
590// #define highest2Bits (unsigned char)0xC0
591// #define highest3Bits (unsigned char)0xE0
592// #define highest4Bits (unsigned char)0xF0
593// #define highest5Bits (unsigned char)0xF8
594static const unsigned char highest1Bits = 0x80;
595static const unsigned char highest2Bits = 0xC0;
596static const unsigned char highest3Bits = 0xE0;
597static const unsigned char highest4Bits = 0xF0;
598static const unsigned char highest5Bits = 0xF8;
599
600 for (int i=0; i<length; ++i)
601 {
602 unsigned char c = data[i];
603
604 if (d->m_multiByte>0)
605 {
606 if ((c & highest2Bits) == 0x80)
607 {
608 --(d->m_multiByte);
609 continue;
610 }
611#ifdef DECODE_DEBUG
612 kWarning() << "EncDetector: Broken UTF8";
613#endif
614 return true;
615 }
616
617 // most significant bit zero, single char
618 if ((c & highest1Bits) == 0x00)
619 continue;
620
621 // 110xxxxx => init 1 following bytes
622 if ((c & highest3Bits) == 0xC0)
623 {
624 d->m_multiByte = 1;
625 continue;
626 }
627
628 // 1110xxxx => init 2 following bytes
629 if ((c & highest4Bits) == 0xE0)
630 {
631 d->m_multiByte = 2;
632 continue;
633 }
634
635 // 11110xxx => init 3 following bytes
636 if ((c & highest5Bits) == 0xF0)
637 {
638 d->m_multiByte = 3;
639 continue;
640 }
641#ifdef DECODE_DEBUG
642 kWarning() << "EncDetector:_Broken UTF8";
643#endif
644 return true;
645 }
646 return false;
647}
648
649
650KEncodingDetector::KEncodingDetector() : d(new KEncodingDetectorPrivate)
651{
652}
653
654KEncodingDetector::KEncodingDetector(QTextCodec* codec, EncodingChoiceSource source, AutoDetectScript script) :
655 d(new KEncodingDetectorPrivate(codec,source,script))
656{
657}
658
659KEncodingDetector::~KEncodingDetector()
660{
661 delete d;
662}
663
664void KEncodingDetector::setAutoDetectLanguage( KEncodingDetector::AutoDetectScript lang)
665{
666 d->m_autoDetectLanguage=lang;
667}
668KEncodingDetector::AutoDetectScript KEncodingDetector::autoDetectLanguage() const
669{
670 return d->m_autoDetectLanguage;
671}
672
673KEncodingDetector::EncodingChoiceSource KEncodingDetector::encodingChoiceSource() const
674{
675 return d->m_source;
676}
677
678const char* KEncodingDetector::encoding() const
679{
680 d->m_storeDecoderName = d->m_codec->name();
681 return d->m_storeDecoderName.constData();
682}
683
684bool KEncodingDetector::visuallyOrdered() const
685{
686 return d->m_visualRTL;
687}
688
689// const QTextCodec* KEncodingDetector::codec() const
690// {
691// return d->m_codec;
692// }
693
694QTextDecoder* KEncodingDetector::decoder()
695{
696 return d->m_decoder;
697}
698
699void KEncodingDetector::resetDecoder()
700{
701 assert(d->m_defaultCodec);
702 d->m_bufferForDefferedEncDetection.clear();
703 d->m_writtingHappened = false;
704 d->m_analyzeCalled = false;
705 d->m_multiByte = 0;
706 delete d->m_decoder;
707 if (!d->m_codec)
708 d->m_codec = d->m_defaultCodec;
709 d->m_decoder = d->m_codec->makeDecoder();
710}
711
712bool KEncodingDetector::setEncoding(const char *_encoding, EncodingChoiceSource type)
713{
714 QTextCodec *codec;
715 QByteArray enc(_encoding);
716 if(/*enc.isNull() || */enc.isEmpty())
717 {
718 if (type==DefaultEncoding)
719 codec=d->m_defaultCodec;
720 else
721 return false;
722 }
723 else
724 {
725 //QString->QTextCodec
726
727 enc = enc.toLower();
728 // hebrew visually ordered
729 if(enc=="visual")
730 enc="iso8859-8";
731 bool b;
732 codec = KGlobal::charsets()->codecForName(QLatin1String(enc), b);
733 if (!b)
734 return false;
735 }
736
737 if (d->m_codec->mibEnum()==codec->mibEnum())
738 {
739 // We already have the codec, but we still want to re-set the type,
740 // as we may have overwritten a default with a detected
741 d->m_source = type;
742 return true;
743 }
744
745 if ((type==EncodingFromMetaTag || type==EncodingFromXMLHeader) && is16Bit(codec))
746 {
747 //Sometimes the codec specified is absurd, i.e. UTF-16 despite
748 //us decoding a meta tag as ASCII. In that case, ignore it.
749 return false;
750 }
751
752 if (codec->mibEnum() == Mib8859_8)
753 {
754 //We do NOT want to use Qt's QHebrewCodec, since it tries to reorder itself.
755 codec = QTextCodec::codecForName("iso8859-8-i");
756
757 // visually ordered unless one of the following
758 if(!(enc=="iso-8859-8-i"||enc=="iso_8859-8-i"||enc=="csiso88598i"||enc=="logical"))
759 d->m_visualRTL = true;
760 }
761
762 d->m_codec = codec;
763 d->m_source = type;
764 delete d->m_decoder;
765 d->m_decoder = d->m_codec->makeDecoder();
766#ifdef DECODE_DEBUG
767 kDebug(6005) << "KEncodingDetector::encoding used is" << d->m_codec->name();
768#endif
769 return true;
770}
771
772QString KEncodingDetector::decode(const char *data, int len)
773{
774 processNull(const_cast<char *>(data),len);
775 if (!d->m_analyzeCalled)
776 {
777 analyze(data,len);
778 d->m_analyzeCalled=true;
779 }
780
781 return d->m_decoder->toUnicode(data,len);
782}
783
784QString KEncodingDetector::decode(const QByteArray &data)
785{
786 processNull(const_cast<char *>(data.data()),data.size());
787 if (!d->m_analyzeCalled)
788 {
789 analyze(data.data(),data.size());
790 d->m_analyzeCalled=true;
791 }
792
793 return d->m_decoder->toUnicode(data);
794}
795
796QString KEncodingDetector::decodeWithBuffering(const char *data, int len)
797{
798#ifdef DECODE_DEBUG
799 kWarning() << "KEncodingDetector: decoding "<<len<<" bytes";
800#endif
801 if (d->m_writtingHappened)
802 {
803#ifdef DECODE_DEBUG
804 kWarning() << "KEncodingDetector: d->m_writtingHappened "<< d->m_codec->name();
805#endif
806 processNull(const_cast<char *>(data),len);
807 return d->m_decoder->toUnicode(data, len);
808 }
809 else
810 {
811 if (d->m_bufferForDefferedEncDetection.isEmpty())
812 {
813 // If encoding detection produced something, and we either got to the body or
814 // actually saw the encoding explicitly, we're done.
815 if (analyze(data,len) && (d->m_seenBody || d->isExplicitlySpecifiedEncoding()))
816 {
817#ifdef DECODE_DEBUG
818 kWarning() << "KEncodingDetector: m_writtingHappened first time "<< d->m_codec->name();
819#endif
820 processNull(const_cast<char *>(data),len);
821 d->m_writtingHappened=true;
822 return d->m_decoder->toUnicode(data, len);
823 }
824 else
825 {
826#ifdef DECODE_DEBUG
827 kWarning() << "KEncodingDetector: begin deffer";
828#endif
829 d->m_bufferForDefferedEncDetection=data;
830 }
831 }
832 else
833 {
834 d->m_bufferForDefferedEncDetection+=data;
835 // As above, but also limit the buffer size. We must use the entire buffer here,
836 // since the boundaries might split the meta tag, etc.
837 bool detected = analyze(d->m_bufferForDefferedEncDetection.constData(), d->m_bufferForDefferedEncDetection.length());
838 if ((detected && (d->m_seenBody || d->isExplicitlySpecifiedEncoding())) ||
839 d->m_bufferForDefferedEncDetection.length() > MAX_BUFFER)
840 {
841 d->m_writtingHappened=true;
842 d->m_bufferForDefferedEncDetection.replace('\0',' ');
843 QString result(d->m_decoder->toUnicode(d->m_bufferForDefferedEncDetection));
844 d->m_bufferForDefferedEncDetection.clear();
845#ifdef DECODE_DEBUG
846 kWarning() << "KEncodingDetector: m_writtingHappened in the middle " << d->m_codec->name();
847#endif
848 return result;
849 }
850 }
851 }
852
853 return QString();
854}
855
856bool KEncodingDetector::decodedInvalidCharacters() const
857{
858 return d->m_decoder ? d->m_decoder->hasFailure() : false;
859}
860
861QString KEncodingDetector::flush()
862{
863 if (d->m_bufferForDefferedEncDetection.isEmpty())
864 return QString();
865
866 d->m_bufferForDefferedEncDetection.replace('\0',' ');
867 QString result(d->m_decoder->toUnicode(d->m_bufferForDefferedEncDetection));
868 d->m_bufferForDefferedEncDetection.clear();
869#ifdef DECODE_DEBUG
870 kWarning() << "KEncodingDetector:flush() "<< d->m_bufferForDefferedEncDetection.length()<<" bytes "<< d->m_codec->name();
871#endif
872 return result;
873}
874
875bool KEncodingDetector::analyze(const char *data, int len)
876{
877 // Check for UTF-16 or UTF-8 BOM mark at the beginning, which is a sure sign of a Unicode encoding.
878 // maximumBOMLength = 10
879 // Even if the user has chosen utf16 we still need to auto-detect the endianness
880 if (len >= 10 && ((d->m_source != UserChosenEncoding) || is16Bit(d->m_codec)))
881 {
882 // Extract the first three bytes.
883 const uchar *udata = (const uchar *)data;
884 uchar c1 = *udata++;
885 uchar c2 = *udata++;
886 uchar c3 = *udata++;
887
888 // Check for the BOM
889 const char *autoDetectedEncoding;
890 if ((c1 == 0xFE && c2 == 0xFF) || (c1 == 0xFF && c2 == 0xFE))
891 {
892 autoDetectedEncoding = "UTF-16";
893 }
894 else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF)
895 {
896 autoDetectedEncoding = "UTF-8";
897 }
898 else if (c1 == 0x00 || c2 == 0x00)
899 {
900 uchar c4 = *udata++;
901 uchar c5 = *udata++;
902 uchar c6 = *udata++;
903 uchar c7 = *udata++;
904 uchar c8 = *udata++;
905 uchar c9 = *udata++;
906 uchar c10 = *udata++;
907
908 int nul_count_even = (c2 != 0) + (c4 != 0) + (c6 != 0) + (c8 != 0) + (c10 != 0);
909 int nul_count_odd = (c1 != 0) + (c3 != 0) + (c5 != 0) + (c7 != 0) + (c9 != 0);
910 if ((nul_count_even==0 && nul_count_odd==5) || (nul_count_even==5 && nul_count_odd==0))
911 autoDetectedEncoding = "UTF-16";
912 else
913 autoDetectedEncoding = 0;
914 }
915 else
916 {
917 autoDetectedEncoding = 0;
918 }
919
920 // If we found a BOM, use the encoding it implies.
921 if (autoDetectedEncoding != 0)
922 {
923 d->m_source = BOM;
924 d->m_codec = QTextCodec::codecForName(autoDetectedEncoding);
925 assert(d->m_codec);
926 //enc = d->m_codec->name();
927 delete d->m_decoder;
928 d->m_decoder = d->m_codec->makeDecoder();
929#ifdef DECODE_DEBUG
930 kWarning() << "Detection by BOM";
931#endif
932 if (is16Bit(d->m_codec) && c2==0x00)
933 {
934 // utf16LE, we need to put the decoder in LE mode
935 char reverseUtf16[3] = {(char)0xFF, (char)0xFE, 0x00};
936 d->m_decoder->toUnicode(reverseUtf16, 2);
937 }
938 return true;
939 }
940 }
941
942 //exit from routine in case it was called to only detect byte order for utf-16
943 if (d->m_source==UserChosenEncoding)
944 {
945#ifdef DECODE_DEBUG
946 kWarning() << "KEncodingDetector: UserChosenEncoding exit ";
947#endif
948
949 if (errorsIfUtf8(data, len))
950 setEncoding("",DefaultEncoding);
951 return true;
952 }
953
954 // HTTP header takes precedence over meta-type stuff
955 if (d->m_source==EncodingFromHTTPHeader)
956 return true;
957
958 if (!d->m_seenBody)
959 {
960 // we still don't have an encoding, and are in the head
961 // the following tags are allowed in <head>:
962 // SCRIPT|STYLE|META|LINK|OBJECT|TITLE|BASE
963 const char *ptr = data;
964 const char *pEnd = data+len;
965
966 while(ptr != pEnd)
967 {
968 if(*ptr!='<')
969 {
970 ++ptr;
971 continue;
972 }
973 ++ptr;
974 // Handle comments.
975 if (ptr[0] == '!' && ptr[1] == '-' && ptr[2] == '-')
976 {
977 ptr += 3;
978 skipComment(ptr, pEnd);
979 continue;
980 }
981
982 // Handle XML header, which can have encoding in it.
983 if (ptr[0]=='?' && ptr[1]=='x' && ptr[2]=='m' && ptr[3]=='l')
984 {
985 const char *end = ptr;
986 while (*end != '>' && end < pEnd)
987 end++;
988 if (*end == '\0' || end == pEnd)
989 break;
990 QByteArray str(ptr, end - ptr); // qbytearray provides the \0 terminator
991 int length;
992 int pos = findXMLEncoding(str, length);
993 // also handles the case when specified encoding aint correct
994 if (pos!=-1 && setEncoding(str.mid(pos, length), EncodingFromXMLHeader))
995 {
996 return true;
997 }
998 }
999
1000 //look for <meta>, stop if we reach <body>
1001 while (
1002 !(((*ptr >= 'a') && (*ptr <= 'z')) ||
1003 ((*ptr >= 'A') && (*ptr <= 'Z')))
1004 && ptr < pEnd
1005 )
1006 ++ptr;
1007
1008 char tmp[5];
1009 int length=0;
1010 const char* max=ptr+4;
1011 if (pEnd<max)
1012 max=pEnd;
1013 while (
1014 (((*ptr >= 'a') && (*ptr <= 'z')) ||
1015 ((*ptr >= 'A') && (*ptr <= 'Z')) ||
1016 ((*ptr >= '0') && (*ptr <= '9')))
1017 && ptr < max
1018 )
1019 {
1020 tmp[length] = tolower( *ptr );
1021 ++ptr;
1022 ++length;
1023 }
1024 tmp[length] = 0;
1025 if (tmp[0]=='m'&&tmp[1]=='e'&&tmp[2]=='t'&&tmp[3]=='a')
1026 {
1027 // found a meta tag...
1028 const char* end = ptr;
1029 while(*end != '>' && *end != '\0' && end<pEnd)
1030 end++;
1031 //if ( *end == '\0' ) break;
1032 QByteArray str( ptr, (end-ptr)+1);
1033 str = str.toLower();
1034 const int strLength = str.length();
1035 int pos=0;
1036 //if( (pos = str.find("http-equiv", pos)) == -1) break;
1037 //if( (pos = str.find("content-type", pos)) == -1) break;
1038 if( (pos = str.indexOf("charset")) == -1)
1039 continue;
1040 pos+=6;
1041 // skip to '='
1042 if( (pos = str.indexOf("=", pos)) == -1)
1043 continue;
1044
1045 // skip '='
1046 ++pos;
1047
1048 // skip whitespace before encoding itself
1049 while (pos < strLength && str[pos] <= ' ')
1050 ++pos;
1051
1052 // there may also be an opening quote, if this is a charset= and not a http-equiv.
1053 if (pos < strLength && (str[pos] == '"' || str[pos] == '\''))
1054 ++pos;
1055
1056 // skip whitespace
1057 while (pos < strLength && str[pos] <= ' ')
1058 ++pos;
1059
1060 if ( pos == strLength)
1061 continue;
1062
1063 int endpos = pos;
1064 while( endpos < strLength &&
1065 (str[endpos] != ' ' && str[endpos] != '"' && str[endpos] != '\''
1066 && str[endpos] != ';' && str[endpos] != '>') )
1067 ++endpos;
1068 #ifdef DECODE_DEBUG
1069 kDebug( 6005 ) << "KEncodingDetector: found charset in <meta>: " << str.mid(pos,endpos-pos).data();
1070 #endif
1071 if (setEncoding(str.mid(pos,endpos-pos), EncodingFromMetaTag))
1072 return true;
1073 }
1074 else if (tmp[0]=='b'&&tmp[1]=='o'&&tmp[2]=='d'&&tmp[3]=='y')
1075 {
1076 d->m_seenBody=true;
1077 break;
1078 }
1079 }
1080 }
1081
1082 if (len<20)
1083 return false;
1084
1085#ifdef DECODE_DEBUG
1086 kDebug( 6005 ) << "KEncodingDetector: using heuristics (" << strlen(data) << ")";
1087#endif
1088
1089 switch ( d->m_autoDetectLanguage)
1090 {
1091 case KEncodingDetector::Arabic:
1092 return setEncoding(automaticDetectionForArabic( (const unsigned char*) data, len ), AutoDetectedEncoding);
1093// break;
1094 case KEncodingDetector::Baltic:
1095 return setEncoding(automaticDetectionForBaltic( (const unsigned char*) data, len ), AutoDetectedEncoding);
1096// break;
1097 case KEncodingDetector::CentralEuropean:
1098 return setEncoding(automaticDetectionForCentralEuropean( (const unsigned char*) data, len ), AutoDetectedEncoding);
1099 break;
1100 case KEncodingDetector::Cyrillic:
1101 return setEncoding(automaticDetectionForCyrillic( (const unsigned char*) data, len), AutoDetectedEncoding);
1102// break;
1103 case KEncodingDetector::Greek:
1104 return setEncoding(automaticDetectionForGreek( (const unsigned char*) data, len ), AutoDetectedEncoding);
1105// break;
1106 case KEncodingDetector::Hebrew:
1107 return setEncoding(automaticDetectionForHebrew( (const unsigned char*) data, len ), AutoDetectedEncoding);
1108// break;
1109 case KEncodingDetector::Japanese:
1110 return setEncoding(automaticDetectionForJapanese( (const unsigned char*) data, len ), AutoDetectedEncoding);
1111// break;
1112 case KEncodingDetector::Turkish:
1113 return setEncoding(automaticDetectionForTurkish( (const unsigned char*) data, len ), AutoDetectedEncoding);
1114// break;
1115 case KEncodingDetector::WesternEuropean:
1116 if (setEncoding(automaticDetectionForWesternEuropean( (const unsigned char*) data, len ), AutoDetectedEncoding))
1117 return true;
1118 else if (d->m_defaultCodec->mibEnum()==MibLatin1) //detection for khtml
1119 {
1120 return setEncoding("iso-8859-15",AutoDetectedEncoding);
1121 }
1122 else //use default provided by eg katepart
1123 {
1124 return setEncoding("",DefaultEncoding);
1125 }
1126// break;
1127 case KEncodingDetector::SemiautomaticDetection:
1128 case KEncodingDetector::ChineseSimplified:
1129 case KEncodingDetector::ChineseTraditional:
1130 case KEncodingDetector::Korean:
1131 case KEncodingDetector::Thai:
1132 case KEncodingDetector::Unicode:
1133 case KEncodingDetector::NorthernSaami:
1134 case KEncodingDetector::SouthEasternEurope:
1135 case KEncodingDetector::None:
1136 // huh. somethings broken in this code ### FIXME
1137 //enc = 0; //Reset invalid codec we tried, so we get back to latin1 fallback.
1138 break;
1139 }
1140
1141 return true;
1142}
1143
1144
1145KEncodingDetector::AutoDetectScript KEncodingDetector::scriptForName(const QString& lang)
1146{
1147 if (lang.isEmpty())
1148 return KEncodingDetector::None;
1149 else if (lang==i18nc("@item Text character set", "Unicode"))
1150 return KEncodingDetector::Unicode;
1151 else if (lang==i18nc("@item Text character set", "Cyrillic"))
1152 return KEncodingDetector::Cyrillic;
1153 else if (lang==i18nc("@item Text character set", "Western European"))
1154 return KEncodingDetector::WesternEuropean;
1155 else if (lang==i18nc("@item Text character set", "Central European"))
1156 return KEncodingDetector::CentralEuropean;
1157 else if (lang==i18nc("@item Text character set", "Greek"))
1158 return KEncodingDetector::Greek;
1159 else if (lang==i18nc("@item Text character set", "Hebrew"))
1160 return KEncodingDetector::Hebrew;
1161 else if (lang==i18nc("@item Text character set", "Turkish"))
1162 return KEncodingDetector::Turkish;
1163 else if (lang==i18nc("@item Text character set", "Japanese"))
1164 return KEncodingDetector::Japanese;
1165 else if (lang==i18nc("@item Text character set", "Baltic"))
1166 return KEncodingDetector::Baltic;
1167 else if (lang==i18nc("@item Text character set", "Arabic"))
1168 return KEncodingDetector::Arabic;
1169
1170 return KEncodingDetector::None;
1171}
1172
1173bool KEncodingDetector::hasAutoDetectionForScript(KEncodingDetector::AutoDetectScript script)
1174{
1175 switch (script)
1176 {
1177 case KEncodingDetector::Arabic:
1178 return true;
1179 case KEncodingDetector::Baltic:
1180 return true;
1181 case KEncodingDetector::CentralEuropean:
1182 return true;
1183 case KEncodingDetector::Cyrillic:
1184 return true;
1185 case KEncodingDetector::Greek:
1186 return true;
1187 case KEncodingDetector::Hebrew:
1188 return true;
1189 case KEncodingDetector::Japanese:
1190 return true;
1191 case KEncodingDetector::Turkish:
1192 return true;
1193 case KEncodingDetector::WesternEuropean:
1194 return true;
1195 case KEncodingDetector::ChineseTraditional:
1196 return true;
1197 case KEncodingDetector::ChineseSimplified:
1198 return true;
1199 case KEncodingDetector::Unicode:
1200 return true;
1201 break;
1202 default:
1203 return false;
1204 }
1205}
1206
1207QString KEncodingDetector::nameForScript(KEncodingDetector::AutoDetectScript script)
1208{
1209 switch (script)
1210 {
1211 case KEncodingDetector::Arabic:
1212 return i18nc("@item Text character set", "Arabic");
1213 break;
1214 case KEncodingDetector::Baltic:
1215 return i18nc("@item Text character set", "Baltic");
1216 break;
1217 case KEncodingDetector::CentralEuropean:
1218 return i18nc("@item Text character set", "Central European");
1219 break;
1220 case KEncodingDetector::Cyrillic:
1221 return i18nc("@item Text character set", "Cyrillic");
1222 break;
1223 case KEncodingDetector::Greek:
1224 return i18nc("@item Text character set", "Greek");
1225 break;
1226 case KEncodingDetector::Hebrew:
1227 return i18nc("@item Text character set", "Hebrew");
1228 break;
1229 case KEncodingDetector::Japanese:
1230 return i18nc("@item Text character set", "Japanese");
1231 break;
1232 case KEncodingDetector::Turkish:
1233 return i18nc("@item Text character set", "Turkish");
1234 break;
1235 case KEncodingDetector::WesternEuropean:
1236 return i18nc("@item Text character set", "Western European");
1237 break;
1238 case KEncodingDetector::ChineseTraditional:
1239 return i18nc("@item Text character set", "Chinese Traditional");
1240 break;
1241 case KEncodingDetector::ChineseSimplified:
1242 return i18nc("@item Text character set", "Chinese Simplified");
1243 break;
1244 case KEncodingDetector::Korean:
1245 return i18nc("@item Text character set", "Korean");
1246 break;
1247 case KEncodingDetector::Thai:
1248 return i18nc("@item Text character set", "Thai");
1249 break;
1250 case KEncodingDetector::Unicode:
1251 return i18nc("@item Text character set", "Unicode");
1252 break;
1253 //case KEncodingDetector::SemiautomaticDetection:
1254 default:
1255 return QString();
1256
1257 }
1258}
1259
1260#undef DECODE_DEBUG
1261
KCharsets::codecForName
QTextCodec * codecForName(const QString &name) const
Provided for compatibility.
Definition: kcharsets.cpp:696
KEncodingDetector
Provides encoding detection capabilities.
Definition: kencodingdetector.h:59
KEncodingDetector::scriptForName
static AutoDetectScript scriptForName(const QString &lang)
Takes the lang name after it has been i18n()'ed.
Definition: kencodingdetector.cpp:1145
KEncodingDetector::decodeWithBuffering
QString decodeWithBuffering(const char *data, int len)
Convenience method that uses buffering.
Definition: kencodingdetector.cpp:796
KEncodingDetector::encoding
const char * encoding() const
Convenience method.
Definition: kencodingdetector.cpp:678
KEncodingDetector::~KEncodingDetector
~KEncodingDetector()
Definition: kencodingdetector.cpp:659
KEncodingDetector::AutoDetectScript
AutoDetectScript
Definition: kencodingdetector.h:73
KEncodingDetector::None
@ None
Definition: kencodingdetector.h:74
KEncodingDetector::Korean
@ Korean
Definition: kencodingdetector.h:85
KEncodingDetector::CentralEuropean
@ CentralEuropean
Definition: kencodingdetector.h:78
KEncodingDetector::Thai
@ Thai
Definition: kencodingdetector.h:88
KEncodingDetector::Japanese
@ Japanese
Definition: kencodingdetector.h:84
KEncodingDetector::SemiautomaticDetection
@ SemiautomaticDetection
Definition: kencodingdetector.h:75
KEncodingDetector::Hebrew
@ Hebrew
Definition: kencodingdetector.h:83
KEncodingDetector::Greek
@ Greek
Definition: kencodingdetector.h:82
KEncodingDetector::Cyrillic
@ Cyrillic
Definition: kencodingdetector.h:81
KEncodingDetector::WesternEuropean
@ WesternEuropean
Definition: kencodingdetector.h:91
KEncodingDetector::Turkish
@ Turkish
Definition: kencodingdetector.h:89
KEncodingDetector::Unicode
@ Unicode
Definition: kencodingdetector.h:90
KEncodingDetector::SouthEasternEurope
@ SouthEasternEurope
Definition: kencodingdetector.h:87
KEncodingDetector::NorthernSaami
@ NorthernSaami
Definition: kencodingdetector.h:86
KEncodingDetector::Arabic
@ Arabic
Definition: kencodingdetector.h:76
KEncodingDetector::ChineseTraditional
@ ChineseTraditional
Definition: kencodingdetector.h:80
KEncodingDetector::ChineseSimplified
@ ChineseSimplified
Definition: kencodingdetector.h:79
KEncodingDetector::Baltic
@ Baltic
Definition: kencodingdetector.h:77
KEncodingDetector::errorsIfUtf8
bool errorsIfUtf8(const char *data, int length)
Check if we are really utf8.
Definition: kencodingdetector.cpp:585
KEncodingDetector::visuallyOrdered
bool visuallyOrdered() const
Definition: kencodingdetector.cpp:684
KEncodingDetector::flush
QString flush()
Convenience method to be used with decodeForHtml.
Definition: kencodingdetector.cpp:861
KEncodingDetector::nameForScript
static QString nameForScript(AutoDetectScript)
Definition: kencodingdetector.cpp:1207
KEncodingDetector::EncodingChoiceSource
EncodingChoiceSource
Definition: kencodingdetector.h:62
KEncodingDetector::BOM
@ BOM
Definition: kencodingdetector.h:65
KEncodingDetector::AutoDetectedEncoding
@ AutoDetectedEncoding
Definition: kencodingdetector.h:64
KEncodingDetector::EncodingFromXMLHeader
@ EncodingFromXMLHeader
Definition: kencodingdetector.h:66
KEncodingDetector::EncodingFromHTTPHeader
@ EncodingFromHTTPHeader
Definition: kencodingdetector.h:68
KEncodingDetector::DefaultEncoding
@ DefaultEncoding
Definition: kencodingdetector.h:63
KEncodingDetector::EncodingFromMetaTag
@ EncodingFromMetaTag
Definition: kencodingdetector.h:67
KEncodingDetector::UserChosenEncoding
@ UserChosenEncoding
Definition: kencodingdetector.h:69
KEncodingDetector::KEncodingDetector
KEncodingDetector()
Default codec is latin1 (as html spec says), EncodingChoiceSource is default, AutoDetectScript=Semiau...
Definition: kencodingdetector.cpp:650
KEncodingDetector::decode
QString decode(const char *data, int len)
The main class method.
Definition: kencodingdetector.cpp:772
KEncodingDetector::decoder
QTextDecoder * decoder()
Definition: kencodingdetector.cpp:694
KEncodingDetector::hasAutoDetectionForScript
static bool hasAutoDetectionForScript(AutoDetectScript)
Definition: kencodingdetector.cpp:1173
KEncodingDetector::setAutoDetectLanguage
void setAutoDetectLanguage(AutoDetectScript)
Definition: kencodingdetector.cpp:664
KEncodingDetector::decodedInvalidCharacters
bool decodedInvalidCharacters() const
This method checks whether invalid characters were found during a decoding operation.
Definition: kencodingdetector.cpp:856
KEncodingDetector::analyze
bool analyze(const char *data, int len)
Analyze text data.
Definition: kencodingdetector.cpp:875
KEncodingDetector::resetDecoder
void resetDecoder()
Resets the decoder.
Definition: kencodingdetector.cpp:699
KEncodingDetector::encodingChoiceSource
EncodingChoiceSource encodingChoiceSource() const
Definition: kencodingdetector.cpp:673
KEncodingDetector::processNull
bool processNull(char *data, int length)
This nice method will kill all 0 bytes (or double bytes) and remember if this was a binary or not ;)
Definition: kencodingdetector.cpp:556
KEncodingDetector::setEncoding
bool setEncoding(const char *encoding, EncodingChoiceSource type)
Definition: kencodingdetector.cpp:712
KEncodingDetector::autoDetectLanguage
AutoDetectScript autoDetectLanguage() const
Definition: kencodingdetector.cpp:668
QString
khtml::JapaneseCode
Definition: guess_ja_p.h:86
khtml::JapaneseCode::UTF8
@ UTF8
Definition: guess_ja_p.h:88
khtml::JapaneseCode::JIS
@ JIS
Definition: guess_ja_p.h:88
khtml::JapaneseCode::EUC
@ EUC
Definition: guess_ja_p.h:88
khtml::JapaneseCode::SJIS
@ SJIS
Definition: guess_ja_p.h:88
khtml::JapaneseCode::guess_jp
enum Type guess_jp(const char *buf, int buflen)
Definition: guess_ja.cpp:305
kDebug
#define kDebug
Definition: kdebug.h:316
kWarning
#define kWarning
Definition: kdebug.h:322
guess_ja_p.h
kcharsets.h
kdebug.h
automaticDetectionForHebrew
static QByteArray automaticDetectionForHebrew(const unsigned char *ptr, int size)
Definition: kencodingdetector.cpp:402
skipComment
static void skipComment(const char *&ptr, const char *pEnd)
Definition: kencodingdetector.cpp:478
MAX_BUFFER
#define MAX_BUFFER
Definition: kencodingdetector.cpp:33
is16Bit
static bool is16Bit(QTextCodec *codec)
Definition: kencodingdetector.cpp:60
automaticDetectionForBaltic
static QByteArray automaticDetectionForBaltic(const unsigned char *ptr, int size)
Definition: kencodingdetector.cpp:148
automaticDetectionForCyrillic
static QByteArray automaticDetectionForCyrillic(const unsigned char *ptr, int size)
Definition: kencodingdetector.cpp:193
automaticDetectionForGreek
static QByteArray automaticDetectionForGreek(const unsigned char *ptr, int size)
Definition: kencodingdetector.cpp:389
automaticDetectionForCentralEuropean
static QByteArray automaticDetectionForCentralEuropean(const unsigned char *ptr, int size)
Definition: kencodingdetector.cpp:161
automaticDetectionForJapanese
static QByteArray automaticDetectionForJapanese(const unsigned char *ptr, int size)
Definition: kencodingdetector.cpp:418
automaticDetectionForTurkish
static QByteArray automaticDetectionForTurkish(const unsigned char *ptr, int size)
Definition: kencodingdetector.cpp:438
automaticDetectionForArabic
static QByteArray automaticDetectionForArabic(const unsigned char *ptr, int size)
Definition: kencodingdetector.cpp:134
MIB
MIB
Definition: kencodingdetector.cpp:50
Mib8859_8
@ Mib8859_8
Definition: kencodingdetector.cpp:52
MibUtf16BE
@ MibUtf16BE
Definition: kencodingdetector.cpp:56
MibUtf16LE
@ MibUtf16LE
Definition: kencodingdetector.cpp:57
MibLatin1
@ MibLatin1
Definition: kencodingdetector.cpp:51
MibUcs2
@ MibUcs2
Definition: kencodingdetector.cpp:54
MibUtf8
@ MibUtf8
Definition: kencodingdetector.cpp:53
MibUtf16
@ MibUtf16
Definition: kencodingdetector.cpp:55
findXMLEncoding
static int findXMLEncoding(const QByteArray &str, int &encodingLength)
Definition: kencodingdetector.cpp:512
automaticDetectionForWesternEuropean
static QByteArray automaticDetectionForWesternEuropean(const unsigned char *ptr, int size)
Definition: kencodingdetector.cpp:449
kencodingdetector.h
kglobal.h
klocale.h
i18nc
QString i18nc(const char *ctxt, const char *text)
Returns a localized version of a string and a context.
Definition: klocalizedstring.h:797
KGlobal::charsets
KCharsets * charsets()
The global charset manager.
Definition: kglobal.cpp:214
This file is part of the KDE documentation.
Documentation copyright © 1996-2023 The KDE developers.
Generated on Mon Feb 20 2023 00:00:00 by doxygen 1.9.6 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.

KDECore

Skip menu "KDECore"
  • Main Page
  • Namespace List
  • Namespace Members
  • Alphabetical List
  • Class List
  • Class Hierarchy
  • Class Members
  • File List
  • File Members
  • Modules
  • Related Pages

kdelibs-4.14.38 API Reference

Skip menu "kdelibs-4.14.38 API Reference"
  • DNSSD
  • Interfaces
  •   KHexEdit
  •   KMediaPlayer
  •   KSpeech
  •   KTextEditor
  • kconf_update
  • KDE3Support
  •   KUnitTest
  • KDECore
  • KDED
  • KDEsu
  • KDEUI
  • KDEWebKit
  • KDocTools
  • KFile
  • KHTML
  • KImgIO
  • KInit
  • kio
  • KIOSlave
  • KJS
  •   KJS-API
  •   WTF
  • kjsembed
  • KNewStuff
  • KParts
  • KPty
  • Kross
  • KUnitConversion
  • KUtils
  • Nepomuk
  • Plasma
  • Solid
  • Sonnet
  • ThreadWeaver
Report problems with this website to our bug tracking system.
Contact the specific authors with questions and comments about the page contents.

KDE® and the K Desktop Environment® logo are registered trademarks of KDE e.V. | Legal