• Skip to content
  • Skip to link menu
  • KDE API Reference
  • kdelibs-4.14.38 API Reference
  • KDE Home
  • Contact Us
 

KDECore

  • kdecore
  • localization
  • probers
CharDistribution.h
Go to the documentation of this file.
1/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2/* -*- C++ -*-
3 * Copyright (C) 1998 <developer@mozilla.org>
4 *
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining
7 * a copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sublicense, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice shall be included
15 * in all copies or substantial portions of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
21 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
22 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
23 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24 */
25
26#ifndef CharDistribution_h__
27#define CharDistribution_h__
28
29#include "kdemacros.h"
30
31#define ENOUGH_DATA_THRESHOLD 256
32
33namespace kencodingprober {
34class KDE_NO_EXPORT CharDistributionAnalysis
35{
36public:
37 CharDistributionAnalysis() {Reset();};
38 virtual ~CharDistributionAnalysis() {};
39
40 //feed a block of data and do distribution analysis
41 void HandleData(const char* /* aBuf */, unsigned int /* aLen */) {};
42
43 //Feed a character with known length
44 void HandleOneChar(const char* aStr, unsigned int aCharLen)
45 {
46 int order;
47
48 //we only care about 2-bytes character in our distribution analysis
49 order = (aCharLen == 2) ? GetOrder(aStr) : -1;
50
51 if (order >= 0)
52 {
53 mTotalChars++;
54 //order is valid
55 if ((unsigned int)order < mTableSize)
56 {
57 if (512 > mCharToFreqOrder[order])
58 mFreqChars++;
59 }
60 }
61 };
62
63 //return confidence base on existing data
64 float GetConfidence();
65
66 //Reset analyser, clear any state
67 void Reset(void)
68 {
69 mDone = false;
70 mTotalChars = 0;
71 mFreqChars = 0;
72 };
73
74 //This function is for future extension. Caller can use this function to control
75 //analyser's behavior
76 void SetOpion(){};
77
78 //It is not necessary to receive all data to draw conclusion. For charset detection,
79 // certain amount of data is enough
80 bool GotEnoughData() {return mTotalChars > ENOUGH_DATA_THRESHOLD;};
81
82protected:
83 //we do not handle character base on its original encoding string, but
84 //convert this encoding string to a number, here called order.
85 //This allow multiple encoding of a language to share one frequency table
86 virtual int GetOrder(const char* /* str */) {return -1;};
87
88 //If this flag is set to true, detection is done and conclusion has been made
89 bool mDone;
90
91 //The number of characters whose frequency order is less than 512
92 unsigned int mFreqChars;
93
94 //Total character encounted.
95 unsigned int mTotalChars;
96
97 //Mapping table to get frequency order from char order (get from GetOrder())
98 const short *mCharToFreqOrder;
99
100 //Size of above table
101 unsigned int mTableSize;
102
103 //This is a constant value varies from language to language, it is used in
104 //calculating confidence. See my paper for further detail.
105 float mTypicalDistributionRatio;
106};
107
108
109class KDE_NO_EXPORT EUCKRDistributionAnalysis : public CharDistributionAnalysis
110{
111public:
112 EUCKRDistributionAnalysis();
113protected:
114 //for euc-KR encoding, we are interested
115 // first byte range: 0xb0 -- 0xfe
116 // second byte range: 0xa1 -- 0xfe
117 //no validation needed here. State machine has done that
118 int GetOrder(const char* str)
119 { if ((unsigned char)*str >= (unsigned char)0xb0)
120 return 94*((unsigned char)str[0]-(unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1;
121 else
122 return -1;
123 };
124};
125
126class KDE_NO_EXPORT GB2312DistributionAnalysis : public CharDistributionAnalysis
127{
128public:
129 GB2312DistributionAnalysis();
130protected:
131 //for GB2312 encoding, we are interested
132 // first byte range: 0xb0 -- 0xfe
133 // second byte range: 0xa1 -- 0xfe
134 //no validation needed here. State machine has done that
135 int GetOrder(const char* str)
136 { if ((unsigned char)*str >= (unsigned char)0xb0 && (unsigned char)str[1] >= (unsigned char)0xa1)
137 return 94*((unsigned char)str[0]-(unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1;
138 else
139 return -1;
140 };
141};
142
143
144class KDE_NO_EXPORT Big5DistributionAnalysis : public CharDistributionAnalysis
145{
146public:
147 Big5DistributionAnalysis();
148protected:
149 //for big5 encoding, we are interested
150 // first byte range: 0xa4 -- 0xfe
151 // second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
152 //no validation needed here. State machine has done that
153 int GetOrder(const char* str)
154 { if ((unsigned char)*str >= (unsigned char)0xa4)
155 if ((unsigned char)str[1] >= (unsigned char)0xa1)
156 return 157*((unsigned char)str[0]-(unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0xa1 +63;
157 else
158 return 157*((unsigned char)str[0]-(unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0x40;
159 else
160 return -1;
161 };
162};
163
164class KDE_NO_EXPORT SJISDistributionAnalysis : public CharDistributionAnalysis
165{
166public:
167 SJISDistributionAnalysis();
168protected:
169 //for sjis encoding, we are interested
170 // first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
171 // second byte range: 0x40 -- 0x7e, 0x81 -- oxfe
172 //no validation needed here. State machine has done that
173 int GetOrder(const char* str)
174 {
175 int order;
176 if ((unsigned char)*str >= (unsigned char)0x81 && (unsigned char)*str <= (unsigned char)0x9f)
177 order = 188 * ((unsigned char)str[0]-(unsigned char)0x81);
178 else if ((unsigned char)*str >= (unsigned char)0xe0 && (unsigned char)*str <= (unsigned char)0xef)
179 order = 188 * ((unsigned char)str[0]-(unsigned char)0xe0 + 31);
180 else
181 return -1;
182 order += (unsigned char)*(str+1) - 0x40;
183 if ((unsigned char)str[1] > (unsigned char)0x7f)
184 order--;
185 return order;
186 };
187};
188
189class KDE_NO_EXPORT EUCJPDistributionAnalysis : public CharDistributionAnalysis
190{
191public:
192 EUCJPDistributionAnalysis();
193protected:
194 //for euc-JP encoding, we are interested
195 // first byte range: 0xa0 -- 0xfe
196 // second byte range: 0xa1 -- 0xfe
197 //no validation needed here. State machine has done that
198 int GetOrder(const char* str)
199 { if ((unsigned char)*str >= (unsigned char)0xa0)
200 return 94*((unsigned char)str[0]-(unsigned char)0xa1) + (unsigned char)str[1] - (unsigned char)0xa1;
201 else
202 return -1;
203 };
204};
205}
206#endif //CharDistribution_h__
207
ENOUGH_DATA_THRESHOLD
#define ENOUGH_DATA_THRESHOLD
Definition: CharDistribution.h:31
kencodingprober::Big5DistributionAnalysis
Definition: CharDistribution.h:145
kencodingprober::Big5DistributionAnalysis::GetOrder
int GetOrder(const char *str)
Definition: CharDistribution.h:153
kencodingprober::CharDistributionAnalysis
Definition: CharDistribution.h:35
kencodingprober::CharDistributionAnalysis::SetOpion
void SetOpion()
Definition: CharDistribution.h:76
kencodingprober::CharDistributionAnalysis::HandleOneChar
void HandleOneChar(const char *aStr, unsigned int aCharLen)
Definition: CharDistribution.h:44
kencodingprober::CharDistributionAnalysis::mCharToFreqOrder
const short * mCharToFreqOrder
Definition: CharDistribution.h:98
kencodingprober::CharDistributionAnalysis::HandleData
void HandleData(const char *, unsigned int)
Definition: CharDistribution.h:41
kencodingprober::CharDistributionAnalysis::GetOrder
virtual int GetOrder(const char *)
Definition: CharDistribution.h:86
kencodingprober::CharDistributionAnalysis::mDone
bool mDone
Definition: CharDistribution.h:89
kencodingprober::CharDistributionAnalysis::Reset
void Reset(void)
Definition: CharDistribution.h:67
kencodingprober::CharDistributionAnalysis::CharDistributionAnalysis
CharDistributionAnalysis()
Definition: CharDistribution.h:37
kencodingprober::CharDistributionAnalysis::mFreqChars
unsigned int mFreqChars
Definition: CharDistribution.h:92
kencodingprober::CharDistributionAnalysis::GotEnoughData
bool GotEnoughData()
Definition: CharDistribution.h:80
kencodingprober::CharDistributionAnalysis::~CharDistributionAnalysis
virtual ~CharDistributionAnalysis()
Definition: CharDistribution.h:38
kencodingprober::CharDistributionAnalysis::mTotalChars
unsigned int mTotalChars
Definition: CharDistribution.h:95
kencodingprober::CharDistributionAnalysis::mTypicalDistributionRatio
float mTypicalDistributionRatio
Definition: CharDistribution.h:105
kencodingprober::CharDistributionAnalysis::mTableSize
unsigned int mTableSize
Definition: CharDistribution.h:101
kencodingprober::EUCJPDistributionAnalysis
Definition: CharDistribution.h:190
kencodingprober::EUCJPDistributionAnalysis::GetOrder
int GetOrder(const char *str)
Definition: CharDistribution.h:198
kencodingprober::EUCKRDistributionAnalysis
Definition: CharDistribution.h:110
kencodingprober::EUCKRDistributionAnalysis::GetOrder
int GetOrder(const char *str)
Definition: CharDistribution.h:118
kencodingprober::GB2312DistributionAnalysis
Definition: CharDistribution.h:127
kencodingprober::GB2312DistributionAnalysis::GetOrder
int GetOrder(const char *str)
Definition: CharDistribution.h:135
kencodingprober::SJISDistributionAnalysis
Definition: CharDistribution.h:165
kencodingprober::SJISDistributionAnalysis::GetOrder
int GetOrder(const char *str)
Definition: CharDistribution.h:173
kencodingprober
Definition: CharDistribution.cpp:37
This file is part of the KDE documentation.
Documentation copyright © 1996-2023 The KDE developers.
Generated on Mon Feb 20 2023 00:00:00 by doxygen 1.9.6 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.

KDECore

Skip menu "KDECore"
  • Main Page
  • Namespace List
  • Namespace Members
  • Alphabetical List
  • Class List
  • Class Hierarchy
  • Class Members
  • File List
  • File Members
  • Modules
  • Related Pages

kdelibs-4.14.38 API Reference

Skip menu "kdelibs-4.14.38 API Reference"
  • DNSSD
  • Interfaces
  •   KHexEdit
  •   KMediaPlayer
  •   KSpeech
  •   KTextEditor
  • kconf_update
  • KDE3Support
  •   KUnitTest
  • KDECore
  • KDED
  • KDEsu
  • KDEUI
  • KDEWebKit
  • KDocTools
  • KFile
  • KHTML
  • KImgIO
  • KInit
  • kio
  • KIOSlave
  • KJS
  •   KJS-API
  •   WTF
  • kjsembed
  • KNewStuff
  • KParts
  • KPty
  • Kross
  • KUnitConversion
  • KUtils
  • Nepomuk
  • Plasma
  • Solid
  • Sonnet
  • ThreadWeaver
Report problems with this website to our bug tracking system.
Contact the specific authors with questions and comments about the page contents.

KDE® and the K Desktop Environment® logo are registered trademarks of KDE e.V. | Legal