• Skip to content
  • Skip to link menu
  • KDE API Reference
  • kdelibs-4.14.38 API Reference
  • KDE Home
  • Contact Us
 

KHTML

  • khtml
  • xpath
tokenizer.cpp
Go to the documentation of this file.
1/*
2 * tokenizer.cc - Copyright 2005 Maksim Orlovich <maksim@kde.org>
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
15 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
16 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
17 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
18 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
19 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
20 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
21 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
23 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25#include "tokenizer.h"
26
27#include "xml/dom_stringimpl.h"
28#include "xml/dom3_xpathimpl.h"
29#include "dom/dom3_xpath.h"
30
31#include <cstdio>
32
33using namespace std;
34
35using namespace DOM;
36using namespace DOM::XPath;
37using namespace khtml;
38using namespace khtml::XPath;
39
40namespace khtml {
41namespace XPath {
42
43struct AxisNameMapping
44{
45 const char *name;
46 Step::AxisType type;
47};
48
49static AxisNameMapping axisNames[] = {
50 { "ancestor", Step::AncestorAxis },
51 { "ancestor-or-self", Step::AncestorOrSelfAxis },
52 { "attribute", Step::AttributeAxis },
53 { "child", Step::ChildAxis },
54 { "descendant", Step::DescendantAxis },
55 { "descendant-or-self", Step::DescendantOrSelfAxis },
56 { "following", Step::FollowingAxis },
57 { "following-sibling", Step::FollowingSiblingAxis },
58 { "namespace", Step::NamespaceAxis },
59 { "parent", Step::ParentAxis },
60 { "preceding", Step::PrecedingAxis },
61 { "preceding-sibling", Step::PrecedingSiblingAxis },
62 { "self", Step::SelfAxis }
63};
64static unsigned int axisNamesCount = sizeof(axisNames) / sizeof(axisNames[0]);
65
// Names that denote a node type test when followed by '('.
// The list is terminated by a null pointer so it can be walked without a count.
static const char* const nodeTypeNames[] = {
    "comment", "text", "processing-instruction", "node",
    0 // terminator
};
73
74QHash<QString, Step::AxisType>* Tokenizer::s_axisNamesDict = 0;
75QSet<QString>* Tokenizer::s_nodeTypeNamesDict = 0;
76
77Tokenizer &Tokenizer::self()
78{
79 static Tokenizer instance;
80 return instance;
81}
82
83Tokenizer::XMLCat Tokenizer::charCat(QChar aChar)
84{
85 //### might need to add some special cases from the XML spec.
86
87 if (aChar.unicode() == '_')
88 return NameStart;
89
90 if (aChar.unicode() == '.' || aChar.unicode() == '-')
91 return NameCont;
92
93 switch (aChar.category()) {
94 case QChar::Letter_Lowercase: //Ll
95 case QChar::Letter_Uppercase: //Lu
96 case QChar::Letter_Other: //Lo
97 case QChar::Letter_Titlecase: //Lt
98 case QChar::Number_Letter: //Nl
99 return NameStart;
100
101 case QChar::Mark_SpacingCombining: //Mc
102 case QChar::Mark_Enclosing: //Me
103 case QChar::Mark_NonSpacing: //Mn
104 case QChar::Letter_Modifier: //Lm
105 case QChar::Number_DecimalDigit: //Nd
106 return NameCont;
107
108 default:
109 return NotPartOfName;
110 }
111}
112
113bool Tokenizer::isAxisName(QString name, Step::AxisType *type)
114{
115 if (!s_axisNamesDict) {
116 s_axisNamesDict = new QHash<QString, Step::AxisType>;
117 for (unsigned int p = 0; p < axisNamesCount; ++p)
118 s_axisNamesDict->insert(QLatin1String(axisNames[p].name),
119 axisNames[p].type);
120 }
121
122 QHash<QString, Step::AxisType>::ConstIterator it = s_axisNamesDict->constFind(name);
123 if ( it != s_axisNamesDict->constEnd() ) {
124 *type = *it;
125 }
126 return it != s_axisNamesDict->constEnd();
127}
128
129bool Tokenizer::isNodeTypeName(QString name)
130{
131 if (!s_nodeTypeNamesDict) {
132 s_nodeTypeNamesDict = new QSet<QString>;
133 for (int p = 0; nodeTypeNames[p]; ++p)
134 s_nodeTypeNamesDict->insert(QLatin1String(nodeTypeNames[p]));
135 }
136 return s_nodeTypeNamesDict->contains(name);
137}
138
139/* Returns whether the last parsed token matches the [32] Operator rule
140 * (check http://www.w3.org/TR/xpath#exprlex). Necessary to disambiguate
141 * the tokens.
142 */
143bool Tokenizer::isOperatorContext()
144{
145 if ( m_nextPos == 0 ) {
146 return false;
147 }
148
149 switch ( m_lastTokenType ) {
150 case AND: case OR: case MULOP:
151 case '/': case SLASHSLASH: case '|': case PLUS: case MINUS:
152 case EQOP: case RELOP:
153 case '@': case AXISNAME: case '(': case '[':
154 return false;
155 default:
156 return true;
157 }
158}
159
160void Tokenizer::skipWS()
161{
162 while (m_nextPos < m_data.length() && m_data[m_nextPos].isSpace())
163 ++m_nextPos;
164}
165
166Token Tokenizer::makeTokenAndAdvance(int code, int advance)
167{
168 m_nextPos += advance;
169 return Token(code);
170}
171
172Token Tokenizer::makeIntTokenAndAdvance(int code, int val, int advance)
173{
174 m_nextPos += advance;
175 return Token(code, val);
176}
177
178//Returns next char if it's there and interesting, 0 otherwise
179char Tokenizer::peekAheadHelper()
180{
181 if (m_nextPos + 1 >= m_data.length())
182 return 0;
183 QChar next = m_data[m_nextPos + 1];
184 if (next.row() != 0)
185 return 0;
186 else
187 return next.cell();
188}
189
190char Tokenizer::peekCurHelper()
191{
192 if (m_nextPos >= m_data.length())
193 return 0;
194 QChar next = m_data[m_nextPos];
195 if (next.row() != 0)
196 return 0;
197 else
198 return next.cell();
199}
200
201Token Tokenizer::lexString()
202{
203 QChar delimiter = m_data[m_nextPos];
204 int startPos = m_nextPos + 1;
205
206 for (m_nextPos = startPos; m_nextPos < m_data.length(); ++m_nextPos) {
207 if (m_data[m_nextPos] == delimiter) {
208 QString value = m_data.mid(startPos, m_nextPos - startPos);
209 ++m_nextPos; //Consume the char;
210 return Token(LITERAL, value);
211 }
212 }
213
214 //Ouch, went off the end -- report error
215 return Token(ERROR);
216}
217
218Token Tokenizer::lexNumber()
219{
220 int startPos = m_nextPos;
221 bool seenDot = false;
222
223 //Go until end or a non-digits character
224 for (; m_nextPos < m_data.length(); ++m_nextPos) {
225 QChar aChar = m_data[m_nextPos];
226 if (aChar.row() != 0) break;
227
228 if (aChar.cell() < '0' || aChar.cell() > '9') {
229 if (aChar.cell() == '.' && !seenDot)
230 seenDot = true;
231 else
232 break;
233 }
234 }
235
236 QString value = m_data.mid(startPos, m_nextPos - startPos);
237 return Token(NUMBER, value);
238}
239
240Token Tokenizer::lexNCName()
241{
242 int startPos = m_nextPos;
243 if (m_nextPos < m_data.length() && charCat(m_data[m_nextPos]) == NameStart)
244 {
245 //Keep going until we get a character that's not good for names.
246 for (; m_nextPos < m_data.length(); ++m_nextPos) {
247 if (charCat(m_data[m_nextPos]) == NotPartOfName)
248 break;
249 }
250
251 QString value = m_data.mid(startPos, m_nextPos - startPos);
252 return Token(value);
253 }
254 else
255 return makeTokenAndAdvance(ERROR);
256}
257
258Token Tokenizer::lexQName()
259{
260 Token t1 = lexNCName();
261 if (t1.type == ERROR) return t1;
262 skipWS();
263 //If the next character is :, what we just got it the prefix, if not,
264 //it's the whole thing
265 if (peekAheadHelper() != ':')
266 return t1;
267
268 Token t2 = lexNCName();
269 if (t2.type == ERROR) return t2;
270
271 return Token(t1.value + ":" + t2.value);
272}
273
274Token Tokenizer::nextTokenInternal()
275{
276 skipWS();
277
278 if (m_nextPos >= m_data.length()) {
279 return Token(0);
280 }
281
282 char code = peekCurHelper();
283 switch (code) {
284 case '(': case ')': case '[': case ']':
285 case '@': case ',': case '|':
286 return makeTokenAndAdvance(code);
287 case '\'':
288 case '\"':
289 return lexString();
290 case '0': case '1': case '2': case '3': case '4':
291 case '5': case '6': case '7': case '8': case '9':
292 return lexNumber();
293 case '.': {
294 char next = peekAheadHelper();
295 if (next == '.')
296 return makeTokenAndAdvance(DOTDOT, 2);
297 else if (next >= '0' && next <= '9')
298 return lexNumber();
299 else
300 return makeTokenAndAdvance('.');
301 }
302 case '/':
303 if (peekAheadHelper() == '/')
304 return makeTokenAndAdvance(SLASHSLASH, 2);
305 else
306 return makeTokenAndAdvance('/');
307 case '+':
308 return makeTokenAndAdvance(PLUS);
309 case '-':
310 return makeTokenAndAdvance(MINUS);
311 case '=':
312 return makeIntTokenAndAdvance(EQOP, RelationOp::OP_EQ);
313 case '!':
314 if (peekAheadHelper() == '=')
315 return makeIntTokenAndAdvance(EQOP, RelationOp::OP_NE, 2);
316 else {
317 return Token(ERROR);
318 }
319 case '<':
320 if (peekAheadHelper() == '=')
321 return makeIntTokenAndAdvance(RELOP, RelationOp::OP_LE, 2);
322 else
323 return makeIntTokenAndAdvance(RELOP, RelationOp::OP_LT);
324 case '>':
325 if (peekAheadHelper() == '=')
326 return makeIntTokenAndAdvance(RELOP, RelationOp::OP_GE, 2);
327 else
328 return makeIntTokenAndAdvance(RELOP, RelationOp::OP_GT);
329 case '*':
330 if (isOperatorContext())
331 return makeIntTokenAndAdvance(MULOP, NumericOp::OP_Mul);
332 else {
333 ++m_nextPos;
334 return Token(NAMETEST, "*");
335 }
336 case '$': {//$ QName
337 m_nextPos++;
338 Token par = lexQName();
339 if (par.type == ERROR)
340 return par;
341 else
342 return Token(VARIABLEREFERENCE, par.value);
343 }
344 }
345
346 Token t1 = lexNCName();
347 if (t1.type == ERROR) return t1;
348
349 skipWS();
350
351 //If we're in an operator context, check for any operator names
352 if (isOperatorContext()) {
353 if (t1.value == QLatin1String("and")) //### hash?
354 return Token(AND);
355 if (t1.value == QLatin1String("or"))
356 return Token(OR);
357 if (t1.value == QLatin1String("mod"))
358 return Token(MULOP, NumericOp::OP_Mod);
359 if (t1.value == QLatin1String("div"))
360 return Token(MULOP, NumericOp::OP_Div);
361 }
362
363 //See whether we are at a :
364 if (peekCurHelper() == ':') {
365 m_nextPos++;
366 //Any chance it's an axis name?
367 if (peekCurHelper() == ':') {
368 m_nextPos++;
369
370 //It might be an axis name.
371 Step::AxisType axisType;
372 if (isAxisName(t1.value, &axisType))
373 return Token(AXISNAME, axisType);
374 //Ugh, :: is only valid in axis names -> error
375 return Token(ERROR);
376 }
377
378 //Seems like this is a fully qualified qname, or perhaps the * modified one from NameTest
379 skipWS();
380 if (peekCurHelper() == '*') {
381 m_nextPos++;
382 return Token(NAMETEST, t1.value + ":*");
383 }
384
385 //Make a full qname..
386 Token t2 = lexNCName();
387 if (t2.type == ERROR) return t2;
388
389 t1.value = t1.value + ':' + t2.value;
390 }
391
392 skipWS();
393 if (peekCurHelper() == '(') {
394 //note: we don't swallow the ( here!
395
396 //either node type of function name
397 if (isNodeTypeName(t1.value)) {
398 if (t1.value == "processing-instruction")
399 return Token(PI, t1.value);
400 else
401 return Token(NODETYPE, t1.value);
402 }
403 //must be a function name.
404 return Token(FUNCTIONNAME, t1.value);
405 }
406
407 //At this point, it must be NAMETEST
408 return Token(NAMETEST, t1.value);
409}
410
411Token Tokenizer::nextToken()
412{
413 Token toRet = nextTokenInternal();
414 m_lastTokenType = toRet.type;
415 return toRet;
416}
417
418Tokenizer::Tokenizer()
419{
420 reset(QString());
421}
422
423Tokenizer::~Tokenizer()
424{
425 delete s_axisNamesDict;
426 delete s_nodeTypeNamesDict;
427}
428
429void Tokenizer::reset(QString data)
430{
431 m_nextPos = 0;
432 m_data = data;
433 m_lastTokenType = 0;
434}
435
436int khtmlxpathyylex()
437{
438 Token tok = Tokenizer::self().nextToken();
439 if (tok.hasString) {
440 khtmlxpathyylval.str = new DOMString(tok.value);
441 } else if (tok.intValue) {
442 khtmlxpathyylval.num = tok.intValue;
443 }
444 return tok.type;
445}
446
447void initTokenizer(const DOM::DOMString& string)
448{
449 Tokenizer::self().reset(string.string());
450}
451
452} // namespace XPath
453} // namespace khtml
454
455// kate: indent-width 4; replace-tabs off; tab-width 4; indent-spaces: off;
DOM::DOMString
This class implements the basic string we use in the DOM.
Definition: dom_string.h:44
QHash
QSet
khtml::XPath::NumericOp::OP_Mul
@ OP_Mul
Definition: predicate.h:85
khtml::XPath::NumericOp::OP_Div
@ OP_Div
Definition: predicate.h:86
khtml::XPath::NumericOp::OP_Mod
@ OP_Mod
Definition: predicate.h:87
khtml::XPath::RelationOp::OP_GT
@ OP_GT
Definition: predicate.h:102
khtml::XPath::RelationOp::OP_LT
@ OP_LT
Definition: predicate.h:103
khtml::XPath::RelationOp::OP_EQ
@ OP_EQ
Definition: predicate.h:106
khtml::XPath::RelationOp::OP_NE
@ OP_NE
Definition: predicate.h:107
khtml::XPath::RelationOp::OP_LE
@ OP_LE
Definition: predicate.h:105
khtml::XPath::RelationOp::OP_GE
@ OP_GE
Definition: predicate.h:104
khtml::XPath::Step::AxisType
AxisType
Definition: step.h:45
khtml::XPath::Step::NamespaceAxis
@ NamespaceAxis
Definition: step.h:48
khtml::XPath::Step::PrecedingAxis
@ PrecedingAxis
Definition: step.h:49
khtml::XPath::Step::ParentAxis
@ ParentAxis
Definition: step.h:49
khtml::XPath::Step::ChildAxis
@ ChildAxis
Definition: step.h:47
khtml::XPath::Step::AttributeAxis
@ AttributeAxis
Definition: step.h:46
khtml::XPath::Step::PrecedingSiblingAxis
@ PrecedingSiblingAxis
Definition: step.h:49
khtml::XPath::Step::FollowingSiblingAxis
@ FollowingSiblingAxis
Definition: step.h:48
khtml::XPath::Step::DescendantAxis
@ DescendantAxis
Definition: step.h:47
khtml::XPath::Step::AncestorOrSelfAxis
@ AncestorOrSelfAxis
Definition: step.h:46
khtml::XPath::Step::DescendantOrSelfAxis
@ DescendantOrSelfAxis
Definition: step.h:47
khtml::XPath::Step::FollowingAxis
@ FollowingAxis
Definition: step.h:48
khtml::XPath::Step::SelfAxis
@ SelfAxis
Definition: step.h:50
khtml::XPath::Step::AncestorAxis
@ AncestorAxis
Definition: step.h:46
khtml::XPath::Tokenizer
Definition: tokenizer.h:57
khtml::XPath::Tokenizer::self
static Tokenizer & self()
Definition: tokenizer.cpp:77
khtml::XPath::Tokenizer::nextToken
Token nextToken()
Definition: tokenizer.cpp:411
khtml::XPath::Tokenizer::reset
void reset(QString)
Definition: tokenizer.cpp:429
dom3_xpath.h
DOM::XPath
Definition: dom3_xpath.h:56
DOM
This library provides a full-featured HTML parser and widget.
Definition: design.h:55
next
KAction * next(const QObject *recvr, const char *slot, QObject *parent)
name
const char * name(StandardAction id)
khtml::XPath
Definition: expression.h:45
khtml::XPath::axisNamesCount
static unsigned int axisNamesCount
Definition: tokenizer.cpp:64
khtml::XPath::khtmlxpathyylex
int khtmlxpathyylex()
Definition: tokenizer.cpp:436
khtml::XPath::axisNames
static AxisNameMapping axisNames[]
Definition: tokenizer.cpp:49
khtml::XPath::initTokenizer
void initTokenizer(const DOM::DOMString &string)
Definition: tokenizer.cpp:447
khtml::XPath::nodeTypeNames
static const char *const nodeTypeNames[]
Definition: tokenizer.cpp:66
khtml
LITERAL
@ LITERAL
Definition: parser.cpp:155
NAMETEST
@ NAMETEST
Definition: parser.cpp:160
NUMBER
@ NUMBER
Definition: parser.cpp:157
ERROR
@ ERROR
Definition: parser.cpp:161
NODETYPE
@ NODETYPE
Definition: parser.cpp:152
VARIABLEREFERENCE
@ VARIABLEREFERENCE
Definition: parser.cpp:156
SLASHSLASH
@ SLASHSLASH
Definition: parser.cpp:159
PI
@ PI
Definition: parser.cpp:153
RELOP
@ RELOP
Definition: parser.cpp:145
EQOP
@ EQOP
Definition: parser.cpp:144
PLUS
@ PLUS
Definition: parser.cpp:148
DOTDOT
@ DOTDOT
Definition: parser.cpp:158
FUNCTIONNAME
@ FUNCTIONNAME
Definition: parser.cpp:154
MULOP
@ MULOP
Definition: parser.cpp:146
AXISNAME
@ AXISNAME
Definition: parser.cpp:151
MINUS
@ MINUS
Definition: parser.cpp:147
khtmlxpathyylval
YYSTYPE khtmlxpathyylval
khtml::XPath::Token
Definition: tokenizer.h:44
khtml::XPath::Token::intValue
int intValue
Definition: tokenizer.h:47
khtml::XPath::Token::hasString
bool hasString
Definition: tokenizer.h:48
khtml::XPath::Token::value
QString value
Definition: tokenizer.h:46
khtml::XPath::Token::type
int type
Definition: tokenizer.h:45
tokenizer.h
AND
AND
OR
OR
This file is part of the KDE documentation.
Documentation copyright © 1996-2023 The KDE developers.
Generated on Mon Feb 20 2023 00:00:00 by doxygen 1.9.6 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.

KHTML

Skip menu "KHTML"
  • Main Page
  • Namespace List
  • Namespace Members
  • Alphabetical List
  • Class List
  • Class Hierarchy
  • Class Members
  • File List
  • File Members
  • Related Pages

kdelibs-4.14.38 API Reference

Skip menu "kdelibs-4.14.38 API Reference"
  • DNSSD
  • Interfaces
  •   KHexEdit
  •   KMediaPlayer
  •   KSpeech
  •   KTextEditor
  • kconf_update
  • KDE3Support
  •   KUnitTest
  • KDECore
  • KDED
  • KDEsu
  • KDEUI
  • KDEWebKit
  • KDocTools
  • KFile
  • KHTML
  • KImgIO
  • KInit
  • kio
  • KIOSlave
  • KJS
  •   KJS-API
  •   WTF
  • kjsembed
  • KNewStuff
  • KParts
  • KPty
  • Kross
  • KUnitConversion
  • KUtils
  • Nepomuk
  • Plasma
  • Solid
  • Sonnet
  • ThreadWeaver
Report problems with this website to our bug tracking system.
Contact the specific authors with questions and comments about the page contents.

KDE® and the K Desktop Environment® logo are registered trademarks of KDE e.V. | Legal