• Skip to content
  • Skip to link menu
  • KDE API Reference
  • kdelibs-4.14.38 API Reference
  • KDE Home
  • Contact Us
 

KHTML

  • khtml
khtml_filter.cpp
Go to the documentation of this file.
1/* This file is part of the KDE project
2
3 Copyright (C) 2005 Ivor Hewitt <ivor@kde.org>
4 Copyright (C) 2008 Maksim Orlovich <maksim@kde.org>
5 Copyright (C) 2008 Vyacheslav Tokarev <tsjoker@gmail.com>
6
7 This library is free software; you can redistribute it and/or
8 modify it under the terms of the GNU Library General Public
9 License as published by the Free Software Foundation; either
10 version 2 of the License, or (at your option) any later version.
11
12 This library is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Library General Public License for more details.
16
17 You should have received a copy of the GNU Library General Public License
18 along with this library; see the file COPYING.LIB. If not, write to
19 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
20 Boston, MA 02110-1301, USA.
21*/
22
23#include "khtml_filter_p.h"
24#include <QDebug>
25
26// rolling hash parameters
27#define HASH_P (1997)
28#define HASH_Q (17509)
29// HASH_MOD = (HASH_P^7) % HASH_Q
30#define HASH_MOD (523)
31
32namespace khtml {
33
34// We only want a subset of features of wildcards -- just the
35// star, so we escape the rest before passing to QRegExp.
36// The \ is escaped due to a QRegExp bug.
37// ### we really should rather parse it ourselves, in order to
38// handle adblock-special things like | and ^ properly.
39static QRegExp fromAdBlockWildcard(const QString& wcStr) {
40 QRegExp rx;
41 rx.setPatternSyntax(QRegExp::Wildcard);
42
43 QString out;
44 for (int p = 0; p < wcStr.length(); ++p) {
45 QChar c = wcStr[p];
46 if (c == QLatin1Char('?'))
47 out += QLatin1String("[?]");
48 else if (c == QLatin1Char('['))
49 out += QLatin1String("[[]");
50 else if (c == QLatin1Char('\\'))
51 out += QLatin1String("[\\]");
52 else
53 out += c;
54 }
55
56 rx.setPattern(out);
57 return rx;
58}
59
60void FilterSet::addFilter(const QString& filterStr)
61{
62 QString filter = filterStr.trimmed();
63 if (filter.isEmpty()) {
64 return;
65 }
66
68 const QChar firstChar = filter.at(0);
69 if (firstChar == QLatin1Char('[') || firstChar == QLatin1Char('!') || firstChar == QLatin1Char('#') || filter.contains(QLatin1Char('#')))
70 return;
71
72 // Strip leading @@
73 if (filter.startsWith(QLatin1String("@@"))) {
74 filter.remove(0, 2);
75 }
76
77 // Strip options, we ignore them for now.
78 bool hadOptions = false;
79 int dollar = filter.lastIndexOf(QLatin1Char('$'));
80 if (dollar != -1) {
81 // Is it adblock's options delimiter or the special '$' char in a regular expression?
82 if (!filter.startsWith(QLatin1Char('/')) || !filter.endsWith(QLatin1Char('/'))) {
83 filter = filter.mid(0, dollar);
84 hadOptions = true;
85 }
86 }
87
88 // Is it a regexp filter?
89 if (filter.length() > 2 && filter.startsWith(QLatin1Char('/')) && filter.endsWith(QLatin1Char('/'))) {
90 // Ignore regexp that had options as we do not support them
91 if (hadOptions) {
92 return;
93 }
94 QString inside = filter.mid(1, filter.length()-2);
95 QRegExp rx(inside);
96 reFilters.append(rx);
97 //qDebug() << "R:" << inside;
98 return;
99 }
100 // Nope, a wildcard one.
101
102 // Disregard the rule if only one char is left or it contains unsupported adblock features ('|', "||", '^')
103 if (filter.length() < 2 || filter.contains(QLatin1Char('|')) || filter.contains(QLatin1Char('^'))) {
104 return;
105 }
106
107 // Strip wildcards at the ends
108 int first = 0;
109 int last = filter.length() - 1;
110
111 while (first < filter.length() && filter[first] == QLatin1Char('*'))
112 ++first;
113
114 while (last >= 0 && filter[last] == QLatin1Char('*'))
115 --last;
116
117 if (first > last)
118 filter = QLatin1String("*"); // erm... Well, they asked for it.
119 else
120 filter = filter.mid(first, last - first + 1);
121
122 // Now, do we still have any wildcard stuff left?
123 int aPos = filter.indexOf('*');
124 if (aPos != -1) {
125 // check if we can use RK first (and then check full RE for the rest) for better performance
126 if (aPos > 7) {
127 QRegExp rx = fromAdBlockWildcard(filter.mid(aPos) + QLatin1Char('*'));
128 // We pad the final r.e. with * so we can check for an exact match
129 stringFiltersMatcher.addWildedString(filter.mid(0, aPos), rx);
130 } else {
131 QRegExp rx = fromAdBlockWildcard(filter);
132 reFilters.append(rx);
133 }
134 } else {
135 // Fast path
136 stringFiltersMatcher.addString(filter);
137 }
138
139}
140
141bool FilterSet::isUrlMatched(const QString& url)
142{
143 if (stringFiltersMatcher.isMatched(url))
144 return true;
145
146 for (int c = 0; c < reFilters.size(); ++c)
147 {
148 if (url.contains(reFilters[c]))
149 return true;
150 }
151
152 return false;
153}
154
155QString FilterSet::urlMatchedBy(const QString& url)
156{
157 QString by;
158
159 if (stringFiltersMatcher.isMatched(url, &by))
160 return by;
161
162 for (int c = 0; c < reFilters.size(); ++c)
163 {
164 if (url.contains(reFilters[c]))
165 {
166 by = reFilters[c].pattern();
167 break;
168 }
169 }
170
171 return by;
172}
173
174void FilterSet::clear()
175{
176 reFilters.clear();
177 stringFiltersMatcher.clear();
178}
179
180
181void StringsMatcher::addString(const QString& pattern)
182{
183 if (pattern.length() < 8) {
184 // handle short string differently
185 shortStringFilters.append(pattern);
186 } else {
187 // use modified Rabin-Karp's algorithm with 8-length string hash
188 // i.e. store hash of first 8 chars in the HashMap for fast look-up
189 stringFilters.append(pattern);
190 int ind = stringFilters.size() - 1;
191 int current = 0;
192
193 // compute hash using rolling hash
194 // hash for string: x0,x1,x2...xn-1 will be:
195 // (p^(n-1)*x0 + p^(n-2)*x1 + ... + p * xn-2 + xn-1) % q
196 // where p and q some wisely-chosen integers
197 /*for (int k = 0; k < 8; ++k)*/
198 int len = pattern.length();
199 for (int k = len - 8; k < len; ++k)
200 current = (current * HASH_P + pattern[k].unicode()) % HASH_Q;
201
202 // insert computed hash value into HashMap
203 WTF::HashMap<int, QVector<int> >::iterator it = stringFiltersHash.find(current + 1);
204 if (it == stringFiltersHash.end()) {
205 QVector<int> list;
206 list.append(ind);
207 stringFiltersHash.add(current + 1, list);
208 fastLookUp.setBit(current);
209 } else {
210 it->second.append(ind);
211 }
212 }
213}
214
215void StringsMatcher::addWildedString(const QString& prefix, const QRegExp& rx)
216{
217 rePrefixes.append(prefix);
218 reFilters.append(rx);
219 int index = -rePrefixes.size();
220
221 int current = 0;
222 for (int k = 0; k < 8; ++k)
223 current = (current * HASH_P + prefix[k].unicode()) % HASH_Q;
224
225 // insert computed hash value into HashMap
226 WTF::HashMap<int, QVector<int> >::iterator it = stringFiltersHash.find(current + 1);
227 if (it == stringFiltersHash.end()) {
228 QVector<int> list;
229 list.append(index);
230 stringFiltersHash.add(current + 1, list);
231 fastLookUp.setBit(current);
232 } else {
233 it->second.append(index);
234 }
235}
236
237bool StringsMatcher::isMatched(const QString& str, QString *by) const
238{
239 // check short strings first
240 for (int i = 0; i < shortStringFilters.size(); ++i) {
241 if (str.contains(shortStringFilters[i]))
242 {
243 if (by != 0) *by = shortStringFilters[i];
244 return true;
245 }
246 }
247
248 int len = str.length();
249 int k;
250
251 int current = 0;
252 int next = 0;
253 // compute hash for first 8 characters
254 for (k = 0; k < 8 && k < len; ++k)
255 current = (current * HASH_P + str[k].unicode()) % HASH_Q;
256
257 WTF::HashMap<int, QVector<int> >::const_iterator hashEnd = stringFiltersHash.end();
258 // main Rabin-Karp's algorithm loop
259 for (k = 7; k < len; ++k, current = next) {
260 // roll the hash if not at the end
261 // (calculate hash for the next iteration)
262 if (k + 1 < len)
263 next = (HASH_P * ((current + HASH_Q - ((HASH_MOD * str[k - 7].unicode()) % HASH_Q)) % HASH_Q) + str[k + 1].unicode()) % HASH_Q;
264
265 if (!fastLookUp.testBit(current))
266 continue;
267
268 // look-up the hash in the HashMap and check all strings
269 WTF::HashMap<int, QVector<int> >::const_iterator it = stringFiltersHash.find(current + 1);
270
271 // check possible strings
272 if (it != hashEnd) {
273 for (int j = 0; j < it->second.size(); ++j) {
274 int index = it->second[j];
275 // check if we got simple string or REs prefix
276 if (index >= 0) {
277 int flen = stringFilters[index].length();
278 if (k - flen + 1 >= 0 && stringFilters[index] == str.midRef(k - flen + 1 , flen))
279 {
280 if (by != 0) *by = stringFilters[index];
281 return true;
282 }
283 } else {
284 index = -index - 1;
285 int flen = rePrefixes[index].length();
286 if (k - 8 + flen < len && rePrefixes[index] == str.midRef(k - 7, flen))
287 {
288 int remStart = k - 7 + flen;
289 QString remainder = QString::fromRawData(str.unicode() + remStart,
290 str.length() - remStart);
291 if (reFilters[index].exactMatch(remainder)) {
292 if (by != 0) *by = rePrefixes[index]+reFilters[index].pattern();
293 return true;
294 }
295 }
296 }
297 }
298 }
299 }
300
301 return false;
302}
303
304void StringsMatcher::clear()
305{
306 stringFilters.clear();
307 shortStringFilters.clear();
308 reFilters.clear();
309 rePrefixes.clear();
310 stringFiltersHash.clear();
311 fastLookUp.resize(HASH_Q);
312 fastLookUp.fill(0, 0, HASH_Q);
313}
314
315}
316
317// kate: indent-width 4; replace-tabs on; tab-width 4; space-indent on;
QVector
Definition: khtmlview.h:36
khtml::FilterSet::addFilter
void addFilter(const QString &filter)
Definition: khtml_filter.cpp:60
khtml::FilterSet::clear
void clear()
Definition: khtml_filter.cpp:174
khtml::FilterSet::urlMatchedBy
QString urlMatchedBy(const QString &url)
Definition: khtml_filter.cpp:155
khtml::FilterSet::isUrlMatched
bool isUrlMatched(const QString &url)
Definition: khtml_filter.cpp:141
khtml::StringsMatcher::clear
void clear()
Definition: khtml_filter.cpp:304
khtml::StringsMatcher::isMatched
bool isMatched(const QString &str, QString *by=0) const
Definition: khtml_filter.cpp:237
khtml::StringsMatcher::addWildedString
void addWildedString(const QString &prefix, const QRegExp &rx)
Definition: khtml_filter.cpp:215
khtml::StringsMatcher::addString
void addString(const QString &pattern)
Definition: khtml_filter.cpp:181
HASH_Q
#define HASH_Q
Definition: khtml_filter.cpp:28
HASH_MOD
#define HASH_MOD
Definition: khtml_filter.cpp:30
HASH_P
#define HASH_P
Definition: khtml_filter.cpp:27
khtml_filter_p.h
prefix
QString prefix()
pattern
QString pattern(Mode mode=Reading)
list
QStringList list(const QString &fileClass)
next
KAction * next(const QObject *recvr, const char *slot, QObject *parent)
khtml
khtml::ind
static QDebug & ind(const QDebug &dbgIn, int ind)
Definition: khtml_childframe.cpp:60
khtml::fromAdBlockWildcard
static QRegExp fromAdBlockWildcard(const QString &wcStr)
Definition: khtml_filter.cpp:39
This file is part of the KDE documentation.
Documentation copyright © 1996-2023 The KDE developers.
Generated on Mon Feb 20 2023 00:00:00 by doxygen 1.9.6 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.

KHTML

Skip menu "KHTML"
  • Main Page
  • Namespace List
  • Namespace Members
  • Alphabetical List
  • Class List
  • Class Hierarchy
  • Class Members
  • File List
  • File Members
  • Related Pages

kdelibs-4.14.38 API Reference

Skip menu "kdelibs-4.14.38 API Reference"
  • DNSSD
  • Interfaces
  •   KHexEdit
  •   KMediaPlayer
  •   KSpeech
  •   KTextEditor
  • kconf_update
  • KDE3Support
  •   KUnitTest
  • KDECore
  • KDED
  • KDEsu
  • KDEUI
  • KDEWebKit
  • KDocTools
  • KFile
  • KHTML
  • KImgIO
  • KInit
  • kio
  • KIOSlave
  • KJS
  •   KJS-API
  •   WTF
  • kjsembed
  • KNewStuff
  • KParts
  • KPty
  • Kross
  • KUnitConversion
  • KUtils
  • Nepomuk
  • Plasma
  • Solid
  • Sonnet
  • ThreadWeaver
Report problems with this website to our bug tracking system.
Contact the specific authors with questions and comments about the page contents.

KDE® and the K Desktop Environment® logo are registered trademarks of KDE e.V. | Legal