Hubbub
|
#include <assert.h>
#include <stdbool.h>
#include <string.h>
#include <parserutils/charset/mibenum.h>
#include <hubbub/types.h>
#include "utils/utils.h"
#include "detect.h"
Go to the source code of this file.
Macros | |
#define | PEEK(a) |
#define | ADVANCE(a) |
#define | ISSPACE(a) |
Functions | |
static uint16_t | hubbub_charset_read_bom (const uint8_t *data, size_t len) |
Inspect the beginning of a buffer of data for the presence of a UTF Byte Order Mark. More... | |
static uint16_t | hubbub_charset_scan_meta (const uint8_t *data, size_t len) |
Search for a meta charset within a buffer of data. More... | |
static uint16_t | hubbub_charset_parse_attributes (const uint8_t **pos, const uint8_t *end) |
Parse attributes on a meta tag. More... | |
static bool | hubbub_charset_get_attribute (const uint8_t **data, const uint8_t *end, const uint8_t **name, uint32_t *namelen, const uint8_t **value, uint32_t *valuelen) |
Extract an attribute from the data stream. More... | |
parserutils_error | hubbub_charset_extract (const uint8_t *data, size_t len, uint16_t *mibenum, uint32_t *source) |
Extract a charset from a chunk of data. More... | |
uint16_t | hubbub_charset_parse_content (const uint8_t *value, uint32_t valuelen) |
Parse a content= attribute's value. More... | |
void | hubbub_charset_fix_charset (uint16_t *charset) |
Fix charsets, according to the override table in HTML5, section 8.2.2.2. More... | |
#define ADVANCE | ( | a | ) |
#define ISSPACE | ( | a | ) |
Definition at line 198 of file detect.c.
Referenced by hubbub_charset_get_attribute(), hubbub_charset_parse_attributes(), hubbub_charset_parse_content(), and hubbub_charset_scan_meta().
#define PEEK | ( | a | ) |
Definition at line 184 of file detect.c.
Referenced by hubbub_charset_scan_meta().
parserutils_error hubbub_charset_extract | ( | const uint8_t * | data, |
size_t | len, | ||
uint16_t * | mibenum, | ||
uint32_t * | source | ||
) |
Extract a charset from a chunk of data.
data | Pointer to buffer containing data |
len | Buffer length |
mibenum | Pointer to location containing current MIB enum |
source | Pointer to location containing current charset source |
::mibenum and ::source will be updated on exit
The larger a chunk of data fed to this routine, the better, as it allows charset autodetection access to a larger dataset for analysis.
Meaning of *source on entry:
CONFIDENT - Do not pass Go, do not attempt auto-detection.
TENTATIVE - We've tried to autodetect already, but subsequently discovered that we don't actually support the detected charset. Thus, we've defaulted to Windows-1252. Don't perform auto-detection again, as it would be futile. (This bit diverges from the spec)
UNKNOWN - No autodetection performed yet. Get on with it.
Definition at line 43 of file detect.c.
References HUBBUB_CHARSET_CONFIDENT, hubbub_charset_fix_charset(), hubbub_charset_read_bom(), hubbub_charset_scan_meta(), HUBBUB_CHARSET_TENTATIVE, len, and SLEN.
Referenced by hubbub_parser_create().
void hubbub_charset_fix_charset | ( | uint16_t * | charset | ) |
Fix charsets, according to the override table in HTML5, section 8.2.2.2.
Character encoding requirements http://www.whatwg.org/specs/web-apps/current-work/#character0
charset | Pointer to charset value to fix |
Definition at line 666 of file detect.c.
References SLEN.
Referenced by hubbub_charset_extract(), hubbub_parser_create(), and process_meta_in_head().
|
static |
Extract an attribute from the data stream.
data | Pointer to pointer to current location (updated on exit) |
end | Pointer to end of data stream |
name | Pointer to location to receive attribute name |
namelen | Pointer to location to receive attribute name length |
value | Pointer to location to receive attribute value |
valuelen | Pointer to location to receive attribute value length |
Note: The caller should heed the returned lengths; these are the only indicator that useful content resides in name or value.
Definition at line 486 of file detect.c.
Referenced by hubbub_charset_parse_attributes(), and hubbub_charset_scan_meta().
|
static |
Parse attributes on a meta tag.
pos | Pointer to pointer to current location (updated on exit) |
end | Pointer to end of data stream |
Definition at line 299 of file detect.c.
References hubbub_charset_get_attribute(), hubbub_charset_parse_content(), ISSPACE, name, and SLEN.
Referenced by hubbub_charset_scan_meta().
uint16_t hubbub_charset_parse_content | ( | const uint8_t * | value, |
uint32_t | valuelen | ||
) |
Parse a content= attribute's value.
value | Attribute's value |
valuelen | Length of value |
Definition at line 368 of file detect.c.
Referenced by hubbub_charset_parse_attributes(), and process_meta_in_head().
|
static |
Inspect the beginning of a buffer of data for the presence of a UTF Byte Order Mark.
data | Pointer to buffer containing data |
len | Buffer length |
Definition at line 161 of file detect.c.
Referenced by hubbub_charset_extract().
|
static |
Search for a meta charset within a buffer of data.
data | Pointer to buffer containing data |
len | Length of buffer |
Definition at line 209 of file detect.c.
References ADVANCE, hubbub_charset_get_attribute(), hubbub_charset_parse_attributes(), ISSPACE, len, min, PEEK, and SLEN.
Referenced by hubbub_charset_extract().