Libparserutils
utf16.c
Go to the documentation of this file.
1/*
2 * This file is part of LibParserUtils.
3 * Licensed under the MIT License,
4 * http://www.opensource.org/licenses/mit-license.php
5 * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
6 */
7
11
12#include <stdbool.h>
13#include <stdlib.h>
14#include <string.h>
15
17
28 size_t len, uint32_t *ucs4, size_t *clen)
29{
30 const uint16_t *ss = (const uint16_t *) (const void *) s;
31
32 if (s == NULL || ucs4 == NULL || clen == NULL)
34
35 if (len < 2)
37
38 if (*ss < 0xD800 || *ss > 0xDFFF) {
39 *ucs4 = *ss;
40 *clen = 2;
41 } else if (0xD800 <= *ss && *ss <= 0xDBFF) {
42 /* High-surrogate code unit. */
43 if (len < 4)
45
46 if (0xDC00 <= ss[1] && ss[1] <= 0xDFFF) {
47 /* We have a valid surrogate pair. */
48 *ucs4 = (((ss[0] & 0x3FF) << 10) | (ss[1] & 0x3FF))
49 + (1<<16);
50 *clen = 4;
51 } else {
53 }
54 } else {
55 /* Low-surrogate code unit. */
57 }
58
59 return PARSERUTILS_OK;
60}
61
71 size_t *len)
72{
73 uint16_t *ss = (uint16_t *) (void *) s;
74 uint32_t l = 0;
75
76 if (s == NULL || len == NULL)
78 else if (ucs4 < 0x10000) {
79 *ss = (uint16_t) ucs4;
80 l = 2;
81 } else if (ucs4 < 0x110000) {
82 ss[0] = 0xD800 | (((ucs4 >> 16) & 0x1f) - 1) | (ucs4 >> 10);
83 ss[1] = 0xDC00 | (ucs4 & 0x3ff);
84 l = 4;
85 } else {
87 }
88
89 *len = l;
90
91 return PARSERUTILS_OK;
92}
93
103 size_t *len)
104{
105 const uint16_t *ss = (const uint16_t *) (const void *) s;
106 const uint16_t *end = (const uint16_t *) (const void *) (s + max);
107 int l = 0;
108
109 if (s == NULL || len == NULL)
110 return PARSERUTILS_BADPARM;
111
112 while (ss < end) {
113 if (*ss < 0xD800 || 0xDFFF < *ss)
114 ss++;
115 else
116 ss += 2;
117
118 l++;
119 }
120
121 *len = l;
122
123 return PARSERUTILS_OK;
124}
125
134 size_t *len)
135{
136 const uint16_t *ss = (const uint16_t *) (const void *) s;
137
138 if (s == NULL || len == NULL)
139 return PARSERUTILS_BADPARM;
140
141 if (*ss < 0xD800 || 0xDFFF < *ss)
142 *len = 2;
143 else
144 *len = 4;
145
146 return PARSERUTILS_OK;
147}
148
159 uint32_t *prevoff)
160{
161 const uint16_t *ss = (const uint16_t *) (const void *) s;
162
163 if (s == NULL || prevoff == NULL)
164 return PARSERUTILS_BADPARM;
165
166 if (off < 2)
167 *prevoff = 0;
168 else if (ss[-1] < 0xDC00 || ss[-1] > 0xDFFF)
169 *prevoff = off - 2;
170 else
171 *prevoff = (off < 4) ? 0 : off - 4;
172
173 return PARSERUTILS_OK;
174}
175
187 uint32_t off, uint32_t *nextoff)
188{
189 const uint16_t *ss = (const uint16_t *) (const void *) s;
190
191 if (s == NULL || off >= len || nextoff == NULL)
192 return PARSERUTILS_BADPARM;
193
194 if (len - off < 4)
195 *nextoff = len;
196 else if (ss[1] < 0xD800 || ss[1] > 0xDBFF)
197 *nextoff = off + 2;
198 else
199 *nextoff = (len - off < 6) ? len : off + 4;
200
201 return PARSERUTILS_OK;
202}
203
215 uint32_t len, uint32_t off, uint32_t *nextoff)
216{
217 const uint16_t *ss = (const uint16_t *) (const void *) s;
218
219 if (s == NULL || off >= len || nextoff == NULL)
220 return PARSERUTILS_BADPARM;
221
222 while (1) {
223 if (len - off < 4) {
225 } else if (ss[1] < 0xD800 || ss[1] > 0xDFFF) {
226 *nextoff = off + 2;
227 break;
228 } else if (ss[1] >= 0xD800 && ss[1] <= 0xDBFF) {
229 if (len - off < 6)
231
232 if (ss[2] >= 0xDC00 && ss[2] <= 0xDFFF) {
233 *nextoff = off + 4;
234 break;
235 } else {
236 ss++;
237 off += 2;
238 }
239 }
240 }
241
242 return PARSERUTILS_OK;
243}
244
size_t len
Definition codec_8859.c:23
parserutils_error
Definition errors.h:18
@ PARSERUTILS_OK
Definition errors.h:19
@ PARSERUTILS_NEEDDATA
Definition errors.h:25
@ PARSERUTILS_INVALID
Definition errors.h:23
@ PARSERUTILS_BADPARM
Definition errors.h:22
parserutils_error parserutils_charset_utf16_length(const uint8_t *s, size_t max, size_t *len)
Calculate the length (in characters) of a bounded UTF-16 string.
Definition utf16.c:102
parserutils_error parserutils_charset_utf16_to_ucs4(const uint8_t *s, size_t len, uint32_t *ucs4, size_t *clen)
Convert a UTF-16 sequence into a single UCS-4 character.
Definition utf16.c:27
parserutils_error parserutils_charset_utf16_next_paranoid(const uint8_t *s, uint32_t len, uint32_t off, uint32_t *nextoff)
Find next legal UTF-16 char in string.
Definition utf16.c:214
parserutils_error parserutils_charset_utf16_from_ucs4(uint32_t ucs4, uint8_t *s, size_t *len)
Convert a single UCS-4 character into a UTF-16 sequence.
Definition utf16.c:70
parserutils_error parserutils_charset_utf16_prev(const uint8_t *s, uint32_t off, uint32_t *prevoff)
Find previous legal UTF-16 char in string.
Definition utf16.c:158
parserutils_error parserutils_charset_utf16_next(const uint8_t *s, uint32_t len, uint32_t off, uint32_t *nextoff)
Find next legal UTF-16 char in string.
Definition utf16.c:186
parserutils_error parserutils_charset_utf16_char_byte_length(const uint8_t *s, size_t *len)
Calculate the length (in bytes) of a UTF-16 character.
Definition utf16.c:133
UTF-16 manipulation functions (interface).
#define max(a, b)
Definition utils.h:12