3 This file is part of the Public Domain C Library (PDCLib).
4 Permission is granted to use, modify, and / or redistribute at will.
13 /* Use of the mbstate:
15 * _StUC[0] is the current decoding state
16 * _St32[1] is the character accumulated so far
/* Shorthand accessors into the caller-supplied mbstate_t. Both expand to an
 * lvalue and require a pointer named `p_s` to be in scope at the point of use.
 *   state - p_s->_StUC[0], the current decoding state
 *   accum - p_s->_St32[1], the character accumulated so far
 */
32 #define state (p_s->_StUC[0])
33 #define accum (p_s->_St32[1])
35 #define START_CONVERSION \
38 #define END_CONVERSION \
42 #define FINISH(_r) do { \
44 goto end_conversion; \
47 #define OUT32(_c) do { \
49 (*((*p_outbuf)++)) = (_c); \
51 _PDCLIB_UNDEFINED(accum); \
/* Validate that the current input byte `c` (which must be in scope) is a
 * UTF-8 continuation byte, i.e. its top two bits are 10 (0x80..0xBF).
 * If it is not, this returns `false` directly from the enclosing function;
 * note that it uses a plain `return` rather than FINISH().
 */
54 #define CHECK_CONTINUATION \
55 do { if((c & 0xC0) != 0x80) return false; } while(0)
57 static bool utf8toc32(
58 char32_t **restrict p_outbuf,
59 size_t *restrict p_outsz,
60 const char **restrict p_inbuf,
61 size_t *restrict p_insz,
62 mbstate_t *restrict p_s
66 while(*p_outsz && *p_insz) {
67 unsigned char c = **p_inbuf;
74 } else if(c <= 0xDF) {
75 accum = (c & 0x1F) << 6;
77 } else if(c <= 0xEF) {
78 accum = (c & 0x0F) << 12;
80 } else if(c <= 0xF4) {
81 accum = (c & 0x07) << 18;
84 // Lead byte above 0xF4 is illegal: it would encode past U+10FFFF or begin a 5+ byte sequence
92 c32 = accum | (c & 0x3F);
94 // Overlong sequence (e.g. NUL injection)
103 accum |= (c & 0x3F) << 6;
110 c32 = accum | (c & 0x3F);
117 if(c32 >= 0xD800 && c32 <= 0xDFFF)
125 accum |= (c & 0x3F) << 12;
131 accum |= (c & 0x3F) << 6;
138 c32 = accum | (c & 0x3F);
141 if(c32 <= 0xFFFF) FINISH(false);
144 if(c32 > 0x10FFFF) FINISH(false);
150 assert(!"Invalid state");
166 static bool c32toutf8(
167 char **restrict p_outbuf,
168 size_t *restrict p_outsz,
169 const char32_t **restrict p_inbuf,
170 size_t *restrict p_insz,
171 mbstate_t *restrict p_s
179 outc = 0x80 | ((accum >> 12) & 0x3F);
184 outc = 0x80 | ((accum >> 6) & 0x3F);
189 outc = 0x80 | (accum & 0x3F);
191 _PDCLIB_UNDEFINED(accum);
205 _PDCLIB_UNDEFINED(accum);
206 } else if(accum <= 0x7FF) {
207 outc = 0xC0 | (accum >> 6);
209 } else if(accum <= 0xFFFF) {
210 outc = 0xE0 | (accum >> 12);
212 } else if(accum <= 0x10FFFF) {
213 outc = 0xF0 | (accum >> 18);
230 _PDCLIB_charcodec _PDCLIB_utf8_codec = {
231 .__mbstoc32s = utf8toc32,
232 .__c32stombs = c32toutf8,
238 #include <_PDCLIB_test.h>
243 // Valid conversion & back
245 static const char* input = "abcde" "\xDF\xBF" "\xEF\xBF\xBF"
250 char32_t *c32ptr = &c32out[0];
252 char *chrptr = (char*) &input[0];
253 size_t chrrem = strlen(input);
254 mbstate_t mbs = { 0 };
256 TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs));
257 TESTCASE(c32rem == 0);
258 TESTCASE(chrrem == 0);
259 TESTCASE(c32ptr == &c32out[8]);
260 TESTCASE(chrptr == &input[strlen(input)]);
261 TESTCASE(c32out[0] == 'a' && c32out[1] == 'b' && c32out[2] == 'c' &&
262 c32out[3] == 'd' && c32out[4] == 'e' && c32out[5] == 0x7FF &&
263 c32out[6] == 0xFFFF && c32out[7] == 0x10FFFF);
265 char chrout[strlen(input)];
269 chrrem = strlen(input);
270 TESTCASE(c32toutf8(&chrptr, &chrrem, &c32ptr, &c32rem, &mbs));
271 TESTCASE(c32rem == 0);
272 TESTCASE(chrrem == 0);
273 TESTCASE(c32ptr == &c32out[8]);
274 TESTCASE(chrptr == &chrout[strlen(input)]);
275 TESTCASE(memcmp(chrout, input, strlen(input)) == 0);
277 // Multi-part conversion
278 static const char* mpinput = "\xDF\xBF";
281 chrptr = &mpinput[0];
283 TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs));
284 TESTCASE(c32ptr == &c32out[0]);
285 TESTCASE(c32rem == 8);
286 TESTCASE(chrptr == &mpinput[1]);
287 TESTCASE(chrrem == 0);
289 TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs));
290 TESTCASE(c32ptr == &c32out[1]);
291 TESTCASE(c32rem == 7);
292 TESTCASE(chrptr == &mpinput[2]);
293 TESTCASE(chrrem == 0);
295 // Invalid conversions
298 const char* nul2 = "\xC0\x80";
303 TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);
304 memset(&mbs, 0, sizeof mbs);
305 const char* nul3 = "\xE0\x80\x80";
310 TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);
311 memset(&mbs, 0, sizeof mbs);
312 const char* nul4 = "\xF0\x80\x80\x80";
317 TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);
319 // Starting on a continuation
320 const char* cont = "\x80";
325 TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);