3 This file is part of the Public Domain C Library (PDCLib).
4 Permission is granted to use, modify, and / or redistribute at will.
13 /* Use of the mbstate:
15 * _StUC[0] is the current decoding state
16 * _St32[1] is the character accumulated so far
/* Accessors for the two mbstate_t fields this codec uses. Both expand to an
 * lvalue reached through a pointer named p_s, so they can only be used where
 * such a pointer is in scope. */
// state: _StUC[0], the current decoding state
32 #define state (p_s->_StUC[0])
// accum: _St32[1], the character accumulated so far
33 #define accum (p_s->_St32[1])
35 #define START_CONVERSION \
38 #define END_CONVERSION \
42 #define FINISH(_r) do { \
44 goto end_conversion; \
47 #define OUT32(_c) do { \
48 (*((*p_outbuf)++)) = (_c); \
50 _PDCLIB_UNDEFINED(accum); \
/* Makes the enclosing function return false unless the byte held in the
 * local variable `c` has its top two bits equal to 10 (pattern 10xxxxxx).
 * This is a direct `return`, not FINISH(false), so control does not pass
 * through the end_conversion label that FINISH jumps to. */
53 #define CHECK_CONTINUATION \
54 do { if((c & 0xC0) != 0x80) return false; } while(0)
/*
 * Decodes UTF-8 bytes read through *p_inbuf into char32_t values.
 *
 * The main loop runs while both *p_outsz and *p_insz are non-zero. The value
 * of a partially decoded character is built up in `accum` (a field of *p_s),
 * which lets a multi-byte character be completed across loop iterations.
 *
 * Checks visible here: a four-byte sequence whose value is <= 0xFFFF, and
 * any value above 0x10FFFF, end the conversion with FINISH(false).
 *
 * NOTE(review): only part of the body is visible in this excerpt. How the
 * pointers/counters are advanced, how `state` is updated, and what is
 * returned on success should be confirmed against the full source.
 */
56 static bool utf8toc32(
57 char32_t **restrict p_outbuf,
58 size_t *restrict p_outsz,
59 const char **restrict p_inbuf,
60 size_t *restrict p_insz,
61 mbstate_t *restrict p_s
65 while(*p_outsz && *p_insz) {
// Current input byte as unsigned char, so the range tests below compare
// against values in 0..255.
66 unsigned char c = **p_inbuf;
73 } else if(c <= 0xDF) {
// Two-byte lead: its low 5 bits become bits 6..10 of the result.
74 accum = (c & 0x1F) << 6;
76 } else if(c <= 0xEF) {
// Three-byte lead: its low 4 bits become bits 12..15 of the result.
77 accum = (c & 0x0F) << 12;
79 } else if(c <= 0xF4) {
// Four-byte lead: its low 3 bits become bits 18..20 of the result.
80 accum = (c & 0x07) << 18;
83 // Bytes above 0xF4 are illegal here (5+ byte forms, or leads for values beyond 0x10FFFF)
// Merge the low 6 bits of this byte into the accumulated value.
91 c32 = accum | (c & 0x3F);
93 // Overlong sequence (e.g. NUL injection)
102 accum |= (c & 0x3F) << 6;
109 c32 = accum | (c & 0x3F);
// 0xD800..0xDFFF is the UTF-16 surrogate range; the action taken for it is
// on a line not visible in this excerpt.
116 if(c32 >= 0xD800 && c32 <= 0xDFFF)
124 accum |= (c & 0x3F) << 12;
130 accum |= (c & 0x3F) << 6;
137 c32 = accum | (c & 0x3F);
// The four-byte path must yield a value above 0xFFFF; anything lower is
// rejected.
140 if(c32 <= 0xFFFF) FINISH(false);
// Values beyond 0x10FFFF are rejected.
143 if(c32 > 0x10FFFF) FINISH(false);
// Always-failing assertion: a string literal is non-null, so !"..." is 0.
149 assert(!"Invalid state");
/*
 * Encodes char32_t values read through *p_inbuf as UTF-8 bytes.
 *
 * The value being encoded is held in `accum` (a field of *p_s). In the code
 * visible here, values up to 0x7FF get a lead byte built on 0xC0, values up
 * to 0xFFFF one built on 0xE0, and values up to 0x10FFFF one built on 0xF0;
 * the following bytes are 0x80 combined with successive 6-bit groups of
 * accum.
 *
 * NOTE(review): only part of the body is visible in this excerpt. Handling
 * of single-byte values, of values above 0x10FFFF, the counter updates and
 * the return value should be confirmed against the full source.
 */
165 static bool c32toutf8(
166 char **restrict p_outbuf,
167 size_t *restrict p_outsz,
168 const char32_t **restrict p_inbuf,
169 size_t *restrict p_insz,
170 mbstate_t *restrict p_s
// Output position that the stores below write to.
175 char *c8 = *p_outbuf;
// Continuation byte carrying bits 12..17 of accum.
178 *c8 = 0x80 | ((accum >> 12) & 0x3F);
// Continuation byte carrying bits 6..11 of accum.
183 *c8 = 0x80 | ((accum >> 6) & 0x3F);
// Continuation byte carrying bits 0..5 of accum.
188 *c8 = 0x80 | (accum & 0x3F);
// NOTE(review): presumably marks accum as no longer holding a meaningful
// value - confirm against the definition of _PDCLIB_UNDEFINED.
190 _PDCLIB_UNDEFINED(accum);
204 _PDCLIB_UNDEFINED(accum);
205 } else if(accum <= 0x7FF) {
// Lead byte 110xxxxx holding bits 6..10 of accum.
206 *c8 = 0xC0 | (accum >> 6);
208 } else if(accum <= 0xFFFF) {
// Lead byte 1110xxxx holding bits 12..15 of accum.
209 *c8 = 0xE0 | (accum >> 12);
211 } else if(accum <= 0x10FFFF) {
// Lead byte 11110xxx holding bits 18..20 of accum.
212 *c8 = 0xF0 | (accum >> 18);
229 #include <_PDCLIB_test.h>
234 // Valid conversion & back
236 static const char* input = "abcde" "\xDF\xBF" "\xEF\xBF\xBF"
241 char32_t *c32ptr = &c32out[0];
243 char *chrptr = (char*) &input[0];
244 size_t chrrem = strlen(input);
245 mbstate_t mbs = { 0 };
247 TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs));
248 TESTCASE(c32rem == 0);
249 TESTCASE(chrrem == 0);
250 TESTCASE(c32ptr == &c32out[8]);
251 TESTCASE(chrptr == &input[strlen(input)]);
252 TESTCASE(c32out[0] == 'a' && c32out[1] == 'b' && c32out[2] == 'c' &&
253 c32out[3] == 'd' && c32out[4] == 'e' && c32out[5] == 0x7FF &&
254 c32out[6] == 0xFFFF && c32out[7] == 0x10FFFF);
256 char chrout[strlen(input)];
260 chrrem = strlen(input);
261 TESTCASE(c32toutf8(&chrptr, &chrrem, &c32ptr, &c32rem, &mbs));
262 TESTCASE(c32rem == 0);
263 TESTCASE(chrrem == 0);
264 TESTCASE(c32ptr == &c32out[8]);
265 TESTCASE(chrptr == &chrout[strlen(input)]);
266 TESTCASE(memcmp(chrout, input, strlen(input)) == 0);
268 // Multi-part conversion: a two-byte character supplied one byte per call
269 static const char* mpinput = "\xDF\xBF";
272 chrptr = &mpinput[0];
274 TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs));
275 TESTCASE(c32ptr == &c32out[0]);
276 TESTCASE(c32rem == 8);
277 TESTCASE(chrptr == &mpinput[1]);
278 TESTCASE(chrrem == 0);
280 TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs));
281 TESTCASE(c32ptr == &c32out[1]);
282 TESTCASE(c32rem == 7);
283 TESTCASE(chrptr == &mpinput[2]);
284 TESTCASE(chrrem == 0);
286 // Invalid conversions: overlong encodings of NUL must be rejected
289 const char* nul2 = "\xC0\x80";
294 TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);
295 memset(&mbs, 0, sizeof mbs);
296 const char* nul3 = "\xE0\x80\x80";
301 TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);
302 memset(&mbs, 0, sizeof mbs);
303 const char* nul4 = "\xF0\x80\x80\x80";
308 TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);
310 // Input that starts on a continuation byte must be rejected
311 const char* cont = "\x80";
316 TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);