3 This file is part of the Public Domain C Library (PDCLib).
4 Permission is granted to use, modify, and / or redistribute at will.
12 #include <_PDCLIB_encoding.h>
14 /* Use of the mbstate:
16 * _StUC[0] is the current decoding state
17 * _St32[1] is the character accumulated so far
/* Helper macros shared by the two converters in this file. All of them
 * rely on names being in scope at the point of use rather than taking
 * them as arguments:
 *
 *   state, accum        - the two mbstate_t slots, reached through p_s:
 *                         p_s->_StUC[0] and p_s->_St32[1] respectively.
 *   FINISH(_r)          - leaves the conversion by jumping to the
 *                         end_conversion label.
 *                         NOTE(review): the line that consumes _r is not
 *                         visible here; presumably it becomes the
 *                         function's result -- confirm.
 *   OUT32(_c)           - stores _c through *p_outbuf, advances that
 *                         pointer by one element, then passes accum to
 *                         _PDCLIB_UNDEFINED.
 *   CHECK_CONTINUATION  - returns false from the enclosing function
 *                         unless the top two bits of c are 10, i.e.
 *                         (c & 0xC0) == 0x80. Needs a variable c in scope.
 *
 * NOTE(review): the bodies of START_CONVERSION and END_CONVERSION are not
 * visible in this excerpt.
 */
33 #define state (p_s->_StUC[0])
34 #define accum (p_s->_St32[1])
36 #define START_CONVERSION \
39 #define END_CONVERSION \
43 #define FINISH(_r) do { \
45 goto end_conversion; \
48 #define OUT32(_c) do { \
50 (*((*p_outbuf)++)) = (_c); \
52 _PDCLIB_UNDEFINED(accum); \
55 #define CHECK_CONTINUATION \
56 do { if((c & 0xC0) != 0x80) return false; } while(0)
/* utf8toc32: decode UTF-8 bytes read through *p_inbuf into char32_t
 * values written through *p_outbuf. The partially decoded code point is
 * kept in *p_s (via the state and accum macros).
 *
 *   p_outbuf, p_outsz - output cursor and its remaining count
 *   p_inbuf,  p_insz  - input cursor and its remaining count
 *   p_s               - conversion state
 *
 * Returns false on the malformed-input paths visible below (a failed
 * CHECK_CONTINUATION, or FINISH(false)).
 * NOTE(review): the success return and the cursor/count updates are on
 * lines not visible in this excerpt; the tests in this file expect true
 * for well-formed input and expect a sequence split across two calls to
 * be resumed from the saved state.
 */
58 static bool utf8toc32(
59 char32_t *restrict *restrict p_outbuf,
60 size_t *restrict p_outsz,
61 const char *restrict *restrict p_inbuf,
62 size_t *restrict p_insz,
63 mbstate_t *restrict p_s
/* Keep going only while both counts are non-zero. */
67 while(*p_outsz && *p_insz) {
/* Read the byte as unsigned so the range comparisons below behave the
 * same whether plain char is signed or not. */
68 unsigned char c = **p_inbuf;
/* Lead-byte dispatch: the payload bits of the lead byte are moved to
 * their final position in accum (5 bits << 6, 4 bits << 12, 3 bits << 18).
 * NOTE(review): in the branches visible here a byte in 0x80..0xBF would
 * also satisfy c <= 0xDF; confirm that a stray continuation byte is
 * rejected, since the tests expect "\x80" to fail. */
75 } else if(c <= 0xDF) {
76 accum = (c & 0x1F) << 6;
78 } else if(c <= 0xEF) {
79 accum = (c & 0x0F) << 12;
81 } else if(c <= 0xF4) {
82 accum = (c & 0x07) << 18;
85 // 5+byte sequence illegal
/* Last continuation byte: its low six bits complete the code point. */
93 c32 = accum | (c & 0x3F);
95 // Overlong sequence (e.g. NUL injection)
/* Intermediate continuation byte: six more bits folded into accum. */
104 accum |= (c & 0x3F) << 6;
111 c32 = accum | (c & 0x3F);
/* Surrogate range test.
 * NOTE(review): the statement controlled by this condition is not
 * visible; presumably the value is rejected -- confirm. */
118 if(c32 >= 0xD800 && c32 <= 0xDFFF)
126 accum |= (c & 0x3F) << 12;
132 accum |= (c & 0x3F) << 6;
139 c32 = accum | (c & 0x3F);
/* Range checks on the assembled value: anything that fits in 16 bits or
 * lies above 0x10FFFF ends the conversion with false. */
142 if(c32 <= 0xFFFF) FINISH(false);
145 if(c32 > 0x10FFFF) FINISH(false);
/* The string literal is non-null, so !"..." is 0 and the assert always
 * fires when this line is reached. */
151 assert(!"Invalid state");
/* c32toutf8: encode char32_t values read through *p_inbuf as UTF-8 bytes
 * written through *p_outbuf. The value being encoded is held in accum
 * (i.e. in *p_s), and each visible assignment produces one output byte
 * in outc.
 *
 *   p_outbuf, p_outsz - output cursor and its remaining count
 *   p_inbuf,  p_insz  - input cursor and its remaining count
 *   p_s               - conversion state
 *
 * NOTE(review): the loop, the state handling and the return statements
 * are not visible in this excerpt.
 */
167 static bool c32toutf8(
168 char *restrict *restrict p_outbuf,
169 size_t *restrict p_outsz,
170 const char32_t *restrict *restrict p_inbuf,
171 size_t *restrict p_insz,
172 mbstate_t *restrict p_s
/* Continuation bytes: 0x80 plus six bits of accum taken at bit 12, bit 6
 * and bit 0 respectively. */
180 outc = 0x80 | ((accum >> 12) & 0x3F);
185 outc = 0x80 | ((accum >> 6) & 0x3F);
190 outc = 0x80 | (accum & 0x3F);
192 _PDCLIB_UNDEFINED(accum);
206 _PDCLIB_UNDEFINED(accum);
/* Lead byte, chosen by magnitude: 0xC0 prefix up to 0x7FF, 0xE0 prefix up
 * to 0xFFFF, 0xF0 prefix up to 0x10FFFF. The bits of accum above the
 * continuation payload go into the lead byte. */
207 } else if(accum <= 0x7FF) {
208 outc = 0xC0 | (accum >> 6);
210 } else if(accum <= 0xFFFF) {
211 outc = 0xE0 | (accum >> 12);
213 } else if(accum <= 0x10FFFF) {
214 outc = 0xF0 | (accum >> 18);
/* Codec descriptor for UTF-8: installs utf8toc32 as the __mbstoc32s entry
 * and c32toutf8 as the __c32stombs entry.
 * NOTE(review): any further members and the closing brace are not visible
 * in this excerpt. */
231 struct _PDCLIB_charcodec _PDCLIB_utf8_codec = {
232 .__mbstoc32s = utf8toc32,
233 .__c32stombs = c32toutf8,
240 #include <_PDCLIB_test.h>
245 // Valid conversion & back
/* The visible part of the input holds five ASCII bytes, a 2-byte sequence
 * and a 3-byte sequence; the assertions below expect eight code points
 * ending in 0x7FF, 0xFFFF and 0x10FFFF. */
247 static const char* input = "abcde" "\xDF\xBF" "\xEF\xBF\xBF"
252 char32_t *c32ptr = &c32out[0];
/* NOTE(review): the cast removes const from input, and &chrptr (char **)
 * is later passed where utf8toc32 declares const char *restrict *restrict. */
254 char *chrptr = (char*) &input[0];
255 size_t chrrem = strlen(input);
256 mbstate_t mbs = { 0 };
/* Decode everything: both remaining counts must reach zero and both
 * cursors must sit one past the end of their buffers. */
258 TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs));
259 TESTCASE(c32rem == 0);
260 TESTCASE(chrrem == 0);
261 TESTCASE(c32ptr == &c32out[8]);
262 TESTCASE(chrptr == &input[strlen(input)]);
263 TESTCASE(c32out[0] == 'a' && c32out[1] == 'b' && c32out[2] == 'c' &&
264 c32out[3] == 'd' && c32out[4] == 'e' && c32out[5] == 0x7FF &&
265 c32out[6] == 0xFFFF && c32out[7] == 0x10FFFF);
/* Encode back into a buffer exactly as long as the input (no room for a
 * terminator; the comparison below uses memcmp, not strcmp). */
267 char chrout[strlen(input)];
271 chrrem = strlen(input);
272 TESTCASE(c32toutf8(&chrptr, &chrrem, &c32ptr, &c32rem, &mbs));
273 TESTCASE(c32rem == 0);
274 TESTCASE(chrrem == 0);
275 TESTCASE(c32ptr == &c32out[8]);
276 TESTCASE(chrptr == &chrout[strlen(input)]);
277 TESTCASE(memcmp(chrout, input, strlen(input)) == 0);
279 // Multi-part conversion
/* A 2-byte sequence fed in two calls: the first call is expected to
 * consume one byte and emit nothing, the second to emit one code point. */
280 static const char* mpinput = "\xDF\xBF";
/* NOTE(review): chrptr is char * while mpinput is const char *; this
 * assignment drops const without a cast. */
283 chrptr = &mpinput[0];
285 TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs));
286 TESTCASE(c32ptr == &c32out[0]);
287 TESTCASE(c32rem == 8);
288 TESTCASE(chrptr == &mpinput[1]);
289 TESTCASE(chrrem == 0);
291 TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs));
292 TESTCASE(c32ptr == &c32out[1]);
293 TESTCASE(c32rem == 7);
294 TESTCASE(chrptr == &mpinput[2]);
295 TESTCASE(chrrem == 0);
297 // Invalid conversions
/* Overlong encodings of U+0000 in two, three and four bytes; each must be
 * rejected. The state is zeroed between cases so a failed decode does not
 * leak into the next one. */
300 const char* nul2 = "\xC0\x80";
305 TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);
306 memset(&mbs, 0, sizeof mbs);
307 const char* nul3 = "\xE0\x80\x80";
312 TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);
313 memset(&mbs, 0, sizeof mbs);
314 const char* nul4 = "\xF0\x80\x80\x80";
319 TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);
321 // Starting on a continuation
/* A lone continuation byte with no preceding lead byte must be rejected. */
322 const char* cont = "\x80";
327 TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);