3 This file is part of the Public Domain C Library (PDCLib).
4 Permission is granted to use, modify, and / or redistribute at will.
12 #include <_PDCLIB_encoding.h>
14 /* Use of the mbstate:
16 * _StUC[0] is the current decoding state
17 * _St32[1] is the character accumulated so far
/* Initial-state query for the UTF-8 codec (installed as the __mbsinit member
 * of the codec table in this file).
 *
 * p_s: conversion state to inspect; it is dereferenced unconditionally, so it
 *      must not be NULL.
 *
 * Returns true exactly when the state byte _StUC[0] is 0. The accumulator in
 * _St32[1] is not consulted.
 */
20 static bool utf8_mbsinit( const mbstate_t *p_s )
21 { return p_s->_StUC[0] == 0; }
/* Shorthand for the two mbstate_t fields this codec uses: state is the
 * decoding state byte and accum is the code point being assembled. Both
 * expand to an expression on a variable named p_s, so they only work inside
 * a function that has an mbstate_t pointer of that name in scope.
 */
36 #define state (p_s->_StUC[0])
37 #define accum (p_s->_St32[1])
/* Helper macros shared by the converters in this file.
 *
 * START_CONVERSION / END_CONVERSION
 *     Presumably open and close the body of a converter - TODO confirm
 *     against their full definitions.
 * FINISH(_r)
 *     Leaves the conversion with a goto to the label end_conversion, so the
 *     enclosing function must provide that label. How _r reaches the caller
 *     should be checked in the full definition.
 * OUT32(_c)
 *     Stores _c through *p_outbuf and advances that pointer by one element,
 *     then passes accum to _PDCLIB_UNDEFINED (presumably marking the value
 *     as no longer meaningful - TODO confirm).
 * CHECK_CONTINUATION
 *     Requires a local named c. If the top two bits of c are not 10 (that is,
 *     c is outside 0x80..0xBF) the enclosing function returns false.
 *     NOTE(review): this returns directly instead of going through
 *     FINISH(false); confirm that bypassing end_conversion is intended.
 */
39 #define START_CONVERSION \
42 #define END_CONVERSION \
46 #define FINISH(_r) do { \
48 goto end_conversion; \
51 #define OUT32(_c) do { \
53 (*((*p_outbuf)++)) = (_c); \
55 _PDCLIB_UNDEFINED(accum); \
59 #define CHECK_CONTINUATION \
60 do { if((c & 0xC0) != 0x80) return false; } while(0)
/* Decode UTF-8 bytes into char32_t code points (installed as the __mbstoc32s
 * member of the codec table in this file).
 *
 * p_outbuf / p_outsz: output cursor and number of output slots left.
 * p_inbuf  / p_insz : input cursor and number of input bytes left.
 * p_s               : conversion state, reached through the state and accum
 *                     macros.
 *
 * Returns false for malformed input. The tests in this file expect true for
 * valid input, including a call that stops in the middle of a multi-byte
 * sequence, and expect the cursors and counts to be updated in place.
 */
62 static bool utf8toc32(
63 char32_t *restrict *restrict p_outbuf,
64 size_t *restrict p_outsz,
65 const char *restrict *restrict p_inbuf,
66 size_t *restrict p_insz,
67 mbstate_t *restrict p_s
/* Proceed only while there is both room for output and input left to read. */
71 while(*p_outsz && *p_insz) {
/* Read as unsigned char so the range comparisons below behave the same
 * whether plain char is signed or unsigned. */
72 unsigned char c = **p_inbuf;
/* Lead byte of a two-byte sequence: its low 5 bits become bits 6..10. */
79 } else if(c <= 0xDF) {
80 accum = (c & 0x1F) << 6;
/* Lead byte of a three-byte sequence: its low 4 bits become bits 12..15. */
82 } else if(c <= 0xEF) {
83 accum = (c & 0x0F) << 12;
/* Lead byte of a four-byte sequence (nothing above 0xF4 is accepted here):
 * its low 3 bits become bits 18..20. */
85 } else if(c <= 0xF4) {
86 accum = (c & 0x07) << 18;
89 // 5+byte sequence illegal
/* Final byte of a sequence: its low 6 bits complete the code point. */
97 c32 = accum | (c & 0x3F);
99 // Overlong sequence (e.g. NUL injection)
/* Middle byte: its low 6 bits become bits 6..11. */
108 accum |= (c & 0x3F) << 6;
115 c32 = accum | (c & 0x3F);
/* UTF-16 surrogate range test; presumably the guarded statement rejects the
 * input - TODO confirm against the full source. */
122 if(c32 >= 0xD800 && c32 <= 0xDFFF)
/* Continuation bytes of the longest form: bits 12..17, then bits 6..11, then
 * bits 0..5. */
130 accum |= (c & 0x3F) << 12;
136 accum |= (c & 0x3F) << 6;
143 c32 = accum | (c & 0x3F);
/* A value that fits in 16 bits does not need the four-byte form, so it is
 * refused as overlong. */
146 if(c32 <= 0xFFFF) FINISH(false);
/* Values above 0x10FFFF are refused. */
149 if(c32 > 0x10FFFF) FINISH(false);
/* Reached only when state holds a value the decoder never stores. */
155 assert(!"Invalid state");
/* Encode char32_t code points as UTF-8 bytes (installed as the __c32stombs
 * member of the codec table in this file).
 *
 * p_outbuf / p_outsz: output cursor and number of output bytes left.
 * p_inbuf  / p_insz : input cursor and number of input code points left.
 * p_s               : conversion state; accum holds the code point whose
 *                     bytes are currently being emitted.
 *
 * The tests in this file expect true on success, with the cursors and counts
 * updated in place.
 * NOTE(review): the visible branches only range-check up to 0x10FFFF; confirm
 * whether surrogate values 0xD800..0xDFFF should also be refused here.
 */
171 static bool c32toutf8(
172 char *restrict *restrict p_outbuf,
173 size_t *restrict p_outsz,
174 const char32_t *restrict *restrict p_inbuf,
175 size_t *restrict p_insz,
176 mbstate_t *restrict p_s
/* Continuation bytes: each has the form 10xxxxxx and carries six bits of
 * accum, taken from bit 12, bit 6 and bit 0 respectively. */
184 outc = 0x80 | ((accum >> 12) & 0x3F);
189 outc = 0x80 | ((accum >> 6) & 0x3F);
194 outc = 0x80 | (accum & 0x3F);
/* accum is handed to _PDCLIB_UNDEFINED once its last byte has been produced
 * (presumably flagging it as no longer meaningful - TODO confirm). */
196 _PDCLIB_UNDEFINED(accum);
210 _PDCLIB_UNDEFINED(accum);
/* Lead byte is chosen by magnitude: up to 0x7FF gives 110xxxxx. */
211 } else if(accum <= 0x7FF) {
212 outc = 0xC0 | (accum >> 6);
/* Up to 0xFFFF gives 1110xxxx. */
214 } else if(accum <= 0xFFFF) {
215 outc = 0xE0 | (accum >> 12);
/* Up to 0x10FFFF gives 11110xxx. */
217 } else if(accum <= 0x10FFFF) {
218 outc = 0xF0 | (accum >> 18);
/* Codec descriptor for UTF-8: binds the initial-state query, the decoder and
 * the encoder defined in this file to the matching members of
 * struct _PDCLIB_charcodec.
 */
235 const struct _PDCLIB_charcodec _PDCLIB_utf8_codec = {
236 .__mbsinit = utf8_mbsinit,
237 .__mbstoc32s = utf8toc32,
238 .__c32stombs = c32toutf8,
245 #include <_PDCLIB_test.h>
250 // Valid conversion & back
/* The input mixes ASCII with multi-byte sequences. The checks below expect it
 * to decode to exactly eight code points, the last three being 0x7FF, 0xFFFF
 * and 0x10FFFF, with all input consumed and all output slots used. */
252 static const char* input = "abcde" "\xDF\xBF" "\xEF\xBF\xBF"
257 char32_t *c32ptr = &c32out[0];
259 const char *chrptr = (char*) &input[0];
260 size_t chrrem = strlen(input);
261 mbstate_t mbs = { 0 };
263 TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs));
264 TESTCASE(c32rem == 0);
265 TESTCASE(chrrem == 0);
266 TESTCASE(c32ptr == &c32out[8]);
267 TESTCASE(chrptr == &input[strlen(input)]);
268 TESTCASE(c32out[0] == 'a' && c32out[1] == 'b' && c32out[2] == 'c' &&
269 c32out[3] == 'd' && c32out[4] == 'e' && c32out[5] == 0x7FF &&
270 c32out[6] == 0xFFFF && c32out[7] == 0x10FFFF);
/* Encode the decoded code points again; the bytes produced must match the
 * original input exactly. The buffer is sized without a terminator, so it is
 * compared with memcmp rather than treated as a string.
 * NOTE(review): chrptr is declared above as pointer to const char, while the
 * first parameter of c32toutf8 is a pointer to a non-const char pointer;
 * confirm how the intervening setup reconciles the two. */
272 char chrout[strlen(input)];
276 chrrem = strlen(input);
277 TESTCASE(c32toutf8(&chrptr, &chrrem, &c32ptr, &c32rem, &mbs));
278 TESTCASE(c32rem == 0);
279 TESTCASE(chrrem == 0);
280 TESTCASE(c32ptr == &c32out[8]);
281 TESTCASE(chrptr == &chrout[strlen(input)]);
282 TESTCASE(memcmp(chrout, input, strlen(input)) == 0);
284 // Multi-part conversion
/* A two-byte sequence is fed across two calls. The first call must succeed,
 * consume one byte and produce no output; the second must consume the other
 * byte and produce exactly one code point. */
285 static const char* mpinput = "\xDF\xBF";
288 chrptr = &mpinput[0];
290 TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs));
291 TESTCASE(c32ptr == &c32out[0]);
292 TESTCASE(c32rem == 8);
293 TESTCASE(chrptr == &mpinput[1]);
294 TESTCASE(chrrem == 0);
296 TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs));
297 TESTCASE(c32ptr == &c32out[1]);
298 TESTCASE(c32rem == 7);
299 TESTCASE(chrptr == &mpinput[2]);
300 TESTCASE(chrrem == 0);
302 // Invalid conversions
/* The value zero padded out to two, three and four bytes. Each overlong form
 * must be refused; the state is cleared with memset between cases so that one
 * failure cannot influence the next. */
305 const char* nul2 = "\xC0\x80";
310 TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);
311 memset(&mbs, 0, sizeof mbs);
312 const char* nul3 = "\xE0\x80\x80";
317 TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);
318 memset(&mbs, 0, sizeof mbs);
319 const char* nul4 = "\xF0\x80\x80\x80";
324 TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);
326 // Starting on a continuation
/* A lone continuation byte with no preceding lead byte must be refused. */
327 const char* cont = "\x80";
332 TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);