--- /dev/null
+/* UTF-8 codec
+
+ This file is part of the Public Domain C Library (PDCLib).
+ Permission is granted to use, modify, and / or redistribute at will.
+*/
+
+#ifndef REGTEST
+#include <stdbool.h>
+#include <stdint.h>
+#include <uchar.h>
+#include <assert.h>
+
+/* Use of the mbstate:
+ *
+ * _StUC[0] is the current conversion state (a Dec* value while decoding,
+ *          an Enc* value while encoding)
+ * _St32[1] is the character accumulated so far
+ */
+
+/* Decoder states kept in `state` between input bytes.
+ * "DecNBk" means: inside an N-byte sequence, the next byte read is byte k.
+ * DecStart is 0, the value a zero-filled mbstate_t holds.
+ */
+enum {
+ DecStart = 0, /* between characters; next byte is a lead (or ASCII) byte */
+
+ Dec2B2, /* 2-byte sequence, expecting its final byte */
+
+ Dec3B2, /* 3-byte sequence, expecting byte 2 */
+ Dec3B3, /* 3-byte sequence, expecting its final byte */
+
+ Dec4B2, /* 4-byte sequence, expecting byte 2 */
+ Dec4B3, /* 4-byte sequence, expecting byte 3 */
+ Dec4B4 /* 4-byte sequence, expecting its final byte */
+};
+
+/* Shorthands for the two mbstate fields used by the converters. Both expect
+ * a variable named p_s (pointer to the mbstate) to be in scope.
+ */
+#define state (p_s->_StUC[0])
+#define accum (p_s->_St32[1])
+
+/* Declares the function-local result flag; must be the first statement of a
+ * converter. The definition intentionally continues onto the blank line below.
+ */
+#define START_CONVERSION \
+ bool result = true; \
+
+/* Common exit point: defines the label FINISH() jumps to and returns the
+ * result flag. Must be the last statement of a converter.
+ */
+#define END_CONVERSION \
+end_conversion: \
+ return result
+
+/* Leave the converter immediately with return value _r. */
+#define FINISH(_r) do { \
+ result = (_r); \
+ goto end_conversion; \
+} while(0)
+
+/* Decoder only: store code point _c in the output buffer, advance the output
+ * cursor, decrement the remaining output count and go back to DecStart.
+ * NOTE(review): _PDCLIB_UNDEFINED is defined outside this file; presumably it
+ * marks the accumulator as holding no meaningful value - confirm.
+ */
+#define OUT32(_c) do { \
+ (*((*p_outbuf)++)) = (_c); \
+ (*p_outsz)--; \
+ _PDCLIB_UNDEFINED(accum); \
+ state = DecStart; \
+} while(0)
+/* Decoder only: fail unless the current byte c has the form 10xxxxxx.
+ * Returns false directly instead of going through FINISH(); the effect is the
+ * same because END_CONVERSION only returns the result flag.
+ */
+#define CHECK_CONTINUATION \
+ do { if((c & 0xC0) != 0x80) return false; } while(0)
+
+/* Decode UTF-8 bytes from *p_inbuf into UTF-32 code points in *p_outbuf.
+ *
+ * p_outbuf / p_outsz: output cursor and number of char32_t slots left; the
+ *                     cursor is advanced and the count decremented for
+ *                     every code point written.
+ * p_inbuf / p_insz:   input cursor and number of bytes left; advanced and
+ *                     decremented for every byte consumed.
+ * p_s:                conversion state; a sequence split across calls is
+ *                     resumed from the state and accumulator stored here.
+ *
+ * Returns true when the loop stops because the input or the output space
+ * ran out (possibly in the middle of a sequence), false on an ill-formed
+ * sequence or a corrupt state. On failure the offending byte is not
+ * consumed and *p_s is left as it was when the error was detected, so the
+ * caller has to reset it before starting a fresh conversion.
+ */
+static bool utf8toc32(
+    char32_t **restrict p_outbuf,
+    size_t *restrict p_outsz,
+    const char **restrict p_inbuf,
+    size_t *restrict p_insz,
+    mbstate_t *restrict p_s
+)
+{
+    START_CONVERSION
+    while(*p_outsz && *p_insz) {
+        unsigned char c = **p_inbuf;
+        char32_t c32;
+        switch(state) {
+        case DecStart:
+            if(c <= 0x7F) {
+                // 1 byte
+                OUT32(c);
+            } else if(c < 0xC0) {
+                // 0x80..0xBF: continuation byte without a lead byte.
+                // Must be rejected here, otherwise it would be taken for
+                // the lead byte of a 2-byte sequence below.
+                FINISH(false);
+            } else if(c <= 0xDF) {
+                // 2 byte lead: 110xxxxx
+                accum = (c & 0x1F) << 6;
+                state = Dec2B2;
+            } else if(c <= 0xEF) {
+                // 3 byte lead: 1110xxxx
+                accum = (c & 0x0F) << 12;
+                state = Dec3B2;
+            } else if(c <= 0xF4) {
+                // 4 byte lead: 11110xxx (0xF5.. would exceed U+10FFFF)
+                accum = (c & 0x07) << 18;
+                state = Dec4B2;
+            } else {
+                // 5+byte sequence illegal
+                FINISH(false);
+            }
+            break;
+
+        case Dec2B2:
+            CHECK_CONTINUATION;
+
+            c32 = accum | (c & 0x3F);
+
+            // Overlong sequence (e.g. NUL injection)
+            if(c32 <= 0x7F)
+                FINISH(false);
+
+            OUT32(c32);
+            break;
+
+        case Dec3B2:
+            CHECK_CONTINUATION;
+            accum |= (c & 0x3F) << 6;
+            state = Dec3B3;
+            break;
+
+        case Dec3B3:
+            CHECK_CONTINUATION;
+
+            c32 = accum | (c & 0x3F);
+
+            // Overlong
+            if(c32 <= 0x07FF)
+                FINISH(false);
+
+            // Surrogate
+            if(c32 >= 0xD800 && c32 <= 0xDFFF)
+                FINISH(false);
+
+            OUT32(c32);
+            break;
+
+        case Dec4B2:
+            CHECK_CONTINUATION;
+            accum |= (c & 0x3F) << 12;
+            state = Dec4B3;
+            break;
+
+        case Dec4B3:
+            CHECK_CONTINUATION;
+            accum |= (c & 0x3F) << 6;
+            state = Dec4B4;
+            break;
+
+        case Dec4B4:
+            CHECK_CONTINUATION;
+
+            c32 = accum | (c & 0x3F);
+
+            // Overlong
+            if(c32 <= 0xFFFF)
+                FINISH(false);
+
+            // Not in Unicode
+            if(c32 > 0x10FFFF)
+                FINISH(false);
+
+            OUT32(c32);
+            break;
+
+        default:
+            assert(!"Invalid state");
+            // With NDEBUG the assert is compiled out; fail instead of
+            // silently consuming input in an unknown state.
+            FINISH(false);
+        }
+
+        // Only reached when the byte was accepted
+        (*p_inbuf)++;
+        (*p_insz)--;
+    }
+    END_CONVERSION;
+}
+
+/* Encoder states kept in `state` between output bytes.
+ * "EncNR" means N continuation bytes of the character held in `accum` are
+ * still to be written. EncStart is 0, the value a zero-filled mbstate_t holds.
+ */
+enum {
+ EncStart = 0, /* between characters; next step reads a new code point */
+ Enc1R, /* one continuation byte left (bits 5..0) */
+ Enc2R, /* two continuation bytes left (bits 11..6 next) */
+ Enc3R, /* three continuation bytes left (bits 17..12 next) */
+};
+
+/* Encode UTF-32 code points from *p_inbuf as UTF-8 bytes in *p_outbuf.
+ *
+ * p_outbuf / p_outsz: output cursor and number of bytes left; advanced and
+ *                     decremented for every byte written.
+ * p_inbuf / p_insz:   input cursor and number of char32_t elements left;
+ *                     advanced and decremented for every code point taken.
+ * p_s:                conversion state; when the output fills up in the
+ *                     middle of a character, the remaining bytes are
+ *                     produced from the state and accumulator stored here on
+ *                     the next call.
+ *
+ * Returns true when the output space ran out or the input is exhausted at a
+ * character boundary. Returns false when the next code point cannot be
+ * encoded: above U+10FFFF, or a surrogate (U+D800..U+DFFF), which the
+ * decoder in this file rejects as well. The rejected code point is not
+ * consumed, so *p_inbuf points at it on return.
+ */
+static bool c32toutf8(
+    char **restrict p_outbuf,
+    size_t *restrict p_outsz,
+    const char32_t **restrict p_inbuf,
+    size_t *restrict p_insz,
+    mbstate_t *restrict p_s
+)
+{
+    START_CONVERSION
+    while(*p_outsz) {
+        char *c8 = *p_outbuf;
+        switch(state) {
+        case Enc3R:
+            *c8 = 0x80 | ((accum >> 12) & 0x3F);
+            state = Enc2R;
+            break;
+
+        case Enc2R:
+            *c8 = 0x80 | ((accum >> 6) & 0x3F);
+            state = Enc1R;
+            break;
+
+        case Enc1R:
+            *c8 = 0x80 | (accum & 0x3F);
+            state = EncStart;
+            _PDCLIB_UNDEFINED(accum);
+            break;
+
+        case EncStart: {
+            char32_t c32;
+
+            if(*p_insz == 0)
+                FINISH(true);
+
+            // Look at the code point before consuming it, so that on
+            // failure the input cursor still points at the offender.
+            c32 = **p_inbuf;
+
+            // Not in Unicode
+            if(c32 > 0x10FFFF)
+                FINISH(false);
+
+            // Surrogates are not scalar values and have no UTF-8 form
+            if(c32 >= 0xD800 && c32 <= 0xDFFF)
+                FINISH(false);
+
+            accum = c32;
+            (*p_inbuf)++;
+            (*p_insz)--;
+
+            if(accum <= 0x7F) {
+                // 1 byte, nothing left to emit afterwards
+                *c8 = accum;
+                state = EncStart;
+                _PDCLIB_UNDEFINED(accum);
+            } else if(accum <= 0x7FF) {
+                *c8 = 0xC0 | (accum >> 6);
+                state = Enc1R;
+            } else if(accum <= 0xFFFF) {
+                *c8 = 0xE0 | (accum >> 12);
+                state = Enc2R;
+            } else {
+                // 0x10000..0x10FFFF, upper bound checked above
+                *c8 = 0xF0 | (accum >> 18);
+                state = Enc3R;
+            }
+            break;
+        }
+
+        default:
+            assert(!"Invalid state");
+            // With NDEBUG the assert is compiled out; fail instead of
+            // writing an indeterminate byte to the output.
+            FINISH(false);
+        }
+
+        // Only reached when a byte was stored through c8
+        (*p_outbuf)++;
+        (*p_outsz)--;
+    }
+    END_CONVERSION;
+}
+#endif
+
+#ifdef TEST
+#include <_PDCLIB_test.h>
+
+/* Self-test for utf8toc32() / c32toutf8(). The two static converters are only
+ * compiled when REGTEST is not defined, so the body is skipped under REGTEST.
+ * All cases share one mbstate (mbs) and one pair of cursors (c32ptr, chrptr).
+ */
+int main( void )
+{
+#ifndef REGTEST
+ // Valid conversion & back
+
+ // 5 ASCII bytes plus the largest 2-, 3- and 4-byte sequences:
+ // 14 bytes in total, 8 code points (U+07FF, U+FFFF, U+10FFFF last)
+ static const char* input = "abcde" "\xDF\xBF" "\xEF\xBF\xBF"
+ "\xF4\x8F\xBF\xBF";
+
+ char32_t c32out[8];
+
+ // NOTE(review): chrptr is a plain char* (const cast away) and c32ptr a
+ // plain char32_t*; their addresses are passed below for the converters'
+ // const-qualified input parameters (const char **, const char32_t **),
+ // which are different pointer types - confirm this builds without
+ // diagnostics.
+ char32_t *c32ptr = &c32out[0];
+ size_t c32rem = 8;
+ char *chrptr = (char*) &input[0];
+ size_t chrrem = strlen(input);
+ mbstate_t mbs = { 0 };
+
+ // Decode everything: both buffers must be used up exactly
+ TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs));
+ TESTCASE(c32rem == 0);
+ TESTCASE(chrrem == 0);
+ TESTCASE(c32ptr == &c32out[8]);
+ TESTCASE(chrptr == &input[strlen(input)]);
+ TESTCASE(c32out[0] == 'a' && c32out[1] == 'b' && c32out[2] == 'c' &&
+ c32out[3] == 'd' && c32out[4] == 'e' && c32out[5] == 0x7FF &&
+ c32out[6] == 0xFFFF && c32out[7] == 0x10FFFF);
+
+ // Encode the decoded code points again; the bytes must match the input.
+ // mbs is reused as left by the decoder.
+ char chrout[strlen(input)];
+ c32ptr = &c32out[0];
+ c32rem = 8;
+ chrptr = &chrout[0];
+ chrrem = strlen(input);
+ TESTCASE(c32toutf8(&chrptr, &chrrem, &c32ptr, &c32rem, &mbs));
+ TESTCASE(c32rem == 0);
+ TESTCASE(chrrem == 0);
+ TESTCASE(c32ptr == &c32out[8]);
+ TESTCASE(chrptr == &chrout[strlen(input)]);
+ TESTCASE(memcmp(chrout, input, strlen(input)) == 0);
+
+ // Multi-part conversion
+ // Feed a 2-byte sequence one byte per call: the first call consumes the
+ // lead byte without producing output, the second completes the character.
+ static const char* mpinput = "\xDF\xBF";
+ c32ptr = &c32out[0];
+ c32rem = 8;
+ chrptr = &mpinput[0];
+ chrrem = 1;
+ TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs));
+ TESTCASE(c32ptr == &c32out[0]);
+ TESTCASE(c32rem == 8);
+ TESTCASE(chrptr == &mpinput[1]);
+ TESTCASE(chrrem == 0);
+ chrrem = 1;
+ TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs));
+ TESTCASE(c32ptr == &c32out[1]);
+ TESTCASE(c32rem == 7);
+ TESTCASE(chrptr == &mpinput[2]);
+ TESTCASE(chrrem == 0);
+
+ // Invalid conversions
+
+ // Overlong nuls
+ // U+0000 encoded in 2, 3 and 4 bytes; each must be rejected. A failed
+ // conversion leaves mbs mid-sequence, hence the memset after each case.
+ const char* nul2 = "\xC0\x80";
+ c32ptr = &c32out[0];
+ c32rem = 8;
+ chrptr = &nul2[0];
+ chrrem = 2;
+ TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);
+ memset(&mbs, 0, sizeof mbs);
+ const char* nul3 = "\xE0\x80\x80";
+ c32ptr = &c32out[0];
+ c32rem = 8;
+ chrptr = &nul3[0];
+ chrrem = 3;
+ TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);
+ memset(&mbs, 0, sizeof mbs);
+ const char* nul4 = "\xF0\x80\x80\x80";
+ c32ptr = &c32out[0];
+ c32rem = 8;
+ chrptr = &nul4[0];
+ chrrem = 4;
+ TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);
+
+ // Starting on a continuation
+ // NOTE(review): mbs is not reset after the failed 4-byte case above, so
+ // this call does not start from the initial state; the byte is checked
+ // in whatever state the previous conversion left behind. A
+ // memset(&mbs, 0, sizeof mbs) before this case looks intended - confirm.
+ const char* cont = "\x80";
+ c32ptr = &c32out[0];
+ c32rem = 8;
+ chrptr = &cont[0];
+ chrrem = 1;
+ TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);
+#endif
+ return TEST_RESULTS;
+}
+
+#endif
+