+++ /dev/null
-/* UTF-8 codec
-
- This file is part of the Public Domain C Library (PDCLib).
- Permission is granted to use, modify, and / or redistribute at will.
-*/
-
-#ifndef REGTEST
-#include <stdbool.h>
-#include <stdint.h>
-#include <uchar.h>
-#include <assert.h>
-#include "_PDCLIB_encoding.h"
-
-/* Use of the mbstate:
- *
- * _StUC[0] is the current decoding state
- * _St32[1] is the character accumulated so far
- */
-
-static bool utf8_mbsinit( const mbstate_t *p_s )
-{ return p_s->_StUC[0] == 0; }
-
-enum {
- DecStart = 0,
-
- Dec2B2,
-
- Dec3B2,
- Dec3B3,
-
- Dec4B2,
- Dec4B3,
- Dec4B4
-};
-
-#define state (p_s->_StUC[0])
-#define accum (p_s->_St32[1])
-
-#define START_CONVERSION \
- bool result = true; \
-
-#define END_CONVERSION \
-end_conversion: \
- return result
-
-#define FINISH(_r) do { \
- result = (_r); \
- goto end_conversion; \
-} while(0)
-
-#define OUT32(_c) do { \
- if(p_outbuf) \
- (*((*p_outbuf)++)) = (_c); \
- (*p_outsz)--; \
- _PDCLIB_UNDEFINED(accum); \
- state = DecStart; \
-} while(0)
-
-#define CHECK_CONTINUATION \
- do { if((c & 0xC0) != 0x80) return false; } while(0)
-
-static bool utf8toc32(
- char32_t *restrict *restrict p_outbuf,
- size_t *restrict p_outsz,
- const char *restrict *restrict p_inbuf,
- size_t *restrict p_insz,
- mbstate_t *restrict p_s
-)
-{
- START_CONVERSION
- while(*p_outsz && *p_insz) {
- unsigned char c = **p_inbuf;
- char32_t c32;
- switch(state) {
- case DecStart:
- // 1 byte
- if(c <= 0x7F) {
- OUT32(c);
- } else if(c <= 0xDF) {
- accum = (c & 0x1F) << 6;
- state = Dec2B2;
- } else if(c <= 0xEF) {
- accum = (c & 0x0F) << 12;
- state = Dec3B2;
- } else if(c <= 0xF4) {
- accum = (c & 0x07) << 18;
- state = Dec4B2;
- } else {
- // 5+byte sequence illegal
- FINISH(false);
- }
- break;
-
- case Dec2B2:
- CHECK_CONTINUATION;
-
- c32 = accum | (c & 0x3F);
-
- // Overlong sequence (e.g. NUL injection)
- if(c32 <= 0x7F)
- FINISH(false);
-
- OUT32(c32);
- break;
-
- case Dec3B2:
- CHECK_CONTINUATION;
- accum |= (c & 0x3F) << 6;
- state = Dec3B3;
- break;
-
- case Dec3B3:
- CHECK_CONTINUATION;
-
- c32 = accum | (c & 0x3F);
-
- // Overlong
- if(c32 <= 0x07FF)
- FINISH(false);
-
- // Surrogate
- if(c32 >= 0xD800 && c32 <= 0xDFFF)
- FINISH(false);
-
- OUT32(c32);
- break;
-
- case Dec4B2:
- CHECK_CONTINUATION;
- accum |= (c & 0x3F) << 12;
- state = Dec4B3;
- break;
-
- case Dec4B3:
- CHECK_CONTINUATION;
- accum |= (c & 0x3F) << 6;
- state = Dec4B4;
- break;
-
- case Dec4B4:
- CHECK_CONTINUATION;
-
- c32 = accum | (c & 0x3F);
-
- // Overlong
- if(c32 <= 0xFFFF) FINISH(false);
-
- // Not in Unicode
- if(c32 > 0x10FFFF) FINISH(false);
-
- OUT32(c32);
- break;
-
- default:
- assert(!"Invalid state");
- }
-
- (*p_inbuf)++;
- (*p_insz)--;
- }
- END_CONVERSION;
-}
-
-enum {
- EncStart = 0,
- Enc1R,
- Enc2R,
- Enc3R,
-};
-
-static bool c32toutf8(
- char *restrict *restrict p_outbuf,
- size_t *restrict p_outsz,
- const char32_t *restrict *restrict p_inbuf,
- size_t *restrict p_insz,
- mbstate_t *restrict p_s
-)
-{
- START_CONVERSION
- while(*p_outsz) {
- unsigned char outc = 0;
- switch(state) {
- case Enc3R:
- outc = 0x80 | ((accum >> 12) & 0x3F);
- state = Enc2R;
- break;
-
- case Enc2R:
- outc = 0x80 | ((accum >> 6) & 0x3F);
- state = Enc1R;
- break;
-
- case Enc1R:
- outc = 0x80 | (accum & 0x3F);
- state = EncStart;
- _PDCLIB_UNDEFINED(accum);
- break;
-
- case EncStart:
- if(*p_insz == 0)
- FINISH(true);
-
- accum = **p_inbuf;
- (*p_inbuf)++;
- (*p_insz)--;
-
- if(accum <= 0x7F) {
- outc = accum;
- state = EncStart;
- _PDCLIB_UNDEFINED(accum);
- } else if(accum <= 0x7FF) {
- outc = 0xC0 | (accum >> 6);
- state = Enc1R;
- } else if(accum <= 0xFFFF) {
- outc = 0xE0 | (accum >> 12);
- state = Enc2R;
- } else if(accum <= 0x10FFFF) {
- outc = 0xF0 | (accum >> 18);
- state = Enc3R;
- } else {
- FINISH(false);
- }
- break;
- }
-
- if(p_outbuf) {
- **p_outbuf = outc;
- (*p_outbuf)++;
- }
- (*p_outsz)--;
- }
- END_CONVERSION;
-}
-
-const struct _PDCLIB_charcodec_t _PDCLIB_utf8_codec = {
- .__mbsinit = utf8_mbsinit,
- .__mbstoc32s = utf8toc32,
- .__c32stombs = c32toutf8,
- .__mb_max = 4,
-};
-
-#endif
-
-#ifdef TEST
-#include "_PDCLIB_test.h"
-
-int main( void )
-{
-#ifndef REGTEST
- // Valid conversion & back
-
- static const char* input = "abcde" "\xDF\xBF" "\xEF\xBF\xBF"
- "\xF4\x8F\xBF\xBF";
-
- char32_t c32out[8];
-
- char32_t *c32ptr = &c32out[0];
- size_t c32rem = 8;
- const char *chrptr = (char*) &input[0];
- size_t chrrem = strlen(input);
- mbstate_t mbs = { 0 };
-
- TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs));
- TESTCASE(c32rem == 0);
- TESTCASE(chrrem == 0);
- TESTCASE(c32ptr == &c32out[8]);
- TESTCASE(chrptr == &input[strlen(input)]);
- TESTCASE(c32out[0] == 'a' && c32out[1] == 'b' && c32out[2] == 'c' &&
- c32out[3] == 'd' && c32out[4] == 'e' && c32out[5] == 0x7FF &&
- c32out[6] == 0xFFFF && c32out[7] == 0x10FFFF);
-
- char chrout[strlen(input)];
- c32ptr = &c32out[0];
- c32rem = 8;
- chrptr = &chrout[0];
- chrrem = strlen(input);
- TESTCASE(c32toutf8(&chrptr, &chrrem, &c32ptr, &c32rem, &mbs));
- TESTCASE(c32rem == 0);
- TESTCASE(chrrem == 0);
- TESTCASE(c32ptr == &c32out[8]);
- TESTCASE(chrptr == &chrout[strlen(input)]);
- TESTCASE(memcmp(chrout, input, strlen(input)) == 0);
-
- // Multi-part conversion
- static const char* mpinput = "\xDF\xBF";
- c32ptr = &c32out[0];
- c32rem = 8;
- chrptr = &mpinput[0];
- chrrem = 1;
- TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs));
- TESTCASE(c32ptr == &c32out[0]);
- TESTCASE(c32rem == 8);
- TESTCASE(chrptr == &mpinput[1]);
- TESTCASE(chrrem == 0);
- chrrem = 1;
- TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs));
- TESTCASE(c32ptr == &c32out[1]);
- TESTCASE(c32rem == 7);
- TESTCASE(chrptr == &mpinput[2]);
- TESTCASE(chrrem == 0);
-
- // Invalid conversions
-
- // Overlong nuls
- const char* nul2 = "\xC0\x80";
- c32ptr = &c32out[0];
- c32rem = 8;
- chrptr = &nul2[0];
- chrrem = 2;
- TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);
- memset(&mbs, 0, sizeof mbs);
- const char* nul3 = "\xE0\x80\x80";
- c32ptr = &c32out[0];
- c32rem = 8;
- chrptr = &nul3[0];
- chrrem = 3;
- TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);
- memset(&mbs, 0, sizeof mbs);
- const char* nul4 = "\xF0\x80\x80\x80";
- c32ptr = &c32out[0];
- c32rem = 8;
- chrptr = &nul4[0];
- chrrem = 4;
- TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);
-
- // Starting on a continuation
- const char* cont = "\x80";
- c32ptr = &c32out[0];
- c32rem = 8;
- chrptr = &cont[0];
- chrrem = 1;
- TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);
-#endif
- return TEST_RESULTS;
-}
-
-#endif
-