X-Git-Url: https://pd.if.org/git/?a=blobdiff_plain;ds=sidebyside;f=opt%2Fbasecodecs%2F_PDCLIB_utf8.c;fp=opt%2Fbasecodecs%2F_PDCLIB_utf8.c;h=0000000000000000000000000000000000000000;hb=f60269dfbb4671b3bea11d1257de54bef0c9ffef;hp=a4c19adb349a735c6715af6c66641eab61de66b3;hpb=261086b873397b437ad024d44e9467f0fb2586ba;p=pdclib diff --git a/opt/basecodecs/_PDCLIB_utf8.c b/opt/basecodecs/_PDCLIB_utf8.c deleted file mode 100644 index a4c19ad..0000000 --- a/opt/basecodecs/_PDCLIB_utf8.c +++ /dev/null @@ -1,338 +0,0 @@ -/* UTF-8 codec - - This file is part of the Public Domain C Library (PDCLib). - Permission is granted to use, modify, and / or redistribute at will. -*/ - -#ifndef REGTEST -#include -#include -#include -#include -#include "_PDCLIB_encoding.h" - -/* Use of the mbstate: - * - * _StUC[0] is the current decoding state - * _St32[1] is the character accumulated so far - */ - -static bool utf8_mbsinit( const mbstate_t *p_s ) -{ return p_s->_StUC[0] == 0; } - -enum { - DecStart = 0, - - Dec2B2, - - Dec3B2, - Dec3B3, - - Dec4B2, - Dec4B3, - Dec4B4 -}; - -#define state (p_s->_StUC[0]) -#define accum (p_s->_St32[1]) - -#define START_CONVERSION \ - bool result = true; \ - -#define END_CONVERSION \ -end_conversion: \ - return result - -#define FINISH(_r) do { \ - result = (_r); \ - goto end_conversion; \ -} while(0) - -#define OUT32(_c) do { \ - if(p_outbuf) \ - (*((*p_outbuf)++)) = (_c); \ - (*p_outsz)--; \ - _PDCLIB_UNDEFINED(accum); \ - state = DecStart; \ -} while(0) - -#define CHECK_CONTINUATION \ - do { if((c & 0xC0) != 0x80) return false; } while(0) - -static bool utf8toc32( - char32_t *restrict *restrict p_outbuf, - size_t *restrict p_outsz, - const char *restrict *restrict p_inbuf, - size_t *restrict p_insz, - mbstate_t *restrict p_s -) -{ - START_CONVERSION - while(*p_outsz && *p_insz) { - unsigned char c = **p_inbuf; - char32_t c32; - switch(state) { - case DecStart: - // 1 byte - if(c <= 0x7F) { - OUT32(c); - } else if(c <= 0xDF) { - accum = (c & 0x1F) << 6; - state = Dec2B2; - } else if(c <= 0xEF) { - accum = (c & 0x0F) << 12; - state = Dec3B2; - } else if(c <= 0xF4) { - accum = (c & 0x07) << 18; - state = Dec4B2; - } else { - // 5+byte sequence illegal - FINISH(false); - } - break; - - case Dec2B2: - CHECK_CONTINUATION; - - c32 = accum | (c & 0x3F); - - // Overlong sequence (e.g. NUL injection) - if(c32 <= 0x7F) - FINISH(false); - - OUT32(c32); - break; - - case Dec3B2: - CHECK_CONTINUATION; - accum |= (c & 0x3F) << 6; - state = Dec3B3; - break; - - case Dec3B3: - CHECK_CONTINUATION; - - c32 = accum | (c & 0x3F); - - // Overlong - if(c32 <= 0x07FF) - FINISH(false); - - // Surrogate - if(c32 >= 0xD800 && c32 <= 0xDFFF) - FINISH(false); - - OUT32(c32); - break; - - case Dec4B2: - CHECK_CONTINUATION; - accum |= (c & 0x3F) << 12; - state = Dec4B3; - break; - - case Dec4B3: - CHECK_CONTINUATION; - accum |= (c & 0x3F) << 6; - state = Dec4B4; - break; - - case Dec4B4: - CHECK_CONTINUATION; - - c32 = accum | (c & 0x3F); - - // Overlong - if(c32 <= 0xFFFF) FINISH(false); - - // Not in Unicode - if(c32 > 0x10FFFF) FINISH(false); - - OUT32(c32); - break; - - default: - assert(!"Invalid state"); - } - - (*p_inbuf)++; - (*p_insz)--; - } - END_CONVERSION; -} - -enum { - EncStart = 0, - Enc1R, - Enc2R, - Enc3R, -}; - -static bool c32toutf8( - char *restrict *restrict p_outbuf, - size_t *restrict p_outsz, - const char32_t *restrict *restrict p_inbuf, - size_t *restrict p_insz, - mbstate_t *restrict p_s -) -{ - START_CONVERSION - while(*p_outsz) { - unsigned char outc = 0; - switch(state) { - case Enc3R: - outc = 0x80 | ((accum >> 12) & 0x3F); - state = Enc2R; - break; - - case Enc2R: - outc = 0x80 | ((accum >> 6) & 0x3F); - state = Enc1R; - break; - - case Enc1R: - outc = 0x80 | (accum & 0x3F); - state = EncStart; - _PDCLIB_UNDEFINED(accum); - break; - - case EncStart: - if(*p_insz == 0) - FINISH(true); - - accum = **p_inbuf; - (*p_inbuf)++; - (*p_insz)--; - - if(accum <= 0x7F) { - outc = accum; - state = EncStart; - _PDCLIB_UNDEFINED(accum); - } else if(accum <= 0x7FF) { - outc = 0xC0 | (accum >> 6); - state = Enc1R; - } else if(accum <= 0xFFFF) { - outc = 0xE0 | (accum >> 12); - state = Enc2R; - } else if(accum <= 0x10FFFF) { - outc = 0xF0 | (accum >> 18); - state = Enc3R; - } else { - FINISH(false); - } - break; - } - - if(p_outbuf) { - **p_outbuf = outc; - (*p_outbuf)++; - } - (*p_outsz)--; - } - END_CONVERSION; -} - -const struct _PDCLIB_charcodec_t _PDCLIB_utf8_codec = { - .__mbsinit = utf8_mbsinit, - .__mbstoc32s = utf8toc32, - .__c32stombs = c32toutf8, - .__mb_max = 4, -}; - -#endif - -#ifdef TEST -#include "_PDCLIB_test.h" - -int main( void ) -{ -#ifndef REGTEST - // Valid conversion & back - - static const char* input = "abcde" "\xDF\xBF" "\xEF\xBF\xBF" - "\xF4\x8F\xBF\xBF"; - - char32_t c32out[8]; - - char32_t *c32ptr = &c32out[0]; - size_t c32rem = 8; - const char *chrptr = (char*) &input[0]; - size_t chrrem = strlen(input); - mbstate_t mbs = { 0 }; - - TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs)); - TESTCASE(c32rem == 0); - TESTCASE(chrrem == 0); - TESTCASE(c32ptr == &c32out[8]); - TESTCASE(chrptr == &input[strlen(input)]); - TESTCASE(c32out[0] == 'a' && c32out[1] == 'b' && c32out[2] == 'c' && - c32out[3] == 'd' && c32out[4] == 'e' && c32out[5] == 0x7FF && - c32out[6] == 0xFFFF && c32out[7] == 0x10FFFF); - - char chrout[strlen(input)]; - c32ptr = &c32out[0]; - c32rem = 8; - chrptr = &chrout[0]; - chrrem = strlen(input); - TESTCASE(c32toutf8(&chrptr, &chrrem, &c32ptr, &c32rem, &mbs)); - TESTCASE(c32rem == 0); - TESTCASE(chrrem == 0); - TESTCASE(c32ptr == &c32out[8]); - TESTCASE(chrptr == &chrout[strlen(input)]); - TESTCASE(memcmp(chrout, input, strlen(input)) == 0); - - // Multi-part conversion - static const char* mpinput = "\xDF\xBF"; - c32ptr = &c32out[0]; - c32rem = 8; - chrptr = &mpinput[0]; - chrrem = 1; - TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs)); - TESTCASE(c32ptr == &c32out[0]); - TESTCASE(c32rem == 8); - TESTCASE(chrptr == &mpinput[1]); - TESTCASE(chrrem == 0); - chrrem = 1; - TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs)); - TESTCASE(c32ptr == &c32out[1]); - TESTCASE(c32rem == 7); - TESTCASE(chrptr == &mpinput[2]); - TESTCASE(chrrem == 0); - - // Invalid conversions - - // Overlong nuls - const char* nul2 = "\xC0\x80"; - c32ptr = &c32out[0]; - c32rem = 8; - chrptr = &nul2[0]; - chrrem = 2; - TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false); - memset(&mbs, 0, sizeof mbs); - const char* nul3 = "\xE0\x80\x80"; - c32ptr = &c32out[0]; - c32rem = 8; - chrptr = &nul3[0]; - chrrem = 3; - TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false); - memset(&mbs, 0, sizeof mbs); - const char* nul4 = "\xF0\x80\x80\x80"; - c32ptr = &c32out[0]; - c32rem = 8; - chrptr = &nul4[0]; - chrrem = 4; - TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false); - - // Starting on a continuation - const char* cont = "\x80"; - c32ptr = &c32out[0]; - c32rem = 8; - chrptr = &cont[0]; - chrrem = 1; - TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false); -#endif - return TEST_RESULTS; -} - -#endif -