From 3d8dbf37294136263403973fec89f50ab9eca402 Mon Sep 17 00:00:00 2001 From: Owen Shepherd Date: Mon, 31 Dec 2012 00:29:22 +0000 Subject: [PATCH] PDCLIB-2 PDCLIB-12: UTF-8 codec --- opt/basecodecs/_PDCLIB_utf8.c | 322 ++++++++++++++++++++++++++++++++++ 1 file changed, 322 insertions(+) create mode 100644 opt/basecodecs/_PDCLIB_utf8.c diff --git a/opt/basecodecs/_PDCLIB_utf8.c b/opt/basecodecs/_PDCLIB_utf8.c new file mode 100644 index 0000000..a88f964 --- /dev/null +++ b/opt/basecodecs/_PDCLIB_utf8.c @@ -0,0 +1,322 @@ +/* UTF-8 codec + + This file is part of the Public Domain C Library (PDCLib). + Permission is granted to use, modify, and / or redistribute at will. +*/ + +#ifndef REGTEST +#include +#include +#include +#include + +/* Use of the mbstate: + * + * _StUC[0] is the current decoding state + * _St32[1] is the character accumulated so far + */ + +enum { + DecStart = 0, + + Dec2B2, + + Dec3B2, + Dec3B3, + + Dec4B2, + Dec4B3, + Dec4B4 +}; + +#define state (p_s->_StUC[0]) +#define accum (p_s->_St32[1]) + +#define START_CONVERSION \ + bool result = true; \ + +#define END_CONVERSION \ +end_conversion: \ + return result + +#define FINISH(_r) do { \ + result = (_r); \ + goto end_conversion; \ +} while(0) + +#define OUT32(_c) do { \ + (*((*p_outbuf)++)) = (_c); \ + (*p_outsz)--; \ + _PDCLIB_UNDEFINED(accum); \ + state = DecStart; \ +} while(0) +#define CHECK_CONTINUATION \ + do { if((c & 0xC0) != 0x80) return false; } while(0) + +static bool utf8toc32( + char32_t **restrict p_outbuf, + size_t *restrict p_outsz, + const char **restrict p_inbuf, + size_t *restrict p_insz, + mbstate_t *restrict p_s +) +{ + START_CONVERSION + while(*p_outsz && *p_insz) { + unsigned char c = **p_inbuf; + char32_t c32; + switch(state) { + case DecStart: + // 1 byte + if(c <= 0x7F) { + OUT32(c); + } else if(c <= 0xDF) { + accum = (c & 0x1F) << 6; + state = Dec2B2; + } else if(c <= 0xEF) { + accum = (c & 0x0F) << 12; + state = Dec3B2; + } else if(c <= 0xF4) { + accum = (c & 0x07) << 18; + state = Dec4B2; + } else { + // 5+byte sequence illegal + FINISH(false); + } + break; + + case Dec2B2: + CHECK_CONTINUATION; + + c32 = accum | (c & 0x3F); + + // Overlong sequence (e.g. NUL injection) + if(c32 <= 0x7F) + FINISH(false); + + OUT32(c32); + break; + + case Dec3B2: + CHECK_CONTINUATION; + accum |= (c & 0x3F) << 6; + state = Dec3B3; + break; + + case Dec3B3: + CHECK_CONTINUATION; + + c32 = accum | (c & 0x3F); + + // Overlong + if(c32 <= 0x07FF) + FINISH(false); + + // Surrogate + if(c32 >= 0xD800 && c32 <= 0xDFFF) + FINISH(false); + + OUT32(c32); + break; + + case Dec4B2: + CHECK_CONTINUATION; + accum |= (c & 0x3F) << 12; + state = Dec4B3; + break; + + case Dec4B3: + CHECK_CONTINUATION; + accum |= (c & 0x3F) << 6; + state = Dec4B4; + break; + + case Dec4B4: + CHECK_CONTINUATION; + + c32 = accum | (c & 0x3F); + + // Overlong + if(c32 <= 0xFFFF) FINISH(false); + + // Not in Unicode + if(c32 > 0x10FFFF) FINISH(false); + + OUT32(c32); + break; + + default: + assert(!"Invalid state"); + } + + (*p_inbuf)++; + (*p_insz)--; + } + END_CONVERSION; +} + +enum { + EncStart = 0, + Enc1R, + Enc2R, + Enc3R, +}; + +static bool c32toutf8( + char **restrict p_outbuf, + size_t *restrict p_outsz, + const char32_t **restrict p_inbuf, + size_t *restrict p_insz, + mbstate_t *restrict p_s +) +{ + START_CONVERSION + while(*p_outsz) { + char *c8 = *p_outbuf; + switch(state) { + case Enc3R: + *c8 = 0x80 | ((accum >> 12) & 0x3F); + state = Enc2R; + break; + + case Enc2R: + *c8 = 0x80 | ((accum >> 6) & 0x3F); + state = Enc1R; + break; + + case Enc1R: + *c8 = 0x80 | (accum & 0x3F); + state = EncStart; + _PDCLIB_UNDEFINED(accum); + break; + + case EncStart: + if(*p_insz == 0) + FINISH(true); + + accum = **p_inbuf; + (*p_inbuf)++; + (*p_insz)--; + + if(accum <= 0x7F) { + *c8 = accum; + state = EncStart; + _PDCLIB_UNDEFINED(accum); + } else if(accum <= 0x7FF) { + *c8 = 0xC0 | (accum >> 6); + state = Enc1R; + } else if(accum <= 0xFFFF) { + *c8 = 0xE0 | (accum >> 12); + state = Enc2R; + } else if(accum <= 0x10FFFF) { + *c8 = 0xF0 | (accum >> 18); + state = Enc3R; + } else { + FINISH(false); + } + break; + } + + + (*p_outbuf)++; + (*p_outsz)--; + } + END_CONVERSION; +} +#endif + +#ifdef TEST +#include <_PDCLIB_test.h> + +int main( void ) +{ +#ifndef REGTEST + // Valid conversion & back + + static const char* input = "abcde" "\xDF\xBF" "\xEF\xBF\xBF" + "\xF4\x8F\xBF\xBF"; + + char32_t c32out[8]; + + char32_t *c32ptr = &c32out[0]; + size_t c32rem = 8; + char *chrptr = (char*) &input[0]; + size_t chrrem = strlen(input); + mbstate_t mbs = { 0 }; + + TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs)); + TESTCASE(c32rem == 0); + TESTCASE(chrrem == 0); + TESTCASE(c32ptr == &c32out[8]); + TESTCASE(chrptr == &input[strlen(input)]); + TESTCASE(c32out[0] == 'a' && c32out[1] == 'b' && c32out[2] == 'c' && + c32out[3] == 'd' && c32out[4] == 'e' && c32out[5] == 0x7FF && + c32out[6] == 0xFFFF && c32out[7] == 0x10FFFF); + + char chrout[strlen(input)]; + c32ptr = &c32out[0]; + c32rem = 8; + chrptr = &chrout[0]; + chrrem = strlen(input); + TESTCASE(c32toutf8(&chrptr, &chrrem, &c32ptr, &c32rem, &mbs)); + TESTCASE(c32rem == 0); + TESTCASE(chrrem == 0); + TESTCASE(c32ptr == &c32out[8]); + TESTCASE(chrptr == &chrout[strlen(input)]); + TESTCASE(memcmp(chrout, input, strlen(input)) == 0); + + // Multi-part conversion + static const char* mpinput = "\xDF\xBF"; + c32ptr = &c32out[0]; + c32rem = 8; + chrptr = &mpinput[0]; + chrrem = 1; + TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs)); + TESTCASE(c32ptr == &c32out[0]); + TESTCASE(c32rem == 8); + TESTCASE(chrptr == &mpinput[1]); + TESTCASE(chrrem == 0); + chrrem = 1; + TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs)); + TESTCASE(c32ptr == &c32out[1]); + TESTCASE(c32rem == 7); + TESTCASE(chrptr == &mpinput[2]); + TESTCASE(chrrem == 0); + + // Invalid conversions + + // Overlong nuls + const char* nul2 = "\xC0\x80"; + c32ptr = &c32out[0]; + c32rem = 8; + chrptr = &nul2[0]; + chrrem = 2; + TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false); + memset(&mbs, 0, sizeof mbs); + const char* nul3 = "\xE0\x80\x80"; + c32ptr = &c32out[0]; + c32rem = 8; + chrptr = &nul3[0]; + chrrem = 3; + TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false); + memset(&mbs, 0, sizeof mbs); + const char* nul4 = "\xF0\x80\x80\x80"; + c32ptr = &c32out[0]; + c32rem = 8; + chrptr = &nul4[0]; + chrrem = 4; + TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false); + + // Starting on a continuation + const char* cont = "\x80"; + c32ptr = &c32out[0]; + c32rem = 8; + chrptr = &cont[0]; + chrrem = 1; + TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false); +#endif + return TEST_RESULTS; +} + +#endif + -- 2.40.0