]> pd.if.org Git - pdclib/blobdiff - opt/basecodecs/_PDCLIB_utf8.c
Moved base codecs to main codebase.
[pdclib] / opt / basecodecs / _PDCLIB_utf8.c
diff --git a/opt/basecodecs/_PDCLIB_utf8.c b/opt/basecodecs/_PDCLIB_utf8.c
deleted file mode 100644 (file)
index a4c19ad..0000000
+++ /dev/null
@@ -1,338 +0,0 @@
-/* UTF-8 codec
-
-   This file is part of the Public Domain C Library (PDCLib).
-   Permission is granted to use, modify, and / or redistribute at will.
-*/
-
-#ifndef REGTEST
-#include <stdbool.h>
-#include <stdint.h>
-#include <uchar.h>
-#include <assert.h>
-#include "_PDCLIB_encoding.h"
-
-/* Use of the mbstate:
- *
- * _StUC[0] is the current conversion state (decoder or encoder)
- * _St32[1] is the character accumulated so far
- */
-
-/* Report whether *p_s is in the initial conversion state, i.e. whether the
- * state byte _StUC[0] is zero (the value of both DecStart and EncStart).
- */
-static bool utf8_mbsinit( const mbstate_t *p_s )
-{
-    const bool is_initial = ( p_s->_StUC[0] == 0 );
-    return is_initial;
-}
-
-/* Decoder states kept in the mbstate between calls to utf8toc32.
- * "DecNBk" means: inside an N-byte sequence, byte number k is expected next.
- */
-enum {
-    DecStart = 0,   /* no sequence in progress; a lead byte is expected */
-
-    Dec2B2,         /* 2-byte sequence, final byte expected */
-
-    Dec3B2,         /* 3-byte sequence, second byte expected */
-    Dec3B3,         /* 3-byte sequence, final byte expected */
-
-    Dec4B2,         /* 4-byte sequence, second byte expected */
-    Dec4B3,         /* 4-byte sequence, third byte expected */
-    Dec4B4          /* 4-byte sequence, final byte expected */
-};
-
-/* Shorthands for the two mbstate fields shared by both converters.
- * They expect a pointer named p_s to be in scope.
- */
-#define state (p_s->_StUC[0])
-#define accum (p_s->_St32[1])
-
-/* Declares the function result; it stays true unless FINISH() changes it. */
-#define START_CONVERSION \
-    bool result = true;
-
-/* Single exit point that FINISH() jumps to. */
-#define END_CONVERSION \
-end_conversion: \
-    return result
-
-/* Leave the conversion immediately, returning _r. */
-#define FINISH(_r) do { \
-    result = (_r); \
-    goto end_conversion; \
-} while(0)
-
-/* Emit one decoded code point: store it unless p_outbuf is NULL, count it
- * against *p_outsz either way, then return the decoder to DecStart.
- */
-#define OUT32(_c) do { \
-    if(p_outbuf) { \
-        **p_outbuf = (_c); \
-        ++*p_outbuf; \
-    } \
-    --*p_outsz; \
-    _PDCLIB_UNDEFINED(accum); \
-    state = DecStart; \
-} while(0)
-
-/* Fail the conversion unless byte c has the 10xxxxxx continuation form. */
-#define CHECK_CONTINUATION \
-    do { if((c & 0xC0) != 0x80) FINISH(false); } while(0)
-
-/* Decode UTF-8 bytes into UTF-32 code points.
- *
- * Reads bytes from *p_inbuf (advancing it and decrementing *p_insz) and emits
- * code points through *p_outbuf (advancing it and decrementing *p_outsz) until
- * either count reaches zero. If p_outbuf is NULL, code points are counted
- * against *p_outsz but not stored. A sequence that is split across calls is
- * resumed from the state kept in *p_s.
- *
- * Returns true if the loop ran to completion and false on malformed input:
- * a continuation byte (0x80..0xBF) or a byte above 0xF4 where a lead byte is
- * expected, a non-continuation byte inside a sequence, an overlong encoding,
- * a surrogate, or a value above 0x10FFFF. The offending byte is not consumed.
- */
-static bool utf8toc32(
-    char32_t       *restrict *restrict   p_outbuf,
-    size_t                   *restrict   p_outsz,
-    const char     *restrict *restrict   p_inbuf,
-    size_t                   *restrict   p_insz,
-    mbstate_t                *restrict   p_s
-)
-{
-    START_CONVERSION
-    while(*p_outsz && *p_insz) {
-        unsigned char c = **p_inbuf;
-        char32_t      c32;
-        switch(state) {
-        case DecStart:
-            if(c <= 0x7F) {
-                // 1 byte
-                OUT32(c);
-            } else if(c <= 0xBF) {
-                // Continuation byte with no sequence in progress. Without
-                // this check it would be taken for a 2-byte lead below.
-                FINISH(false);
-            } else if(c <= 0xDF) {
-                // 2-byte lead: 110xxxxx carries the top 5 payload bits
-                accum = (c & 0x1F) << 6;
-                state = Dec2B2;
-            } else if(c <= 0xEF) {
-                // 3-byte lead: 1110xxxx carries the top 4 payload bits
-                accum = (c & 0x0F) << 12;
-                state = Dec3B2;
-            } else if(c <= 0xF4) {
-                // 4-byte lead: 11110xxx carries the top 3 payload bits
-                accum = (c & 0x07) << 18;
-                state = Dec4B2;
-            } else {
-                // 5+byte sequence illegal
-                FINISH(false);
-            }
-            break;
-
-        case Dec2B2:
-            CHECK_CONTINUATION;
-
-            c32 = accum | (c & 0x3F);
-
-            // Overlong sequence (e.g. NUL injection)
-            if(c32 <= 0x7F)
-                FINISH(false);
-
-            OUT32(c32);
-            break;
-
-        case Dec3B2:
-            CHECK_CONTINUATION;
-            accum |= (c & 0x3F) << 6;
-            state = Dec3B3;
-            break;
-
-        case Dec3B3:
-            CHECK_CONTINUATION;
-
-            c32 = accum | (c & 0x3F);
-
-            // Overlong
-            if(c32 <= 0x07FF)
-                FINISH(false);
-
-            // Surrogate
-            if(c32 >= 0xD800 && c32 <= 0xDFFF)
-                FINISH(false);
-
-            OUT32(c32);
-            break;
-
-        case Dec4B2:
-            CHECK_CONTINUATION;
-            accum |= (c & 0x3F) << 12;
-            state = Dec4B3;
-            break;
-
-        case Dec4B3:
-            CHECK_CONTINUATION;
-            accum |= (c & 0x3F) << 6;
-            state = Dec4B4;
-            break;
-
-        case Dec4B4:
-            CHECK_CONTINUATION;
-
-            c32 = accum | (c & 0x3F);
-
-            // Overlong
-            if(c32 <= 0xFFFF) FINISH(false);
-
-            // Not in Unicode
-            if(c32 > 0x10FFFF) FINISH(false);
-
-            OUT32(c32);
-            break;
-
-        default:
-            assert(!"Invalid state");
-            // With assertions compiled out, still refuse to decode from a
-            // state value this function never produces.
-            FINISH(false);
-        }
-
-        // Only reached when the byte was accepted
-        (*p_inbuf)++;
-        (*p_insz)--;
-    }
-    END_CONVERSION;
-}
-
-/* Encoder states kept in the mbstate between calls to c32toutf8.
- * "EncNR" means: N continuation bytes of the current character remain to be
- * written.
- */
-enum {
-    EncStart = 0,   /* no character in progress; read the next code point */
-    Enc1R,          /* one continuation byte left */
-    Enc2R,          /* two continuation bytes left */
-    Enc3R,          /* three continuation bytes left */
-};
-
-/* Encode UTF-32 code points as UTF-8 bytes.
- *
- * Writes one byte per loop iteration through *p_outbuf (advancing it and
- * decrementing *p_outsz) for as long as *p_outsz is non-zero. If p_outbuf is
- * NULL, bytes are counted against *p_outsz but not stored. Code points are
- * taken from *p_inbuf (advancing it and decrementing *p_insz). A character
- * whose bytes do not all fit is resumed on the next call from the state and
- * accumulator kept in *p_s.
- *
- * Returns true when the input is exhausted at a character boundary or the
- * output space runs out. Returns false if the next code point is a surrogate
- * (0xD800..0xDFFF) or lies above 0x10FFFF; that code point is not consumed
- * and nothing is written for it. Also returns false on an unknown state.
- */
-static bool c32toutf8(
-    char           *restrict *restrict  p_outbuf,
-    size_t                   *restrict  p_outsz,
-    const char32_t *restrict *restrict  p_inbuf,
-    size_t                   *restrict  p_insz,
-    mbstate_t                *restrict  p_s
-)
-{
-    START_CONVERSION
-    while(*p_outsz) {
-        unsigned char outc = 0;
-        switch(state) {
-        case Enc3R:
-            // Second byte of a 4-byte sequence: payload bits 17..12
-            outc = 0x80 | ((accum >> 12) & 0x3F);
-            state = Enc2R;
-            break;
-
-        case Enc2R:
-            // Payload bits 11..6
-            outc = 0x80 | ((accum >> 6) & 0x3F);
-            state = Enc1R;
-            break;
-
-        case Enc1R:
-            // Final byte: payload bits 5..0
-            outc = 0x80 | (accum & 0x3F);
-            state = EncStart;
-            _PDCLIB_UNDEFINED(accum);
-            break;
-
-        case EncStart: {
-            if(*p_insz == 0)
-                FINISH(true);
-
-            // Validate before consuming so that a failure leaves the input
-            // cursor on the offending code point, as the decoder does.
-            const char32_t next = **p_inbuf;
-
-            // Surrogates and values beyond U+10FFFF have no UTF-8 encoding
-            if((next >= 0xD800 && next <= 0xDFFF) || next > 0x10FFFF)
-                FINISH(false);
-
-            accum = next;
-            (*p_inbuf)++;
-            (*p_insz)--;
-
-            if(accum <= 0x7F) {
-                outc = accum;
-                state = EncStart;
-                _PDCLIB_UNDEFINED(accum);
-            } else if(accum <= 0x7FF) {
-                outc = 0xC0 | (accum >> 6);
-                state = Enc1R;
-            } else if(accum <= 0xFFFF) {
-                outc = 0xE0 | (accum >> 12);
-                state = Enc2R;
-            } else {
-                // 0x10000..0x10FFFF, guaranteed by the range check above
-                outc = 0xF0 | (accum >> 18);
-                state = Enc3R;
-            }
-            break;
-        }
-
-        default:
-            assert(!"Invalid state");
-            // With assertions compiled out, do not emit bytes from a state
-            // value this function never produces.
-            FINISH(false);
-        }
-
-        if(p_outbuf) {
-            **p_outbuf = outc;
-            (*p_outbuf)++;
-        }
-        (*p_outsz)--;
-    }
-    END_CONVERSION;
-}
-
-/* Codec descriptor that exposes the three UTF-8 routines defined in this
- * file through struct _PDCLIB_charcodec_t.
- */
-const struct _PDCLIB_charcodec_t _PDCLIB_utf8_codec = {
-    .__mbsinit   = utf8_mbsinit,  /* initial-state test */
-    .__mbstoc32s = utf8toc32,     /* UTF-8 bytes -> UTF-32 code points */
-    .__c32stombs = c32toutf8,     /* UTF-32 code points -> UTF-8 bytes */
-    .__mb_max    = 4,             /* c32toutf8 emits at most 4 bytes per code
-                                     point; presumably the per-character
-                                     maximum - confirm in _PDCLIB_encoding.h */
-};
-
-#endif
-
-#ifdef TEST
-#include "_PDCLIB_test.h"
-
-/* Test driver for the UTF-8 codec: round-trips a string covering 1- to 4-byte
- * sequences, decodes a 2-byte sequence fed one byte at a time, and checks
- * that overlong NULs and a leading continuation byte are rejected.
- * The body is compiled out when REGTEST is defined.
- */
-int main( void )
-{
-#ifndef REGTEST
-    // Valid conversion & back
-
-    static const char* input = "abcde" "\xDF\xBF" "\xEF\xBF\xBF"
-                               "\xF4\x8F\xBF\xBF";
-
-    char32_t c32out[8];
-
-    char32_t   *c32ptr = &c32out[0];
-    size_t      c32rem = 8;
-    const char *chrptr = &input[0];
-    size_t      chrrem = strlen(input);
-    mbstate_t   mbs = { 0 };
-
-    TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs));
-    TESTCASE(c32rem == 0);
-    TESTCASE(chrrem == 0);
-    TESTCASE(c32ptr == &c32out[8]);
-    TESTCASE(chrptr == &input[strlen(input)]);
-    TESTCASE(c32out[0] == 'a' && c32out[1] == 'b' && c32out[2] == 'c' &&
-             c32out[3] == 'd' && c32out[4] == 'e' && c32out[5] == 0x7FF &&
-             c32out[6] == 0xFFFF && c32out[7] == 0x10FFFF);
-
-    // Encode back. c32toutf8 writes through a char ** and reads through a
-    // const char32_t **, so it gets its own correctly typed cursors instead
-    // of reusing the decoder's.
-    char            chrout[strlen(input)];
-    char           *outptr   = &chrout[0];
-    size_t          outrem   = strlen(input);
-    const char32_t *c32in    = &c32out[0];
-    size_t          c32inrem = 8;
-    TESTCASE(c32toutf8(&outptr, &outrem, &c32in, &c32inrem, &mbs));
-    TESTCASE(c32inrem == 0);
-    TESTCASE(outrem == 0);
-    TESTCASE(c32in == &c32out[8]);
-    TESTCASE(outptr == &chrout[strlen(input)]);
-    TESTCASE(memcmp(chrout, input, strlen(input)) == 0);
-
-    // Multi-part conversion
-    static const char* mpinput = "\xDF\xBF";
-    c32ptr = &c32out[0];
-    c32rem = 8;
-    chrptr = &mpinput[0];
-    chrrem = 1;
-    // First byte alone: consumed, nothing emitted yet
-    TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs));
-    TESTCASE(c32ptr == &c32out[0]);
-    TESTCASE(c32rem == 8);
-    TESTCASE(chrptr == &mpinput[1]);
-    TESTCASE(chrrem == 0);
-    // Second byte completes the character
-    chrrem = 1;
-    TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs));
-    TESTCASE(c32ptr == &c32out[1]);
-    TESTCASE(c32rem == 7);
-    TESTCASE(chrptr == &mpinput[2]);
-    TESTCASE(chrrem == 0);
-
-    // Invalid conversions
-
-    // Overlong nuls
-    const char* nul2 = "\xC0\x80";
-    c32ptr = &c32out[0];
-    c32rem = 8;
-    chrptr = &nul2[0];
-    chrrem = 2;
-    TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);
-    memset(&mbs, 0, sizeof mbs);
-    const char* nul3 = "\xE0\x80\x80";
-    c32ptr = &c32out[0];
-    c32rem = 8;
-    chrptr = &nul3[0];
-    chrrem = 3;
-    TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);
-    memset(&mbs, 0, sizeof mbs);
-    const char* nul4 = "\xF0\x80\x80\x80";
-    c32ptr = &c32out[0];
-    c32rem = 8;
-    chrptr = &nul4[0];
-    chrrem = 4;
-    TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);
-
-    // Starting on a continuation
-    // NOTE(review): mbs is not reset here, so this byte is decoded in the
-    // state the failed conversion above left behind rather than from the
-    // initial state - confirm that is what this case means to exercise.
-    const char* cont = "\x80";
-    c32ptr = &c32out[0];
-    c32rem = 8;
-    chrptr = &cont[0];
-    chrrem = 1;
-    TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);
-#endif
-    return TEST_RESULTS;
-}
-
-#endif
-