]> pd.if.org Git - pdclib.old/blob - opt/basecodecs/_PDCLIB_utf8.c
PDCLIB-2 PDCLIB-12: If the internal character conversion routines are called
[pdclib.old] / opt / basecodecs / _PDCLIB_utf8.c
1 /* UTF-8 codec
2
3    This file is part of the Public Domain C Library (PDCLib).
4    Permission is granted to use, modify, and / or redistribute at will.
5 */
6
7 #ifndef REGTEST
8 #include <stdbool.h>
9 #include <stdint.h>
10 #include <uchar.h>
11 #include <assert.h>
12
13 /* Use of the mbstate:
14  *
15  * _StUC[0] is the current decoding state
16  * _St32[1] is the character accumulated so far
17  */
18
/* Decoder states kept in the conversion state between bytes (and calls).
 * DecNBk: inside an N-byte sequence, the next input byte is its k-th byte.
 */
enum {
    DecStart = 0,   /* no sequence in progress; next byte is a lead byte */

    Dec2B2   = 1,   /* 2-byte sequence, expecting byte 2 (final) */

    Dec3B2   = 2,   /* 3-byte sequence, expecting byte 2 */
    Dec3B3   = 3,   /* 3-byte sequence, expecting byte 3 (final) */

    Dec4B2   = 4,   /* 4-byte sequence, expecting byte 2 */
    Dec4B3   = 5,   /* 4-byte sequence, expecting byte 3 */
    Dec4B4   = 6    /* 4-byte sequence, expecting byte 4 (final) */
};
31
/* Shorthand for the two conversion-state slots this codec uses. Both expand
 * in terms of a local pointer named p_s. */
#define state (p_s->_StUC[0])
#define accum (p_s->_St32[1])

/* Opens a conversion routine: declares the status that END_CONVERSION
 * returns. Used without a trailing semicolon. */
#define START_CONVERSION bool result = true;

/* Closes a conversion routine: defines the end_conversion label that
 * FINISH() jumps to and returns the recorded status. */
#define END_CONVERSION  \
end_conversion:         \
    return result

/* Record `status` as the routine's result and leave through END_CONVERSION. */
#define FINISH(status) do {     \
    result = (status);          \
    goto end_conversion;        \
} while(0)

/* Emit one decoded character and put the decoder back in its start state.
 * With a null p_outbuf nothing is stored, but the character is still counted
 * against *p_outsz. */
#define OUT32(ch)  do {             \
    if(p_outbuf) {                  \
        **p_outbuf = (ch);          \
        ++(*p_outbuf);              \
    }                               \
    (*p_outsz)--;                   \
    _PDCLIB_UNDEFINED(accum);       \
    state = DecStart;               \
} while(0)

/* Return false from the enclosing function unless the current input byte `c`
 * has the continuation form 10xxxxxx. */
#define CHECK_CONTINUATION \
    do { if((c & 0xC0) != 0x80) return false; } while(0)
56
57 static bool utf8toc32(
58     char32_t       **restrict   p_outbuf,
59     size_t          *restrict   p_outsz,
60     const char     **restrict   p_inbuf,
61     size_t          *restrict   p_insz,
62     mbstate_t       *restrict   p_s
63 )
64 {
65     START_CONVERSION
66     while(*p_outsz && *p_insz) {
67         unsigned char c = **p_inbuf;
68         char32_t      c32;
69         switch(state) {
70         case DecStart:
71             // 1 byte 
72             if(c <= 0x7F) {
73                 OUT32(c);
74             } else if(c <= 0xDF) {
75                 accum = (c & 0x1F) << 6;
76                 state = Dec2B2;
77             } else if(c <= 0xEF) {
78                 accum = (c & 0x0F) << 12;
79                 state = Dec3B2;
80             } else if(c <= 0xF4) {
81                 accum = (c & 0x07) << 18;
82                 state = Dec4B2;
83             } else {
84                 // 5+byte sequence illegal
85                 FINISH(false);
86             }
87             break;
88
89         case Dec2B2:
90             CHECK_CONTINUATION;
91
92             c32 = accum | (c & 0x3F);
93
94             // Overlong sequence (e.g. NUL injection)
95             if(c32 <= 0x7F)
96                 FINISH(false);
97
98             OUT32(c32);
99             break;
100
101         case Dec3B2:
102             CHECK_CONTINUATION;
103             accum |= (c & 0x3F) << 6;
104             state = Dec3B3;
105             break;
106
107         case Dec3B3:
108             CHECK_CONTINUATION;
109
110             c32 = accum | (c & 0x3F);
111
112             // Overlong
113             if(c32 <= 0x07FF)
114                 FINISH(false);
115
116             // Surrogate
117             if(c32 >= 0xD800 && c32 <= 0xDFFF)
118                 FINISH(false);
119
120             OUT32(c32);
121             break;
122
123         case Dec4B2:
124             CHECK_CONTINUATION;
125             accum |= (c & 0x3F) << 12;
126             state = Dec4B3;
127             break;
128
129         case Dec4B3:
130             CHECK_CONTINUATION;
131             accum |= (c & 0x3F) << 6;
132             state = Dec4B4;
133             break;
134
135         case Dec4B4:
136             CHECK_CONTINUATION;
137
138             c32 = accum | (c & 0x3F);
139
140             // Overlong
141             if(c32 <= 0xFFFF) FINISH(false);
142
143             // Not in Unicode
144             if(c32 > 0x10FFFF) FINISH(false);
145
146             OUT32(c32);
147             break;
148
149         default:
150             assert(!"Invalid state");
151         }
152
153         (*p_inbuf)++;
154         (*p_insz)--; 
155     }
156     END_CONVERSION;
157 }
158
/* Encoder states kept in the conversion state between output bytes.
 * EncNR: N continuation bytes of the current character are still to be
 * written.
 */
enum {
    EncStart = 0,   /* nothing pending; fetch the next input character */
    Enc1R    = 1,   /* one continuation byte remaining */
    Enc2R    = 2,   /* two continuation bytes remaining */
    Enc3R    = 3,   /* three continuation bytes remaining */
};
165
166 static bool c32toutf8(
167     char           **restrict  p_outbuf,
168     size_t          *restrict  p_outsz,
169     const char32_t **restrict  p_inbuf,
170     size_t          *restrict  p_insz,
171     mbstate_t       *restrict  p_s
172 )
173 {
174     START_CONVERSION
175     while(*p_outsz) {
176         unsigned char outc;
177         switch(state) {
178         case Enc3R:
179             outc = 0x80 | ((accum >> 12) & 0x3F);
180             state = Enc2R;
181             break;
182
183         case Enc2R:
184             outc = 0x80 | ((accum >> 6) & 0x3F);
185             state = Enc1R;
186             break;
187
188         case Enc1R:
189             outc = 0x80 | (accum & 0x3F);
190             state = EncStart;
191             _PDCLIB_UNDEFINED(accum);
192             break;
193
194         case EncStart:
195             if(*p_insz == 0)
196                 FINISH(true);
197
198             accum  = **p_inbuf;
199             (*p_inbuf)++;
200             (*p_insz)--;
201
202             if(accum <= 0x7F) {
203                 outc = accum;
204                 state = EncStart;
205                 _PDCLIB_UNDEFINED(accum);
206             } else if(accum <= 0x7FF) {
207                 outc = 0xC0 | (accum >> 6);
208                 state = Enc1R;
209             } else if(accum <= 0xFFFF) {
210                 outc = 0xE0 | (accum >> 12);
211                 state = Enc2R;
212             } else if(accum <= 0x10FFFF) {
213                 outc = 0xF0 | (accum >> 18);
214                 state = Enc3R;
215             } else {
216                 FINISH(false);
217             }
218             break;
219         }
220
221         if(p_outbuf) {
222             **p_outbuf = outc;
223             (*p_outbuf)++; 
224         }
225         (*p_outsz)--;        
226     }
227     END_CONVERSION;
228 }
229
230 _PDCLIB_charcodec _PDCLIB_utf8_codec = {
231     .__mbstoc32s = utf8toc32,
232     .__c32stombs = c32toutf8,
233 };
234
235 #endif
236
237 #ifdef TEST
238 #include <_PDCLIB_test.h>
239
240 int main( void )
241 {
242 #ifndef REGTEST
243     // Valid conversion & back
244
245     static const char* input = "abcde" "\xDF\xBF" "\xEF\xBF\xBF" 
246                                "\xF4\x8F\xBF\xBF";
247
248     char32_t c32out[8];
249
250     char32_t *c32ptr = &c32out[0];
251     size_t    c32rem = 8;
252     char     *chrptr = (char*) &input[0];
253     size_t    chrrem = strlen(input);
254     mbstate_t mbs = { 0 };
255
256     TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs));
257     TESTCASE(c32rem == 0);
258     TESTCASE(chrrem == 0);
259     TESTCASE(c32ptr == &c32out[8]);
260     TESTCASE(chrptr == &input[strlen(input)]);
261     TESTCASE(c32out[0] == 'a' && c32out[1] == 'b' && c32out[2] == 'c' &&
262              c32out[3] == 'd' && c32out[4] == 'e' && c32out[5] == 0x7FF &&
263              c32out[6] == 0xFFFF && c32out[7] == 0x10FFFF);
264
265     char chrout[strlen(input)];
266     c32ptr = &c32out[0];
267     c32rem = 8;
268     chrptr = &chrout[0];
269     chrrem = strlen(input);
270     TESTCASE(c32toutf8(&chrptr, &chrrem, &c32ptr, &c32rem, &mbs));
271     TESTCASE(c32rem == 0);
272     TESTCASE(chrrem == 0);
273     TESTCASE(c32ptr == &c32out[8]);
274     TESTCASE(chrptr == &chrout[strlen(input)]);
275     TESTCASE(memcmp(chrout, input, strlen(input)) == 0);
276
277     // Multi-part conversion
278     static const char* mpinput = "\xDF\xBF";
279     c32ptr = &c32out[0];
280     c32rem = 8;
281     chrptr = &mpinput[0];
282     chrrem = 1;
283     TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs));
284     TESTCASE(c32ptr == &c32out[0]);
285     TESTCASE(c32rem == 8);
286     TESTCASE(chrptr == &mpinput[1]);
287     TESTCASE(chrrem == 0);
288     chrrem = 1;
289     TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs));
290     TESTCASE(c32ptr == &c32out[1]);
291     TESTCASE(c32rem == 7);
292     TESTCASE(chrptr == &mpinput[2]);
293     TESTCASE(chrrem == 0);
294
295     // Invalid conversions
296
297     // Overlong nuls
298     const char* nul2 = "\xC0\x80";
299     c32ptr = &c32out[0];
300     c32rem = 8;
301     chrptr = &nul2[0];
302     chrrem = 2;
303     TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);
304     memset(&mbs, 0, sizeof mbs);
305     const char* nul3 = "\xE0\x80\x80";
306     c32ptr = &c32out[0];
307     c32rem = 8;
308     chrptr = &nul3[0];
309     chrrem = 3;
310     TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);
311     memset(&mbs, 0, sizeof mbs);
312     const char* nul4 = "\xF0\x80\x80\x80";
313     c32ptr = &c32out[0];
314     c32rem = 8;
315     chrptr = &nul4[0];
316     chrrem = 4;
317     TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);
318
319     // Starting on a continuation
320     const char* cont = "\x80";
321     c32ptr = &c32out[0];
322     c32rem = 8;
323     chrptr = &cont[0];
324     chrrem = 1;
325     TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);
326 #endif
327     return TEST_RESULTS;
328 }
329
330 #endif
331