]> pd.if.org Git - pdclib.old/blob - opt/basecodecs/_PDCLIB_utf8.c
65f76e01470ba3c337382e15c322e847531ffd6d
[pdclib.old] / opt / basecodecs / _PDCLIB_utf8.c
1 /* UTF-8 codec
2
3    This file is part of the Public Domain C Library (PDCLib).
4    Permission is granted to use, modify, and / or redistribute at will.
5 */
6
7 #ifndef REGTEST
8 #include <stdbool.h>
9 #include <stdint.h>
10 #include <uchar.h>
11 #include <assert.h>
12 #include <_PDCLIB_encoding.h>
13
14 /* Use of the mbstate:
15  *
16  * _StUC[0] is the current decoding state
17  * _St32[1] is the character accumulated so far
18  */
19
/* Decoder states stored in the mbstate: each names the byte of a multi-byte
 * sequence that is expected next (DecNBk = byte k of an N-byte sequence). */
enum {
    DecStart = 0,   /* no sequence in progress; next byte is a lead byte */

    Dec2B2   = 1,   /* 2-byte sequence: final byte expected */

    Dec3B2   = 2,   /* 3-byte sequence: second byte expected */
    Dec3B3   = 3,   /* 3-byte sequence: final byte expected */

    Dec4B2   = 4,   /* 4-byte sequence: second byte expected */
    Dec4B3   = 5,   /* 4-byte sequence: third byte expected */
    Dec4B4   = 6    /* 4-byte sequence: final byte expected */
};
32
/* Shorthand for the two mbstate slots used by this codec. Both expand to
 * lvalues and expect a pointer named p_s to be in scope. */
#define state (p_s->_StUC[0])
#define accum (p_s->_St32[1])

/* Declares the conversion function's return value (defaults to success).
 * Must be paired with END_CONVERSION at the end of the same function. */
#define START_CONVERSION \
    bool          result = true;

/* Common exit point: the label FINISH() jumps to, followed by the return. */
#define END_CONVERSION \
end_conversion: \
    return result

/* Set the return value to _r and leave through the common exit point. */
#define FINISH(_r) do { \
    result = (_r); \
    goto end_conversion; \
} while(0)

/* Emit one decoded code point: store it when an output buffer was supplied,
 * account for it in the remaining output size either way, then return the
 * decoder to its start state. */
#define OUT32(_c) do { \
    if(p_outbuf) \
        (*((*p_outbuf)++)) = (_c); \
    (*p_outsz)--; \
    _PDCLIB_UNDEFINED(accum); \
    state = DecStart; \
} while(0)

/* Fail the conversion unless the current input byte (a local named c) has
 * the 10xxxxxx bit pattern of a UTF-8 continuation byte. */
#define CHECK_CONTINUATION do { \
    if((c & 0xC0) != 0x80) \
        FINISH(false); \
} while(0)
57
58 static bool utf8toc32(
59     char32_t       **restrict   p_outbuf,
60     size_t          *restrict   p_outsz,
61     const char     **restrict   p_inbuf,
62     size_t          *restrict   p_insz,
63     mbstate_t       *restrict   p_s
64 )
65 {
66     START_CONVERSION
67     while(*p_outsz && *p_insz) {
68         unsigned char c = **p_inbuf;
69         char32_t      c32;
70         switch(state) {
71         case DecStart:
72             // 1 byte 
73             if(c <= 0x7F) {
74                 OUT32(c);
75             } else if(c <= 0xDF) {
76                 accum = (c & 0x1F) << 6;
77                 state = Dec2B2;
78             } else if(c <= 0xEF) {
79                 accum = (c & 0x0F) << 12;
80                 state = Dec3B2;
81             } else if(c <= 0xF4) {
82                 accum = (c & 0x07) << 18;
83                 state = Dec4B2;
84             } else {
85                 // 5+byte sequence illegal
86                 FINISH(false);
87             }
88             break;
89
90         case Dec2B2:
91             CHECK_CONTINUATION;
92
93             c32 = accum | (c & 0x3F);
94
95             // Overlong sequence (e.g. NUL injection)
96             if(c32 <= 0x7F)
97                 FINISH(false);
98
99             OUT32(c32);
100             break;
101
102         case Dec3B2:
103             CHECK_CONTINUATION;
104             accum |= (c & 0x3F) << 6;
105             state = Dec3B3;
106             break;
107
108         case Dec3B3:
109             CHECK_CONTINUATION;
110
111             c32 = accum | (c & 0x3F);
112
113             // Overlong
114             if(c32 <= 0x07FF)
115                 FINISH(false);
116
117             // Surrogate
118             if(c32 >= 0xD800 && c32 <= 0xDFFF)
119                 FINISH(false);
120
121             OUT32(c32);
122             break;
123
124         case Dec4B2:
125             CHECK_CONTINUATION;
126             accum |= (c & 0x3F) << 12;
127             state = Dec4B3;
128             break;
129
130         case Dec4B3:
131             CHECK_CONTINUATION;
132             accum |= (c & 0x3F) << 6;
133             state = Dec4B4;
134             break;
135
136         case Dec4B4:
137             CHECK_CONTINUATION;
138
139             c32 = accum | (c & 0x3F);
140
141             // Overlong
142             if(c32 <= 0xFFFF) FINISH(false);
143
144             // Not in Unicode
145             if(c32 > 0x10FFFF) FINISH(false);
146
147             OUT32(c32);
148             break;
149
150         default:
151             assert(!"Invalid state");
152         }
153
154         (*p_inbuf)++;
155         (*p_insz)--; 
156     }
157     END_CONVERSION;
158 }
159
/* Encoder states stored in the mbstate: EncNR means N continuation bytes of
 * the current code point still have to be written. */
enum {
    EncStart = 0,   /* no code point in progress */
    Enc1R    = 1,   /* one continuation byte remaining */
    Enc2R    = 2,   /* two continuation bytes remaining */
    Enc3R    = 3,   /* three continuation bytes remaining */
};
166
167 static bool c32toutf8(
168     char           **restrict  p_outbuf,
169     size_t          *restrict  p_outsz,
170     const char32_t **restrict  p_inbuf,
171     size_t          *restrict  p_insz,
172     mbstate_t       *restrict  p_s
173 )
174 {
175     START_CONVERSION
176     while(*p_outsz) {
177         unsigned char outc;
178         switch(state) {
179         case Enc3R:
180             outc = 0x80 | ((accum >> 12) & 0x3F);
181             state = Enc2R;
182             break;
183
184         case Enc2R:
185             outc = 0x80 | ((accum >> 6) & 0x3F);
186             state = Enc1R;
187             break;
188
189         case Enc1R:
190             outc = 0x80 | (accum & 0x3F);
191             state = EncStart;
192             _PDCLIB_UNDEFINED(accum);
193             break;
194
195         case EncStart:
196             if(*p_insz == 0)
197                 FINISH(true);
198
199             accum  = **p_inbuf;
200             (*p_inbuf)++;
201             (*p_insz)--;
202
203             if(accum <= 0x7F) {
204                 outc = accum;
205                 state = EncStart;
206                 _PDCLIB_UNDEFINED(accum);
207             } else if(accum <= 0x7FF) {
208                 outc = 0xC0 | (accum >> 6);
209                 state = Enc1R;
210             } else if(accum <= 0xFFFF) {
211                 outc = 0xE0 | (accum >> 12);
212                 state = Enc2R;
213             } else if(accum <= 0x10FFFF) {
214                 outc = 0xF0 | (accum >> 18);
215                 state = Enc3R;
216             } else {
217                 FINISH(false);
218             }
219             break;
220         }
221
222         if(p_outbuf) {
223             **p_outbuf = outc;
224             (*p_outbuf)++; 
225         }
226         (*p_outsz)--;        
227     }
228     END_CONVERSION;
229 }
230
231 _PDCLIB_charcodec_t _PDCLIB_utf8_codec = {
232     .__mbstoc32s = utf8toc32,
233     .__c32stombs = c32toutf8,
234 };
235
236 #endif
237
238 #ifdef TEST
239 #include <_PDCLIB_test.h>
240
241 int main( void )
242 {
243 #ifndef REGTEST
244     // Valid conversion & back
245
246     static const char* input = "abcde" "\xDF\xBF" "\xEF\xBF\xBF" 
247                                "\xF4\x8F\xBF\xBF";
248
249     char32_t c32out[8];
250
251     char32_t *c32ptr = &c32out[0];
252     size_t    c32rem = 8;
253     char     *chrptr = (char*) &input[0];
254     size_t    chrrem = strlen(input);
255     mbstate_t mbs = { 0 };
256
257     TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs));
258     TESTCASE(c32rem == 0);
259     TESTCASE(chrrem == 0);
260     TESTCASE(c32ptr == &c32out[8]);
261     TESTCASE(chrptr == &input[strlen(input)]);
262     TESTCASE(c32out[0] == 'a' && c32out[1] == 'b' && c32out[2] == 'c' &&
263              c32out[3] == 'd' && c32out[4] == 'e' && c32out[5] == 0x7FF &&
264              c32out[6] == 0xFFFF && c32out[7] == 0x10FFFF);
265
266     char chrout[strlen(input)];
267     c32ptr = &c32out[0];
268     c32rem = 8;
269     chrptr = &chrout[0];
270     chrrem = strlen(input);
271     TESTCASE(c32toutf8(&chrptr, &chrrem, &c32ptr, &c32rem, &mbs));
272     TESTCASE(c32rem == 0);
273     TESTCASE(chrrem == 0);
274     TESTCASE(c32ptr == &c32out[8]);
275     TESTCASE(chrptr == &chrout[strlen(input)]);
276     TESTCASE(memcmp(chrout, input, strlen(input)) == 0);
277
278     // Multi-part conversion
279     static const char* mpinput = "\xDF\xBF";
280     c32ptr = &c32out[0];
281     c32rem = 8;
282     chrptr = &mpinput[0];
283     chrrem = 1;
284     TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs));
285     TESTCASE(c32ptr == &c32out[0]);
286     TESTCASE(c32rem == 8);
287     TESTCASE(chrptr == &mpinput[1]);
288     TESTCASE(chrrem == 0);
289     chrrem = 1;
290     TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs));
291     TESTCASE(c32ptr == &c32out[1]);
292     TESTCASE(c32rem == 7);
293     TESTCASE(chrptr == &mpinput[2]);
294     TESTCASE(chrrem == 0);
295
296     // Invalid conversions
297
298     // Overlong nuls
299     const char* nul2 = "\xC0\x80";
300     c32ptr = &c32out[0];
301     c32rem = 8;
302     chrptr = &nul2[0];
303     chrrem = 2;
304     TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);
305     memset(&mbs, 0, sizeof mbs);
306     const char* nul3 = "\xE0\x80\x80";
307     c32ptr = &c32out[0];
308     c32rem = 8;
309     chrptr = &nul3[0];
310     chrrem = 3;
311     TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);
312     memset(&mbs, 0, sizeof mbs);
313     const char* nul4 = "\xF0\x80\x80\x80";
314     c32ptr = &c32out[0];
315     c32rem = 8;
316     chrptr = &nul4[0];
317     chrrem = 4;
318     TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);
319
320     // Starting on a continuation
321     const char* cont = "\x80";
322     c32ptr = &c32out[0];
323     c32rem = 8;
324     chrptr = &cont[0];
325     chrrem = 1;
326     TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);
327 #endif
328     return TEST_RESULTS;
329 }
330
331 #endif
332