]> pd.if.org Git - pdclib/blob - opt/basecodecs/_PDCLIB_utf8.c
PDCLIB-3 Add XDIGIT to list of bits in Unicode character data
[pdclib] / opt / basecodecs / _PDCLIB_utf8.c
1 /* UTF-8 codec
2
3    This file is part of the Public Domain C Library (PDCLib).
4    Permission is granted to use, modify, and / or redistribute at will.
5 */
6
7 #ifndef REGTEST
8 #include <stdbool.h>
9 #include <stdint.h>
10 #include <uchar.h>
11 #include <assert.h>
12 #include <_PDCLIB_encoding.h>
13
14 /* Use of the mbstate:
15  *
16  * _StUC[0] is the current decoding state
17  * _St32[1] is the character accumulated so far
18  */
19
/* Decoder states. A state named DecNBk means "inside an N-byte sequence,
 * waiting for byte number k". The numeric values are stored in the
 * conversion state object, so they are spelled out explicitly. */
enum {
    DecStart = 0,   /* between characters, expecting a lead byte */

    Dec2B2   = 1,   /* 2-byte sequence, expecting byte 2 */

    Dec3B2   = 2,   /* 3-byte sequence, expecting byte 2 */
    Dec3B3   = 3,   /* 3-byte sequence, expecting byte 3 */

    Dec4B2   = 4,   /* 4-byte sequence, expecting byte 2 */
    Dec4B3   = 5,   /* 4-byte sequence, expecting byte 3 */
    Dec4B4   = 6    /* 4-byte sequence, expecting byte 4 */
};
32
/* Shorthand for the two conversion-state fields used by this codec.
 * Both expand to expressions on a variable named p_s, which must be in
 * scope wherever they are used. */
#define state (p_s->_StUC[0])
#define accum (p_s->_St32[1])

/* Declares the function-wide result flag (initially "success").
 * Used without a trailing semicolon; pairs with END_CONVERSION. */
#define START_CONVERSION bool result = true;

/* Single exit point of a conversion function; FINISH() jumps here. */
#define END_CONVERSION  \
end_conversion:         \
    return result

/* Record the result value and leave through the common exit point. */
#define FINISH(res_)            \
    do {                        \
        result = (res_);        \
        goto end_conversion;    \
    } while(0)

/* Emit one decoded code point: store it if an output buffer was supplied,
 * charge it against the remaining output space either way, and return the
 * decoder to its start state. Expects p_outbuf and p_outsz in scope. */
#define OUT32(cp_)                  \
    do {                            \
        if(p_outbuf) {              \
            **p_outbuf = (cp_);     \
            ++*p_outbuf;            \
        }                           \
        --*p_outsz;                 \
        _PDCLIB_UNDEFINED(accum);   \
        state = DecStart;           \
    } while(0)

/* Return false from the enclosing function unless the current input
 * byte c has the continuation-byte bit pattern 10xxxxxx. */
#define CHECK_CONTINUATION              \
    do {                                \
        if((c & 0xC0) != 0x80)          \
            return false;               \
    } while(0)
57
58 static bool utf8toc32(
59     char32_t       *restrict *restrict   p_outbuf,
60     size_t                   *restrict   p_outsz,
61     const char     *restrict *restrict   p_inbuf,
62     size_t                   *restrict   p_insz,
63     mbstate_t                *restrict   p_s
64 )
65 {
66     START_CONVERSION
67     while(*p_outsz && *p_insz) {
68         unsigned char c = **p_inbuf;
69         char32_t      c32;
70         switch(state) {
71         case DecStart:
72             // 1 byte 
73             if(c <= 0x7F) {
74                 OUT32(c);
75             } else if(c <= 0xDF) {
76                 accum = (c & 0x1F) << 6;
77                 state = Dec2B2;
78             } else if(c <= 0xEF) {
79                 accum = (c & 0x0F) << 12;
80                 state = Dec3B2;
81             } else if(c <= 0xF4) {
82                 accum = (c & 0x07) << 18;
83                 state = Dec4B2;
84             } else {
85                 // 5+byte sequence illegal
86                 FINISH(false);
87             }
88             break;
89
90         case Dec2B2:
91             CHECK_CONTINUATION;
92
93             c32 = accum | (c & 0x3F);
94
95             // Overlong sequence (e.g. NUL injection)
96             if(c32 <= 0x7F)
97                 FINISH(false);
98
99             OUT32(c32);
100             break;
101
102         case Dec3B2:
103             CHECK_CONTINUATION;
104             accum |= (c & 0x3F) << 6;
105             state = Dec3B3;
106             break;
107
108         case Dec3B3:
109             CHECK_CONTINUATION;
110
111             c32 = accum | (c & 0x3F);
112
113             // Overlong
114             if(c32 <= 0x07FF)
115                 FINISH(false);
116
117             // Surrogate
118             if(c32 >= 0xD800 && c32 <= 0xDFFF)
119                 FINISH(false);
120
121             OUT32(c32);
122             break;
123
124         case Dec4B2:
125             CHECK_CONTINUATION;
126             accum |= (c & 0x3F) << 12;
127             state = Dec4B3;
128             break;
129
130         case Dec4B3:
131             CHECK_CONTINUATION;
132             accum |= (c & 0x3F) << 6;
133             state = Dec4B4;
134             break;
135
136         case Dec4B4:
137             CHECK_CONTINUATION;
138
139             c32 = accum | (c & 0x3F);
140
141             // Overlong
142             if(c32 <= 0xFFFF) FINISH(false);
143
144             // Not in Unicode
145             if(c32 > 0x10FFFF) FINISH(false);
146
147             OUT32(c32);
148             break;
149
150         default:
151             assert(!"Invalid state");
152         }
153
154         (*p_inbuf)++;
155         (*p_insz)--; 
156     }
157     END_CONVERSION;
158 }
159
/* Encoder states. EncNR means "N continuation bytes of the current
 * character remain to be written". */
enum {
    EncStart = 0,   /* between characters; fetch the next code point */
    Enc1R    = 1,   /* one continuation byte left */
    Enc2R    = 2,   /* two continuation bytes left */
    Enc3R    = 3    /* three continuation bytes left */
};
166
167 static bool c32toutf8(
168     char           *restrict *restrict  p_outbuf,
169     size_t                   *restrict  p_outsz,
170     const char32_t *restrict *restrict  p_inbuf,
171     size_t                   *restrict  p_insz,
172     mbstate_t                *restrict  p_s
173 )
174 {
175     START_CONVERSION
176     while(*p_outsz) {
177         unsigned char outc;
178         switch(state) {
179         case Enc3R:
180             outc = 0x80 | ((accum >> 12) & 0x3F);
181             state = Enc2R;
182             break;
183
184         case Enc2R:
185             outc = 0x80 | ((accum >> 6) & 0x3F);
186             state = Enc1R;
187             break;
188
189         case Enc1R:
190             outc = 0x80 | (accum & 0x3F);
191             state = EncStart;
192             _PDCLIB_UNDEFINED(accum);
193             break;
194
195         case EncStart:
196             if(*p_insz == 0)
197                 FINISH(true);
198
199             accum  = **p_inbuf;
200             (*p_inbuf)++;
201             (*p_insz)--;
202
203             if(accum <= 0x7F) {
204                 outc = accum;
205                 state = EncStart;
206                 _PDCLIB_UNDEFINED(accum);
207             } else if(accum <= 0x7FF) {
208                 outc = 0xC0 | (accum >> 6);
209                 state = Enc1R;
210             } else if(accum <= 0xFFFF) {
211                 outc = 0xE0 | (accum >> 12);
212                 state = Enc2R;
213             } else if(accum <= 0x10FFFF) {
214                 outc = 0xF0 | (accum >> 18);
215                 state = Enc3R;
216             } else {
217                 FINISH(false);
218             }
219             break;
220         }
221
222         if(p_outbuf) {
223             **p_outbuf = outc;
224             (*p_outbuf)++; 
225         }
226         (*p_outsz)--;        
227     }
228     END_CONVERSION;
229 }
230
231 struct _PDCLIB_charcodec _PDCLIB_utf8_codec = {
232     .__mbstoc32s = utf8toc32,
233     .__c32stombs = c32toutf8,
234     .__mb_max    = 4,
235 };
236
237 #endif
238
239 #ifdef TEST
240 #include <_PDCLIB_test.h>
241
242 int main( void )
243 {
244 #ifndef REGTEST
245     // Valid conversion & back
246
247     static const char* input = "abcde" "\xDF\xBF" "\xEF\xBF\xBF" 
248                                "\xF4\x8F\xBF\xBF";
249
250     char32_t c32out[8];
251
252     char32_t *c32ptr = &c32out[0];
253     size_t    c32rem = 8;
254     char     *chrptr = (char*) &input[0];
255     size_t    chrrem = strlen(input);
256     mbstate_t mbs = { 0 };
257
258     TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs));
259     TESTCASE(c32rem == 0);
260     TESTCASE(chrrem == 0);
261     TESTCASE(c32ptr == &c32out[8]);
262     TESTCASE(chrptr == &input[strlen(input)]);
263     TESTCASE(c32out[0] == 'a' && c32out[1] == 'b' && c32out[2] == 'c' &&
264              c32out[3] == 'd' && c32out[4] == 'e' && c32out[5] == 0x7FF &&
265              c32out[6] == 0xFFFF && c32out[7] == 0x10FFFF);
266
267     char chrout[strlen(input)];
268     c32ptr = &c32out[0];
269     c32rem = 8;
270     chrptr = &chrout[0];
271     chrrem = strlen(input);
272     TESTCASE(c32toutf8(&chrptr, &chrrem, &c32ptr, &c32rem, &mbs));
273     TESTCASE(c32rem == 0);
274     TESTCASE(chrrem == 0);
275     TESTCASE(c32ptr == &c32out[8]);
276     TESTCASE(chrptr == &chrout[strlen(input)]);
277     TESTCASE(memcmp(chrout, input, strlen(input)) == 0);
278
279     // Multi-part conversion
280     static const char* mpinput = "\xDF\xBF";
281     c32ptr = &c32out[0];
282     c32rem = 8;
283     chrptr = &mpinput[0];
284     chrrem = 1;
285     TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs));
286     TESTCASE(c32ptr == &c32out[0]);
287     TESTCASE(c32rem == 8);
288     TESTCASE(chrptr == &mpinput[1]);
289     TESTCASE(chrrem == 0);
290     chrrem = 1;
291     TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs));
292     TESTCASE(c32ptr == &c32out[1]);
293     TESTCASE(c32rem == 7);
294     TESTCASE(chrptr == &mpinput[2]);
295     TESTCASE(chrrem == 0);
296
297     // Invalid conversions
298
299     // Overlong nuls
300     const char* nul2 = "\xC0\x80";
301     c32ptr = &c32out[0];
302     c32rem = 8;
303     chrptr = &nul2[0];
304     chrrem = 2;
305     TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);
306     memset(&mbs, 0, sizeof mbs);
307     const char* nul3 = "\xE0\x80\x80";
308     c32ptr = &c32out[0];
309     c32rem = 8;
310     chrptr = &nul3[0];
311     chrrem = 3;
312     TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);
313     memset(&mbs, 0, sizeof mbs);
314     const char* nul4 = "\xF0\x80\x80\x80";
315     c32ptr = &c32out[0];
316     c32rem = 8;
317     chrptr = &nul4[0];
318     chrrem = 4;
319     TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);
320
321     // Starting on a continuation
322     const char* cont = "\x80";
323     c32ptr = &c32out[0];
324     c32rem = 8;
325     chrptr = &cont[0];
326     chrrem = 1;
327     TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);
328 #endif
329     return TEST_RESULTS;
330 }
331
332 #endif
333