]> pd.if.org Git - pdclib.old/blob - opt/basecodecs/_PDCLIB_utf8.c
a88f964077457c04f4d3cdba420b86747b82bc2e
[pdclib.old] / opt / basecodecs / _PDCLIB_utf8.c
1 /* UTF-8 codec
2
3    This file is part of the Public Domain C Library (PDCLib).
4    Permission is granted to use, modify, and / or redistribute at will.
5 */
6
7 #ifndef REGTEST
8 #include <stdbool.h>
9 #include <stdint.h>
10 #include <uchar.h>
11 #include <assert.h>
12
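/* Use of the mbstate:
 *
 * _StUC[0] is the current conversion state (a Dec* value while decoding,
 *          an Enc* value while encoding)
 * _St32[1] is the character accumulated so far (decoding) or the character
 *          whose remaining bytes are still to be written (encoding)
 */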
18
/* Decoder states.  DecNBm means: inside an N-byte sequence, waiting for
 * byte number m of it. */
enum {
    DecStart = 0,   /* waiting for the first byte of a sequence */

    Dec2B2   = 1,   /* 2-byte sequence, waiting for byte 2 */

    Dec3B2   = 2,   /* 3-byte sequence, waiting for byte 2 */
    Dec3B3   = 3,   /* 3-byte sequence, waiting for byte 3 */

    Dec4B2   = 4,   /* 4-byte sequence, waiting for byte 2 */
    Dec4B3   = 5,   /* 4-byte sequence, waiting for byte 3 */
    Dec4B4   = 6    /* 4-byte sequence, waiting for byte 4 */
};
31
/* Shorthand for the two mbstate_t fields used by this codec.  Both expect a
 * pointer named p_s to the conversion state to be in scope. */
#define state (p_s->_StUC[0])
#define accum (p_s->_St32[1])

/* Declares the result flag of a converter; goes at the top of its body. */
#define START_CONVERSION    bool result = true;

/* Common exit point of a converter; FINISH() jumps here. */
#define END_CONVERSION      \
end_conversion:             \
    return result

/* Leave the converter immediately, returning _r. */
#define FINISH(_r) do {     \
    result = (_r);          \
    goto end_conversion;    \
} while(0)

/* Store one decoded character, use up one output slot and put the decoder
 * back into its initial state. */
#define OUT32(_c)  do {         \
    *(*p_outbuf)++ = (_c);      \
    --*p_outsz;                 \
    _PDCLIB_UNDEFINED(accum);   \
    state = DecStart;           \
} while(0)

/* Fail the conversion unless the current byte c has the form 10xxxxxx. */
#define CHECK_CONTINUATION \
    do { if((c & 0xC0) != 0x80) FINISH(false); } while(0)
55
56 static bool utf8toc32(
57     char32_t       **restrict   p_outbuf,
58     size_t          *restrict   p_outsz,
59     const char     **restrict   p_inbuf,
60     size_t          *restrict   p_insz,
61     mbstate_t       *restrict   p_s
62 )
63 {
64     START_CONVERSION
65     while(*p_outsz && *p_insz) {
66         unsigned char c = **p_inbuf;
67         char32_t      c32;
68         switch(state) {
69         case DecStart:
70             // 1 byte 
71             if(c <= 0x7F) {
72                 OUT32(c);
73             } else if(c <= 0xDF) {
74                 accum = (c & 0x1F) << 6;
75                 state = Dec2B2;
76             } else if(c <= 0xEF) {
77                 accum = (c & 0x0F) << 12;
78                 state = Dec3B2;
79             } else if(c <= 0xF4) {
80                 accum = (c & 0x07) << 18;
81                 state = Dec4B2;
82             } else {
83                 // 5+byte sequence illegal
84                 FINISH(false);
85             }
86             break;
87
88         case Dec2B2:
89             CHECK_CONTINUATION;
90
91             c32 = accum | (c & 0x3F);
92
93             // Overlong sequence (e.g. NUL injection)
94             if(c32 <= 0x7F)
95                 FINISH(false);
96
97             OUT32(c32);
98             break;
99
100         case Dec3B2:
101             CHECK_CONTINUATION;
102             accum |= (c & 0x3F) << 6;
103             state = Dec3B3;
104             break;
105
106         case Dec3B3:
107             CHECK_CONTINUATION;
108
109             c32 = accum | (c & 0x3F);
110
111             // Overlong
112             if(c32 <= 0x07FF)
113                 FINISH(false);
114
115             // Surrogate
116             if(c32 >= 0xD800 && c32 <= 0xDFFF)
117                 FINISH(false);
118
119             OUT32(c32);
120             break;
121
122         case Dec4B2:
123             CHECK_CONTINUATION;
124             accum |= (c & 0x3F) << 12;
125             state = Dec4B3;
126             break;
127
128         case Dec4B3:
129             CHECK_CONTINUATION;
130             accum |= (c & 0x3F) << 6;
131             state = Dec4B4;
132             break;
133
134         case Dec4B4:
135             CHECK_CONTINUATION;
136
137             c32 = accum | (c & 0x3F);
138
139             // Overlong
140             if(c32 <= 0xFFFF) FINISH(false);
141
142             // Not in Unicode
143             if(c32 > 0x10FFFF) FINISH(false);
144
145             OUT32(c32);
146             break;
147
148         default:
149             assert(!"Invalid state");
150         }
151
152         (*p_inbuf)++;
153         (*p_insz)--; 
154     }
155     END_CONVERSION;
156 }
157
/* Encoder states.  EncNR means: N continuation bytes of the current
 * character remain to be written. */
enum {
    EncStart = 0,   /* ready to take the next input character */
    Enc1R    = 1,   /* 1 continuation byte remaining */
    Enc2R    = 2,   /* 2 continuation bytes remaining */
    Enc3R    = 3    /* 3 continuation bytes remaining */
};
164
165 static bool c32toutf8(
166     char           **restrict  p_outbuf,
167     size_t          *restrict  p_outsz,
168     const char32_t **restrict  p_inbuf,
169     size_t          *restrict  p_insz,
170     mbstate_t       *restrict  p_s
171 )
172 {
173     START_CONVERSION
174     while(*p_outsz) {
175         char     *c8 =  *p_outbuf;
176         switch(state) {
177         case Enc3R:
178             *c8 = 0x80 | ((accum >> 12) & 0x3F);
179             state = Enc2R;
180             break;
181
182         case Enc2R:
183             *c8 = 0x80 | ((accum >> 6) & 0x3F);
184             state = Enc1R;
185             break;
186
187         case Enc1R:
188             *c8 = 0x80 | (accum & 0x3F);
189             state = EncStart;
190             _PDCLIB_UNDEFINED(accum);
191             break;
192
193         case EncStart:
194             if(*p_insz == 0)
195                 FINISH(true);
196
197             accum  = **p_inbuf;
198             (*p_inbuf)++;
199             (*p_insz)--;
200
201             if(accum <= 0x7F) {
202                 *c8 = accum;
203                 state = EncStart;
204                 _PDCLIB_UNDEFINED(accum);
205             } else if(accum <= 0x7FF) {
206                 *c8 = 0xC0 | (accum >> 6);
207                 state = Enc1R;
208             } else if(accum <= 0xFFFF) {
209                 *c8 = 0xE0 | (accum >> 12);
210                 state = Enc2R;
211             } else if(accum <= 0x10FFFF) {
212                 *c8 = 0xF0 | (accum >> 18);
213                 state = Enc3R;
214             } else {
215                 FINISH(false);
216             }
217             break;
218         }
219
220
221         (*p_outbuf)++; 
222         (*p_outsz)--;        
223     }
224     END_CONVERSION;
225 }
226 #endif
227
228 #ifdef TEST
229 #include <_PDCLIB_test.h>
230
231 int main( void )
232 {
233 #ifndef REGTEST
234     // Valid conversion & back
235
236     static const char* input = "abcde" "\xDF\xBF" "\xEF\xBF\xBF" 
237                                "\xF4\x8F\xBF\xBF";
238
239     char32_t c32out[8];
240
241     char32_t *c32ptr = &c32out[0];
242     size_t    c32rem = 8;
243     char     *chrptr = (char*) &input[0];
244     size_t    chrrem = strlen(input);
245     mbstate_t mbs = { 0 };
246
247     TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs));
248     TESTCASE(c32rem == 0);
249     TESTCASE(chrrem == 0);
250     TESTCASE(c32ptr == &c32out[8]);
251     TESTCASE(chrptr == &input[strlen(input)]);
252     TESTCASE(c32out[0] == 'a' && c32out[1] == 'b' && c32out[2] == 'c' &&
253              c32out[3] == 'd' && c32out[4] == 'e' && c32out[5] == 0x7FF &&
254              c32out[6] == 0xFFFF && c32out[7] == 0x10FFFF);
255
256     char chrout[strlen(input)];
257     c32ptr = &c32out[0];
258     c32rem = 8;
259     chrptr = &chrout[0];
260     chrrem = strlen(input);
261     TESTCASE(c32toutf8(&chrptr, &chrrem, &c32ptr, &c32rem, &mbs));
262     TESTCASE(c32rem == 0);
263     TESTCASE(chrrem == 0);
264     TESTCASE(c32ptr == &c32out[8]);
265     TESTCASE(chrptr == &chrout[strlen(input)]);
266     TESTCASE(memcmp(chrout, input, strlen(input)) == 0);
267
268     // Multi-part conversion
269     static const char* mpinput = "\xDF\xBF";
270     c32ptr = &c32out[0];
271     c32rem = 8;
272     chrptr = &mpinput[0];
273     chrrem = 1;
274     TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs));
275     TESTCASE(c32ptr == &c32out[0]);
276     TESTCASE(c32rem == 8);
277     TESTCASE(chrptr == &mpinput[1]);
278     TESTCASE(chrrem == 0);
279     chrrem = 1;
280     TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs));
281     TESTCASE(c32ptr == &c32out[1]);
282     TESTCASE(c32rem == 7);
283     TESTCASE(chrptr == &mpinput[2]);
284     TESTCASE(chrrem == 0);
285
286     // Invalid conversions
287
288     // Overlong nuls
289     const char* nul2 = "\xC0\x80";
290     c32ptr = &c32out[0];
291     c32rem = 8;
292     chrptr = &nul2[0];
293     chrrem = 2;
294     TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);
295     memset(&mbs, 0, sizeof mbs);
296     const char* nul3 = "\xE0\x80\x80";
297     c32ptr = &c32out[0];
298     c32rem = 8;
299     chrptr = &nul3[0];
300     chrrem = 3;
301     TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);
302     memset(&mbs, 0, sizeof mbs);
303     const char* nul4 = "\xF0\x80\x80\x80";
304     c32ptr = &c32out[0];
305     c32rem = 8;
306     chrptr = &nul4[0];
307     chrrem = 4;
308     TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);
309
310     // Starting on a continuation
311     const char* cont = "\x80";
312     c32ptr = &c32out[0];
313     c32rem = 8;
314     chrptr = &cont[0];
315     chrrem = 1;
316     TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);
317 #endif
318     return TEST_RESULTS;
319 }
320
321 #endif
322