]> pd.if.org Git - pdclib/blob - opt/basecodecs/_PDCLIB_utf8.c
a4c19adb349a735c6715af6c66641eab61de66b3
[pdclib] / opt / basecodecs / _PDCLIB_utf8.c
1 /* UTF-8 codec
2
3    This file is part of the Public Domain C Library (PDCLib).
4    Permission is granted to use, modify, and / or redistribute at will.
5 */
6
7 #ifndef REGTEST
8 #include <stdbool.h>
9 #include <stdint.h>
10 #include <uchar.h>
11 #include <assert.h>
12 #include "_PDCLIB_encoding.h"
13
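/* Use of the mbstate (shared by the decoder and the encoder):
 *
 * _StUC[0] is the current decoding or encoding state
 * _St32[1] is the character accumulated so far (decoding), or the
 *          character whose remaining bytes are still to be written
 *          (encoding)
 */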
19
20 static bool utf8_mbsinit( const mbstate_t *p_s )
21 { return p_s->_StUC[0] == 0; }
22
/* Decoder states. "DecNBk" means: inside an N-byte sequence, waiting for
 * its k-th byte. DecStart must stay 0 so that a zeroed mbstate is the
 * initial state.
 */
enum {
    DecStart = 0,   /* between characters, expecting a lead byte */

    Dec2B2   = 1,   /* 2-byte sequence, expecting byte 2 (final) */

    Dec3B2   = 2,   /* 3-byte sequence, expecting byte 2 */
    Dec3B3   = 3,   /* 3-byte sequence, expecting byte 3 (final) */

    Dec4B2   = 4,   /* 4-byte sequence, expecting byte 2 */
    Dec4B3   = 5,   /* 4-byte sequence, expecting byte 3 */
    Dec4B4   = 6    /* 4-byte sequence, expecting byte 4 (final) */
};
35
/* Shorthand for the two mbstate fields used by the conversion routines.
 * Both expand to an lvalue and require a variable named p_s (pointer to
 * the mbstate_t) to be in scope at the point of use.
 */
#define state (p_s->_StUC[0])
#define accum (p_s->_St32[1])

/* Opens a conversion routine: declares the result that END_CONVERSION
 * returns. The expansion already ends in ';', so it is used without one.
 */
#define START_CONVERSION \
    bool          result = true;

/* Closes a conversion routine: provides the single exit label targeted by
 * FINISH and returns the result. Must be the last statement of a function
 * that began with START_CONVERSION.
 */
#define END_CONVERSION      \
end_conversion:             \
    return result

/* Leave the conversion routine immediately, returning _r. */
#define FINISH(_r) do {     \
    result = (_r);          \
    goto end_conversion;    \
} while(0)

/* Emit one decoded code point: store it if an output buffer was supplied
 * (p_outbuf may be NULL, in which case the character is only counted),
 * charge it against the output budget and return the decoder to its
 * initial state.
 */
#define OUT32(_c)  do {             \
    if(p_outbuf)                    \
        (*((*p_outbuf)++)) = (_c);  \
    (*p_outsz)--;                   \
    _PDCLIB_UNDEFINED(accum);       \
    state = DecStart;               \
} while(0)

/* Fail the conversion unless the current input byte `c` has the form
 * 10xxxxxx. Goes through FINISH so that every failure leaves through the
 * same exit point as the other error paths.
 */
#define CHECK_CONTINUATION \
    do { if((c & 0xC0) != 0x80) FINISH(false); } while(0)
61
62 static bool utf8toc32(
63     char32_t       *restrict *restrict   p_outbuf,
64     size_t                   *restrict   p_outsz,
65     const char     *restrict *restrict   p_inbuf,
66     size_t                   *restrict   p_insz,
67     mbstate_t                *restrict   p_s
68 )
69 {
70     START_CONVERSION
71     while(*p_outsz && *p_insz) {
72         unsigned char c = **p_inbuf;
73         char32_t      c32;
74         switch(state) {
75         case DecStart:
76             // 1 byte
77             if(c <= 0x7F) {
78                 OUT32(c);
79             } else if(c <= 0xDF) {
80                 accum = (c & 0x1F) << 6;
81                 state = Dec2B2;
82             } else if(c <= 0xEF) {
83                 accum = (c & 0x0F) << 12;
84                 state = Dec3B2;
85             } else if(c <= 0xF4) {
86                 accum = (c & 0x07) << 18;
87                 state = Dec4B2;
88             } else {
89                 // 5+byte sequence illegal
90                 FINISH(false);
91             }
92             break;
93
94         case Dec2B2:
95             CHECK_CONTINUATION;
96
97             c32 = accum | (c & 0x3F);
98
99             // Overlong sequence (e.g. NUL injection)
100             if(c32 <= 0x7F)
101                 FINISH(false);
102
103             OUT32(c32);
104             break;
105
106         case Dec3B2:
107             CHECK_CONTINUATION;
108             accum |= (c & 0x3F) << 6;
109             state = Dec3B3;
110             break;
111
112         case Dec3B3:
113             CHECK_CONTINUATION;
114
115             c32 = accum | (c & 0x3F);
116
117             // Overlong
118             if(c32 <= 0x07FF)
119                 FINISH(false);
120
121             // Surrogate
122             if(c32 >= 0xD800 && c32 <= 0xDFFF)
123                 FINISH(false);
124
125             OUT32(c32);
126             break;
127
128         case Dec4B2:
129             CHECK_CONTINUATION;
130             accum |= (c & 0x3F) << 12;
131             state = Dec4B3;
132             break;
133
134         case Dec4B3:
135             CHECK_CONTINUATION;
136             accum |= (c & 0x3F) << 6;
137             state = Dec4B4;
138             break;
139
140         case Dec4B4:
141             CHECK_CONTINUATION;
142
143             c32 = accum | (c & 0x3F);
144
145             // Overlong
146             if(c32 <= 0xFFFF) FINISH(false);
147
148             // Not in Unicode
149             if(c32 > 0x10FFFF) FINISH(false);
150
151             OUT32(c32);
152             break;
153
154         default:
155             assert(!"Invalid state");
156         }
157
158         (*p_inbuf)++;
159         (*p_insz)--;
160     }
161     END_CONVERSION;
162 }
163
/* Encoder states. "EncNR" means: N trailing bytes of the current
 * character remain to be written. EncStart must stay 0 so that a zeroed
 * mbstate is the initial state.
 */
enum {
    EncStart = 0,   /* between characters, ready to read the next one */
    Enc1R    = 1,   /* one trailing byte left:  bits 5..0   */
    Enc2R    = 2,   /* two trailing bytes left: bits 11..6 come next */
    Enc3R    = 3    /* three trailing bytes left: bits 17..12 come next */
};
170
171 static bool c32toutf8(
172     char           *restrict *restrict  p_outbuf,
173     size_t                   *restrict  p_outsz,
174     const char32_t *restrict *restrict  p_inbuf,
175     size_t                   *restrict  p_insz,
176     mbstate_t                *restrict  p_s
177 )
178 {
179     START_CONVERSION
180     while(*p_outsz) {
181         unsigned char outc = 0;
182         switch(state) {
183         case Enc3R:
184             outc = 0x80 | ((accum >> 12) & 0x3F);
185             state = Enc2R;
186             break;
187
188         case Enc2R:
189             outc = 0x80 | ((accum >> 6) & 0x3F);
190             state = Enc1R;
191             break;
192
193         case Enc1R:
194             outc = 0x80 | (accum & 0x3F);
195             state = EncStart;
196             _PDCLIB_UNDEFINED(accum);
197             break;
198
199         case EncStart:
200             if(*p_insz == 0)
201                 FINISH(true);
202
203             accum  = **p_inbuf;
204             (*p_inbuf)++;
205             (*p_insz)--;
206
207             if(accum <= 0x7F) {
208                 outc = accum;
209                 state = EncStart;
210                 _PDCLIB_UNDEFINED(accum);
211             } else if(accum <= 0x7FF) {
212                 outc = 0xC0 | (accum >> 6);
213                 state = Enc1R;
214             } else if(accum <= 0xFFFF) {
215                 outc = 0xE0 | (accum >> 12);
216                 state = Enc2R;
217             } else if(accum <= 0x10FFFF) {
218                 outc = 0xF0 | (accum >> 18);
219                 state = Enc3R;
220             } else {
221                 FINISH(false);
222             }
223             break;
224         }
225
226         if(p_outbuf) {
227             **p_outbuf = outc;
228             (*p_outbuf)++;
229         }
230         (*p_outsz)--;
231     }
232     END_CONVERSION;
233 }
234
235 const struct _PDCLIB_charcodec_t _PDCLIB_utf8_codec = {
236     .__mbsinit   = utf8_mbsinit,
237     .__mbstoc32s = utf8toc32,
238     .__c32stombs = c32toutf8,
239     .__mb_max    = 4,
240 };
241
242 #endif
243
244 #ifdef TEST
245 #include "_PDCLIB_test.h"
246
247 int main( void )
248 {
249 #ifndef REGTEST
250     // Valid conversion & back
251
252     static const char* input = "abcde" "\xDF\xBF" "\xEF\xBF\xBF"
253                                "\xF4\x8F\xBF\xBF";
254
255     char32_t c32out[8];
256
257     char32_t   *c32ptr = &c32out[0];
258     size_t      c32rem = 8;
259     const char *chrptr = (char*) &input[0];
260     size_t      chrrem = strlen(input);
261     mbstate_t   mbs = { 0 };
262
263     TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs));
264     TESTCASE(c32rem == 0);
265     TESTCASE(chrrem == 0);
266     TESTCASE(c32ptr == &c32out[8]);
267     TESTCASE(chrptr == &input[strlen(input)]);
268     TESTCASE(c32out[0] == 'a' && c32out[1] == 'b' && c32out[2] == 'c' &&
269              c32out[3] == 'd' && c32out[4] == 'e' && c32out[5] == 0x7FF &&
270              c32out[6] == 0xFFFF && c32out[7] == 0x10FFFF);
271
272     char chrout[strlen(input)];
273     c32ptr = &c32out[0];
274     c32rem = 8;
275     chrptr = &chrout[0];
276     chrrem = strlen(input);
277     TESTCASE(c32toutf8(&chrptr, &chrrem, &c32ptr, &c32rem, &mbs));
278     TESTCASE(c32rem == 0);
279     TESTCASE(chrrem == 0);
280     TESTCASE(c32ptr == &c32out[8]);
281     TESTCASE(chrptr == &chrout[strlen(input)]);
282     TESTCASE(memcmp(chrout, input, strlen(input)) == 0);
283
284     // Multi-part conversion
285     static const char* mpinput = "\xDF\xBF";
286     c32ptr = &c32out[0];
287     c32rem = 8;
288     chrptr = &mpinput[0];
289     chrrem = 1;
290     TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs));
291     TESTCASE(c32ptr == &c32out[0]);
292     TESTCASE(c32rem == 8);
293     TESTCASE(chrptr == &mpinput[1]);
294     TESTCASE(chrrem == 0);
295     chrrem = 1;
296     TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs));
297     TESTCASE(c32ptr == &c32out[1]);
298     TESTCASE(c32rem == 7);
299     TESTCASE(chrptr == &mpinput[2]);
300     TESTCASE(chrrem == 0);
301
302     // Invalid conversions
303
304     // Overlong nuls
305     const char* nul2 = "\xC0\x80";
306     c32ptr = &c32out[0];
307     c32rem = 8;
308     chrptr = &nul2[0];
309     chrrem = 2;
310     TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);
311     memset(&mbs, 0, sizeof mbs);
312     const char* nul3 = "\xE0\x80\x80";
313     c32ptr = &c32out[0];
314     c32rem = 8;
315     chrptr = &nul3[0];
316     chrrem = 3;
317     TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);
318     memset(&mbs, 0, sizeof mbs);
319     const char* nul4 = "\xF0\x80\x80\x80";
320     c32ptr = &c32out[0];
321     c32rem = 8;
322     chrptr = &nul4[0];
323     chrrem = 4;
324     TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);
325
326     // Starting on a continuation
327     const char* cont = "\x80";
328     c32ptr = &c32out[0];
329     c32rem = 8;
330     chrptr = &cont[0];
331     chrrem = 1;
332     TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);
333 #endif
334     return TEST_RESULTS;
335 }
336
337 #endif
338