From 4fe88a1c445105a96d08b808831d6fc7480a211d Mon Sep 17 00:00:00 2001 From: Owen Shepherd Date: Mon, 31 Dec 2012 01:38:06 +0000 Subject: [PATCH] PDCLIB-2 PDCLIB-12: If the internal character conversion routines are called with p_outbuf == NULL, then they will perform the conversion but *not* store the result. This facilitates the implementation of mbrlen and similar functions. Added (optional) UTF-16 variants of the conversion routines. If they are provided, PDCLib will use them whenever it needs a UTF-16 conversion (e.g. uchar.h UTF-16 routines, or wchar_t on win32) --- internals/_PDCLIB_encoding.h | 38 +++++++++++++++++++++++++++----- opt/basecodecs/_PDCLIB_ascii.c | 20 +++++++++++++---- opt/basecodecs/_PDCLIB_latin1.c | 20 +++++++++++++---- opt/basecodecs/_PDCLIB_utf8.c | 39 ++++++++++++++++++++------------- 4 files changed, 89 insertions(+), 28 deletions(-) diff --git a/internals/_PDCLIB_encoding.h b/internals/_PDCLIB_encoding.h index 01e60a1..0178b12 100644 --- a/internals/_PDCLIB_encoding.h +++ b/internals/_PDCLIB_encoding.h @@ -147,15 +147,21 @@ static inline _PDCLIB_size_t _PDCLIB_c32rtowc( #endif typedef struct { - /* Reads at most *_P_insz bytes from *_P_inbuf and writes the result into - * *_P_outbuf, writing at most *_P_outsz characters. Updates *_P_outbuf, - * *_P_outsz, *_P_inbuf, *_P_outsz with the resulting state + /* Reads at most *_P_insz code units from *_P_inbuf and writes the result + * into *_P_outbuf, writing at most *_P_outsz code units. Updates + * *_P_outbuf, *_P_outsz, *_P_inbuf, *_P_insz with the resulting state + * + * If _P_outbuf is NULL, then the input must be processed but no output + * generated. _P_outsz may be processed as normal. * * Returns true if the conversion completed successfully (i.e. one of * _P_outsize or _P_insize reached zero and no coding errors were * encountered), else return false. */ - _PDCLIB_bool (*__mbtoc32)( + + /* UCS-4 variants. Mandatory. 
*/ + + _PDCLIB_bool (*__mbstoc32s)( _PDCLIB_char32_t **_PDCLIB_restrict _P_outbuf, _PDCLIB_size_t *_PDCLIB_restrict _P_outsz, const char **_PDCLIB_restrict _P_inbuf, @@ -163,13 +169,35 @@ typedef struct { _PDCLIB_mbstate_t *_PDCLIB_restrict _P_ps ); - _PDCLIB_bool (*__c32tomb)( + _PDCLIB_bool (*__c32stombs)( char **_PDCLIB_restrict _P_outbuf, _PDCLIB_size_t *_PDCLIB_restrict _P_outsz, const _PDCLIB_char32_t **_PDCLIB_restrict _P_inbuf, _PDCLIB_size_t *_PDCLIB_restrict _P_insz, _PDCLIB_mbstate_t *_PDCLIB_restrict _P_ps ); + + /* UTF-16 variants; same as above except optional. + * + * If not provided, _PDCLib will internally synthesize on top of the UCS-4 + * variants above, albeit at a performance cost. + */ + + _PDCLIB_bool (*__mbstoc16s)( + _PDCLIB_char16_t **_PDCLIB_restrict _P_outbuf, + _PDCLIB_size_t *_PDCLIB_restrict _P_outsz, + const char **_PDCLIB_restrict _P_inbuf, + _PDCLIB_size_t *_PDCLIB_restrict _P_insz, + _PDCLIB_mbstate_t *_PDCLIB_restrict _P_ps + ); + + _PDCLIB_bool (*__c16stombs)( + char **_PDCLIB_restrict _P_outbuf, + _PDCLIB_size_t *_PDCLIB_restrict _P_outsz, + const _PDCLIB_char16_t **_PDCLIB_restrict _P_inbuf, + _PDCLIB_size_t *_PDCLIB_restrict _P_insz, + _PDCLIB_mbstate_t *_PDCLIB_restrict _P_ps + ); } _PDCLIB_charcodec; #endif diff --git a/opt/basecodecs/_PDCLIB_ascii.c b/opt/basecodecs/_PDCLIB_ascii.c index ca70a66..a705a7a 100644 --- a/opt/basecodecs/_PDCLIB_ascii.c +++ b/opt/basecodecs/_PDCLIB_ascii.c @@ -20,10 +20,13 @@ static bool asciitoc32( unsigned char c = **p_inbuf; if(c > 127) return false; - **p_outbuf = c; + + if(p_outbuf) { + **p_outbuf = c; + (*p_outbuf)++; + } (*p_inbuf)++; - (*p_outbuf)++; (*p_insz)--; (*p_outsz)--; } @@ -42,15 +45,24 @@ static bool c32toascii( char32_t c = **p_inbuf; if(c > 127) return false; - **p_outbuf = c; + + if(p_outbuf) { + **p_outbuf = c; + (*p_outbuf)++; + } (*p_inbuf)++; - (*p_outbuf)++; (*p_insz)--; (*p_outsz)--; } return true; } + +_PDCLIB_charcodec _PDCLIB_ascii_codec = { + .__mbstoc32s = 
asciitoc32, + .__c32stombs = c32toascii, +}; + #endif #ifdef TEST diff --git a/opt/basecodecs/_PDCLIB_latin1.c b/opt/basecodecs/_PDCLIB_latin1.c index 73844f4..f78574f 100644 --- a/opt/basecodecs/_PDCLIB_latin1.c +++ b/opt/basecodecs/_PDCLIB_latin1.c @@ -18,10 +18,13 @@ static bool latin1toc32( { while(*p_outsz && *p_insz) { unsigned char c = **p_inbuf; - **p_outbuf = c; + + if(p_outbuf) { + **p_outbuf = c; + (*p_outbuf)++; + } (*p_inbuf)++; - (*p_outbuf)++; (*p_insz)--; (*p_outsz)--; } @@ -40,15 +43,24 @@ static bool c32tolatin1( char32_t c = **p_inbuf; if(c > 255) return false; - **p_outbuf = c; + + if(p_outbuf) { + **p_outbuf = c; + (*p_outbuf)++; + } (*p_inbuf)++; - (*p_outbuf)++; (*p_insz)--; (*p_outsz)--; } return true; } + +_PDCLIB_charcodec _PDCLIB_latin1_codec = { + .__mbstoc32s = latin1toc32, + .__c32stombs = c32tolatin1, +}; + #endif #ifdef TEST diff --git a/opt/basecodecs/_PDCLIB_utf8.c b/opt/basecodecs/_PDCLIB_utf8.c index a88f964..8183aef 100644 --- a/opt/basecodecs/_PDCLIB_utf8.c +++ b/opt/basecodecs/_PDCLIB_utf8.c @@ -44,11 +44,12 @@ end_conversion: \ goto end_conversion; \ } while(0) -#define OUT32(_c) do { \ - (*((*p_outbuf)++)) = (_c); \ - (*p_outsz)--; \ - _PDCLIB_UNDEFINED(accum); \ - state = DecStart; \ +#define OUT32(_c) do { \ + if(p_outbuf) \ + (*((*p_outbuf)++)) = (_c); \ + (*p_outsz)--; \ + _PDCLIB_UNDEFINED(accum); \ + state = DecStart; \ } while(0) #define CHECK_CONTINUATION \ do { if((c & 0xC0) != 0x80) return false; } while(0) @@ -172,20 +173,20 @@ static bool c32toutf8( { START_CONVERSION while(*p_outsz) { - char *c8 = *p_outbuf; + unsigned char outc; switch(state) { case Enc3R: - *c8 = 0x80 | ((accum >> 12) & 0x3F); + outc = 0x80 | ((accum >> 12) & 0x3F); state = Enc2R; break; case Enc2R: - *c8 = 0x80 | ((accum >> 6) & 0x3F); + outc = 0x80 | ((accum >> 6) & 0x3F); state = Enc1R; break; case Enc1R: - *c8 = 0x80 | (accum & 0x3F); + outc = 0x80 | (accum & 0x3F); state = EncStart; _PDCLIB_UNDEFINED(accum); break; @@ -199,17 +200,17 @@ 
static bool c32toutf8( (*p_insz)--; if(accum <= 0x7F) { - *c8 = accum; + outc = accum; state = EncStart; _PDCLIB_UNDEFINED(accum); } else if(accum <= 0x7FF) { - *c8 = 0xC0 | (accum >> 6); + outc = 0xC0 | (accum >> 6); state = Enc1R; } else if(accum <= 0xFFFF) { - *c8 = 0xE0 | (accum >> 12); + outc = 0xE0 | (accum >> 12); state = Enc2R; } else if(accum <= 0x10FFFF) { - *c8 = 0xF0 | (accum >> 18); + outc = 0xF0 | (accum >> 18); state = Enc3R; } else { FINISH(false); @@ -217,12 +218,20 @@ static bool c32toutf8( break; } - - (*p_outbuf)++; + if(p_outbuf) { + **p_outbuf = outc; + (*p_outbuf)++; + } (*p_outsz)--; } END_CONVERSION; } + +_PDCLIB_charcodec _PDCLIB_utf8_codec = { + .__mbstoc32s = utf8toc32, + .__c32stombs = c32toutf8, +}; + #endif #ifdef TEST -- 2.40.0