X-Git-Url: https://pd.if.org/git/?a=blobdiff_plain;f=functions%2Flocale%2FUnicodeData.py;h=6fe74cb8d90ee263a1681274bf2faa5fc2255548;hb=e83bbf1f89de742ebf07a11984be8d38fd407527;hp=b9057c22fdbc0cfea47424defafbb6326185973a;hpb=efab59138e688d20f8bb7cf6526c3474d617ebe5;p=pdclib diff --git a/functions/locale/UnicodeData.py b/functions/locale/UnicodeData.py index b9057c2..6fe74cb 100644 --- a/functions/locale/UnicodeData.py +++ b/functions/locale/UnicodeData.py @@ -7,6 +7,12 @@ """ Converts the character information provdied by Unicode in the UnicodeData.txt file from the Unicode character database into a table for use by PDCLib. + +Usage: Download the UnicodeData.txt file to the same directory as this script +and then run it. Both Python 2 and 3 are supported. + +Download the data from + ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt """ import os @@ -20,6 +26,7 @@ BIT_SPACE = 32 BIT_LOWER = 64 BIT_UPPER = 128 BIT_DIGIT = 256 +BIT_XDIGT = 512 # Category to bitfield mapping categories = { @@ -42,12 +49,48 @@ categories = { 'Sc': BIT_GRAPH, # Currency symbol 'Sk': BIT_GRAPH, # Non-letterlike modifier symbol 'So': BIT_GRAPH, # Other symbol - 'Zs': BIT_SPACE | BIT_GRAPH | BIT_BLANK, # Non-zero-width space character - 'Zl': BIT_SPACE | BIT_GRAPH, # Line separator - 'Zp': BIT_SPACE | BIT_GRAPH, # Paragraph separator + 'Zs': BIT_SPACE, # Non-zero-width space character + 'Zl': BIT_SPACE, # Line separator + 'Zp': BIT_SPACE, # Paragraph separator 'Cc': BIT_CNTRL, # C0/C1 control codes } +# Characters with special properties +special = { + # Blank characters + 0x0020: BIT_SPACE | BIT_BLANK, # space + 0x0009: BIT_SPACE | BIT_BLANK, # tab + + # Digits + 0x0030: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH, + 0x0031: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH, + 0x0032: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH, + 0x0033: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH, + 0x0034: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH, + 0x0035: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH, + 0x0036: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH, + 0x0037: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH, + 0x0038: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH, + 0x0039: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH, + + # A-F (hex uppercase) + 0x0041: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_UPPER, + 0x0042: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_UPPER, + 0x0043: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_UPPER, + 0x0044: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_UPPER, + 0x0045: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_UPPER, + 0x0046: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_UPPER, + + + # a-f (hex lowercase) + 0x0061: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_LOWER, + 0x0062: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_LOWER, + 0x0063: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_LOWER, + 0x0064: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_LOWER, + 0x0065: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_LOWER, + 0x0066: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_LOWER, +} + in_file = open('UnicodeData.txt', 'r') out_file = open('_PDCLIB_unicodedata.c', 'w') try: @@ -63,9 +106,10 @@ try: * in Exhibit 1 of the Unicode Terms of Use, found at * http://www.unicode.org/copyright.html#Exhibit1 */ + #ifndef REGTEST #include <_PDCLIB_locale.h> - _PDCLIB_wctype_t _PDCLIB_wctype[] = { + _PDCLIB_wcinfo_t _PDCLIB_wcinfo[] = { // { value,\tflags,\tlower,\tupper\t}, // name """) for line in in_file: @@ -76,7 +120,7 @@ try: num = int(num_hex, 16) upper_case = int(upper_case_hex, 16) if len(upper_case_hex) else num lower_case = int(lower_case_hex, 16) if len(lower_case_hex) else num - bits = categories.get(category, 0) + bits = special.get(num, categories.get(category, 0)) if upper_case == 0 and lower_case == 0 and bits == 0: continue @@ -84,7 +128,19 @@ try: out_file.write(" { 0x%X,\t0x%X,\t0x%X,\t0x%X }, // %s\n" % ( num, bits, lower_case, upper_case, name)) out_file.write('};\n\n') - out_file.write('size_t _PDCLIB_wctype_size = sizeof(_PDCLIB_wctype) / sizeof(_PDCLIB_wctype[0]);\n\n') + out_file.write(""" +size_t _PDCLIB_wcinfo_size = sizeof(_PDCLIB_wcinfo) / sizeof(_PDCLIB_wcinfo[0]); +#endif + +#ifdef TEST +#include <_PDCLIB_test.h> +int main( void ) +{ + return TEST_RESULTS; +} +#endif + +""") except: in_file.close() out_file.close()