#!/usr/bin/python
# -*- coding: ascii -*-
# Unicode Data Converter
#
# This file is part of the Public Domain C Library (PDCLib).
# Permission is granted to use, modify, and / or redistribute at will.
"""
Converts the character information provided by Unicode in the UnicodeData.txt
file from the Unicode character database into a table for use by PDCLib.

Usage: Download the UnicodeData.txt file to the same directory as this script
and then run it. Both Python 2 and 3 are supported.

Download the data from
    ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt

We do some simple "run" compression, because characters in the Unicode Data file
tend to come in groups with the same properties.
"""
import os

# MUST BE KEPT SYNCHRONIZED WITH _PDCLIB_locale.h
BIT_ALPHA = 1
BIT_BLANK = 2
BIT_CNTRL = 4
BIT_GRAPH = 8
BIT_PUNCT = 16
BIT_SPACE = 32
BIT_LOWER = 64
BIT_UPPER = 128
BIT_DIGIT = 256
BIT_XDIGT = 512

# Category to bitfield mapping. Categories that are absent here (marks,
# format characters, surrogates, private use, ...) get no flags at all.
categories = {
    'Lu': BIT_ALPHA | BIT_GRAPH | BIT_UPPER,  # Uppercase
    'Ll': BIT_ALPHA | BIT_GRAPH | BIT_LOWER,  # Lowercase
    'Lt': BIT_ALPHA | BIT_GRAPH | BIT_UPPER,  # Title case. Upper?
    'Lm': BIT_ALPHA | BIT_GRAPH,              # Modifier. Case?
    'Lo': BIT_ALPHA | BIT_GRAPH,              # "Other" letter (e.g. Ideograph)
    'Nd': BIT_DIGIT | BIT_GRAPH,              # Decimal digit
    'Nl': BIT_GRAPH,                          # Letter-like numeric character
    'No': BIT_GRAPH,                          # Other numeric
    'Pc': BIT_PUNCT | BIT_GRAPH,              # Connecting punctuation
    'Pd': BIT_PUNCT | BIT_GRAPH,              # Dash punctuation
    'Ps': BIT_PUNCT | BIT_GRAPH,              # Opening punctuation
    'Pe': BIT_PUNCT | BIT_GRAPH,              # Closing punctuation
    'Pi': BIT_PUNCT | BIT_GRAPH,              # Opening quote
    'Pf': BIT_PUNCT | BIT_GRAPH,              # Closing quote
    'Po': BIT_PUNCT | BIT_GRAPH,              # Other punctuation
    'Sm': BIT_GRAPH,                          # Mathematical symbol
    'Sc': BIT_GRAPH,                          # Currency symbol
    'Sk': BIT_GRAPH,                          # Non-letterlike modifier symbol
    'So': BIT_GRAPH,                          # Other symbol
    'Zs': BIT_SPACE,                          # Non-zero-width space character
    'Zl': BIT_SPACE,                          # Line separator
    'Zp': BIT_SPACE,                          # Paragraph separator
    'Cc': BIT_CNTRL,                          # C0/C1 control codes
}

# Characters with special properties. An entry here REPLACES the flags derived
# from the general category, so it has to spell out every flag that applies.
special = {
    # Blank characters
    0x0020: BIT_SPACE | BIT_BLANK,              # space
    0x0009: BIT_SPACE | BIT_BLANK | BIT_CNTRL,  # tab (still a control code)
}
# LF, VT, FF and CR are category Cc in UnicodeData.txt, but C requires them to
# classify as white-space as well as control characters.
special.update((c, BIT_SPACE | BIT_CNTRL) for c in range(0x000A, 0x000E))
# Digits 0-9
special.update((c, BIT_XDIGT | BIT_DIGIT | BIT_GRAPH)
               for c in range(0x0030, 0x003A))
# A-F (hex uppercase)
special.update((c, BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_UPPER)
               for c in range(0x0041, 0x0047))
# a-f (hex lowercase)
special.update((c, BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_LOWER)
               for c in range(0x0061, 0x0067))

IN_FILE_NAME = 'UnicodeData.txt'
OUT_FILE_NAME = '_PDCLIB_unicodedata.c'

# Number of ';'-separated fields on every UnicodeData.txt line.
FIELD_COUNT = 15

# Name suffixes UnicodeData.txt uses to mark the two ends of a code point
# range that is listed as a pair of lines instead of one line per character.
RANGE_FIRST = ", First>"
RANGE_LAST = ", Last>"


class Group(object):
    """A run of consecutive code points sharing flags and case deltas.

    start       -- first code point of the run
    flags       -- BIT_* classification flags shared by the whole run
    upper_delta -- value added to a code point to get its uppercase mapping
    lower_delta -- value added to a code point to get its lowercase mapping
    chars       -- (code point, label) pairs, used only for the comment lines
    length      -- number of code points covered by the run
    """

    def __init__(self, start, flags, upper_delta, lower_delta):
        self.start = start
        self.flags = flags
        self.upper_delta = upper_delta
        self.lower_delta = lower_delta
        self.chars = []
        self.length = 0

    def add_char(self, num, label):
        """Extend the run by the single code point num."""
        self.chars.append((num, label))
        self.length += 1

    def add_range(self, first, last, label):
        """Extend the run by every code point in first..last (inclusive).

        Only one comment entry is recorded, so a range of tens of thousands
        of ideographs does not turn into as many comment lines.
        """
        self.chars.append((first, "%s (through 0x%X)" % (label, last)))
        self.length += last - first + 1

    def write_to_file(self, f):
        """Write the comment lines, then the C initializer for this run."""
        for char in self.chars:
            f.write("// %x %s\n" % char)
        # Field order must match the table header: lower before upper.
        f.write(" { 0x%X, \t0x%X, \t0x%X, \t%d, \t%d },\n" %
                (self.start, self.length, self.flags,
                 self.lower_delta, self.upper_delta))

    def next(self):
        """Return the code point that would directly continue this run."""
        return self.start + self.length


groups = []


def _add_run(first, last, upper_delta, lower_delta, bits, label):
    """Append first..last to the last group, or start a new group."""
    cur = groups[-1] if groups else None
    if cur is None or first != cur.next() or cur.flags != bits or \
            cur.upper_delta != upper_delta or cur.lower_delta != lower_delta:
        cur = Group(first, bits, upper_delta, lower_delta)
        groups.append(cur)

    if first == last:
        cur.add_char(first, label)
    else:
        cur.add_range(first, last, label)


def add_char(num, upper, lower, bits, label):
    """Record one character, merging it into the previous run if possible.

    num   -- code point
    upper -- code point of the uppercase mapping (num itself if none)
    lower -- code point of the lowercase mapping (num itself if none)
    bits  -- BIT_* flags
    label -- character name, written as a comment in the generated file
    """
    _add_run(num, num, upper - num, lower - num, bits, label)


def add_range(first, last, bits, label):
    """Record the code points first..last (inclusive), which have no case."""
    _add_run(first, last, 0, 0, bits, label)


def parse_line(line):
    """Parse one UnicodeData.txt line.

    Returns (num, name, category, upper_case, lower_case), where the case
    mappings fall back to num when the field is empty, or None for a blank
    or '#' comment line. Raises ValueError if the field count is wrong or a
    code point is not valid hexadecimal.
    """
    line = line.strip()
    if not line or line.startswith("#"):
        return None

    fields = line.split(";")
    if len(fields) != FIELD_COUNT:
        raise ValueError("expected %d fields, got %d: %r" %
                         (FIELD_COUNT, len(fields), line))

    num = int(fields[0], 16)
    upper_case = int(fields[12], 16) if fields[12] else num
    lower_case = int(fields[13], 16) if fields[13] else num
    return num, fields[1], fields[2], upper_case, lower_case


def load_unicode_data(lines):
    """Feed every line of UnicodeData.txt into the global groups list.

    lines may be any iterable of text lines (an open file works). Raises
    ValueError on malformed lines or unpaired First/Last range markers.
    """
    pending = None  # (code point, name) of an unmatched "First" marker

    for line in lines:
        parsed = parse_line(line)
        if parsed is None:
            continue
        num, name, category, upper_case, lower_case = parsed

        if pending is not None:
            if not name.endswith(RANGE_LAST):
                raise ValueError("range %r is not followed by its end" %
                                 (pending[1],))
            first, first_name = pending
            pending = None
            bits = categories.get(category, 0)
            if bits != 0:
                # "<CJK Ideograph, First>" -> "<CJK Ideograph>"
                add_range(first, num, bits,
                          first_name[:-len(RANGE_FIRST)] + ">")
            continue

        if name.endswith(RANGE_FIRST):
            pending = (num, name)
            continue
        if name.endswith(RANGE_LAST):
            raise ValueError("range end %r without a start" % (name,))

        bits = special.get(num, categories.get(category, 0))

        # A character with no flags that maps to itself carries no
        # information. Unmapped case fields default to num, so compare
        # against num rather than 0.
        if upper_case == num and lower_case == num and bits == 0:
            continue

        add_char(num, upper_case, lower_case, bits, name)

    if pending is not None:
        raise ValueError("range %r is not followed by its end" %
                         (pending[1],))


def write_table(out_file):
    """Write the generated C source for the global groups list to out_file."""
    out_file.write("""
/* Unicode Character Information ** AUTOMATICALLY GENERATED FILE **
 *
 * This file is part of the PDCLib public domain C Library, but is automatically
 * generated from the Unicode character data information file found at
 * ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
 *
 * As a result, the licensing that applies to that file also applies to this
 * file. The licensing which applies to the Unicode character data can be found
 * in Exhibit 1 of the Unicode Terms of Use, found at
 * http://www.unicode.org/copyright.html#Exhibit1
 */
 #ifndef REGTEST
 #include <_PDCLIB_locale.h>

const _PDCLIB_wcinfo_t _PDCLIB_wcinfo[] = {
// { value, \tlength, \tflags,\tlower,\tupper\t}, // name
 """)
    for g in groups:
        g.write_to_file(out_file)
    out_file.write('};\n\n')
    out_file.write("""
const size_t _PDCLIB_wcinfo_size = sizeof(_PDCLIB_wcinfo) / sizeof(_PDCLIB_wcinfo[0]);
#endif

#ifdef TEST
#include "_PDCLIB_test.h"
int main( void )
{
 return TEST_RESULTS;
}
#endif

""")


def main():
    """Convert UnicodeData.txt in the current directory to the C table."""
    # Parse everything before touching the output, so a bad input file
    # cannot destroy a previously generated table.
    with open(IN_FILE_NAME, 'r') as in_file:
        load_unicode_data(in_file)

    try:
        with open(OUT_FILE_NAME, 'w') as out_file:
            write_table(out_file)
    except BaseException:
        # Never leave a half-written table behind.
        if os.path.exists(OUT_FILE_NAME):
            os.remove(OUT_FILE_NAME)
        raise


if __name__ == "__main__":
    main()