class Group(object):
    """A run of consecutive code points that share the same properties.

    Every character in a group has the same flag bits and the same
    offsets ("deltas") to its upper-case and lower-case mappings, so the
    whole run can be written out as a single table entry.

    Derives from ``object`` so that it is a new-style class on Python 2
    as well as Python 3 (the script supports both).
    """

    def __init__(self, start, flags, upper_delta, lower_delta):
        """Create an empty group.

        start       -- code point of the first character in the run
        flags       -- flag bits shared by every character in the run
        upper_delta -- (upper-case mapping - code point) for every member
        lower_delta -- (lower-case mapping - code point) for every member
        """
        self.start = start
        self.flags = flags
        self.upper_delta = upper_delta
        self.lower_delta = lower_delta
        # (code point, label) pairs; the labels are only used to emit
        # comments in the output, the length of the list is the run length.
        self.chars = []

    def __repr__(self):
        return ("Group(start=0x%X, length=%d, flags=0x%X, "
                "upper_delta=%d, lower_delta=%d)" % (
                    self.start, len(self.chars), self.flags,
                    self.upper_delta, self.lower_delta))

    def add_char(self, num, label):
        """Append the character ``num`` (described by ``label``) to the run.

        The caller is responsible for checking that ``num`` really
        continues the run; no validation is done here.
        """
        self.chars.append((num, label))

    def write_to_file(self, f):
        """Write this group to ``f`` (any object with a ``write`` method).

        One ``// <hex code point> <label>`` comment line is written per
        member, followed by a single initializer line holding, in order:
        start, run length, flags, lower delta, upper delta.
        """
        for char in self.chars:
            f.write("// %x %s\n" % char)
        # Note the order: lower delta comes before upper delta.
        f.write(" { 0x%X, \t0x%X, \t0x%X, \t%d, \t%d },\n" %
                (self.start, len(self.chars), self.flags,
                 self.lower_delta, self.upper_delta))

    def next(self):
        """Return the code point that would directly continue this run."""
        return self.start + len(self.chars)


# Groups collected so far, in the order their first character was added.
groups = []


def add_char(num, upper, lower, bits, label, group_list=None):
    """Record one character, extending the most recent group when possible.

    The character joins the last group in ``group_list`` only if it
    directly follows that group's last code point and has the same flag
    bits and the same upper/lower case deltas; otherwise a new group is
    started and appended.

    num        -- code point of the character
    upper      -- code point of its upper-case mapping
    lower      -- code point of its lower-case mapping
    bits       -- flag bits for the character
    label      -- text used for the comment line in the output
    group_list -- list of Group objects to update; defaults to the
                  module-level ``groups`` list
    """
    if group_list is None:
        group_list = groups

    # Case mappings are stored relative to the code point so that
    # neighbouring characters with parallel mappings compare equal.
    upper_delta = upper - num
    lower_delta = lower - num

    if group_list:
        cur = group_list[-1]
        if (num == cur.next() and cur.flags == bits
                and cur.upper_delta == upper_delta
                and cur.lower_delta == lower_delta):
            cur.add_char(num, label)
            return

    g = Group(num, bits, upper_delta, lower_delta)
    g.add_char(num, label)
    group_list.append(g)