#!/usr/bin/python\r
-# -*- coding: <encoding name> -*-\r
+# -*- coding: ascii -*-\r
# Unicode Data Converter\r
#\r
# This file is part of the Public Domain C Library (PDCLib).\r
\r
Download the data from\r
ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt\r
+\r
+We do some simple "run" compression, because characters in the Unicode Data file\r
+tend to come in groups with the same properties.\r
"""\r
import os\r
\r
0x0066: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_LOWER,\r
}\r
\r
+class Group:
+    """One row of the generated table, plus the characters folded into it.
+
+    Holds a starting code point, a flags value, the upper/lower case deltas,
+    and a list of (code point, label) pairs recorded through add_char().
+    """
+
+    def __init__(self, start, flags, upper_delta, lower_delta):
+        """Create an empty group beginning at code point *start*."""
+        self.chars = []
+        self.start = start
+        self.flags = flags
+        self.lower_delta = lower_delta
+        self.upper_delta = upper_delta
+
+    def add_char(self, num, label):
+        """Record code point *num* and its *label* as a member of the group."""
+        self.chars.append((num, label))
+
+    def write_to_file(self, f):
+        """Write the group to *f*.
+
+        Emits one "// <hex code point> <label>" comment line per recorded
+        character, followed by a single table row holding the start, the
+        number of recorded characters, the flags, and the lower and upper
+        deltas (in that order).
+        """
+        for num, label in self.chars:
+            f.write("// %x %s\n" % (num, label))
+        row = (self.start, len(self.chars), self.flags,
+               self.lower_delta, self.upper_delta)
+        f.write(" { 0x%X, \t0x%X, \t0x%X, \t%d, \t%d },\n" % row)
+
+    def next(self):
+        """Return the start plus the number of characters recorded so far."""
+        return len(self.chars) + self.start
+\r
+groups = []\r
+\r
+def add_char(num, upper, lower, bits, label):
+    """Append a character to the module-level ``groups`` list.
+
+    The character joins the most recent group when its code point equals
+    that group's next() value and its flags and upper/lower deltas (the
+    case mapping minus *num*) all match; otherwise a new Group starting at
+    *num* is appended.
+    """
+    signature = (bits, upper - num, lower - num)
+
+    if groups:
+        last = groups[-1]
+        last_signature = (last.flags, last.upper_delta, last.lower_delta)
+        if num == last.next() and last_signature == signature:
+            last.add_char(num, label)
+            return
+
+    # No group to extend: start a fresh one seeded with this character.
+    fresh = Group(num, *signature)
+    fresh.add_char(num, label)
+    groups.append(fresh)
+\r
in_file = open('UnicodeData.txt', 'r')\r
out_file = open('_PDCLIB_unicodedata.c', 'w')\r
try:\r
+ for line in in_file:\r
+ (num_hex, name, category, combining_class, bidi_class, decomposition,\r
+ numeric_type, numeric_digit, numeric_value, mirrored, u1name, iso_com, \r
+ upper_case_hex, lower_case_hex, title_case_hex) = line.split(";")\r
+\r
+ num = int(num_hex, 16)\r
+ upper_case = int(upper_case_hex, 16) if len(upper_case_hex) else num\r
+ lower_case = int(lower_case_hex, 16) if len(lower_case_hex) else num\r
+ bits = special.get(num, categories.get(category, 0))\r
+\r
+ if upper_case == 0 and lower_case == 0 and bits == 0:\r
+ continue\r
+\r
+ add_char(num, upper_case, lower_case, bits, name)\r
+\r
out_file.write("""\r
/* Unicode Character Information ** AUTOMATICALLY GENERATED FILE **\r
*\r
#ifndef REGTEST\r
#include <_PDCLIB_locale.h>\r
\r
- _PDCLIB_wcinfo_t _PDCLIB_wcinfo[] = {\r
-// { value,\tflags,\tlower,\tupper\t}, // name\r
+const _PDCLIB_wcinfo_t _PDCLIB_wcinfo[] = {\r
+// { value, \tlength, \tflags,\tlower,\tupper\t}, // name\r
""")\r
- for line in in_file:\r
- (num_hex, name, category, combining_class, bidi_class, decomposition,\r
- numeric_type, numeric_digit, numeric_value, mirrored, u1name, iso_com, \r
- upper_case_hex, lower_case_hex, title_case_hex) = line.split(";")\r
-\r
- num = int(num_hex, 16)\r
- upper_case = int(upper_case_hex, 16) if len(upper_case_hex) else num\r
- lower_case = int(lower_case_hex, 16) if len(lower_case_hex) else num\r
- bits = special.get(num, categories.get(category, 0))\r
-\r
- if upper_case == 0 and lower_case == 0 and bits == 0:\r
- continue\r
-\r
- out_file.write(" { 0x%X,\t0x%X,\t0x%X,\t0x%X }, // %s\n" % (\r
- num, bits, lower_case, upper_case, name))\r
+ for g in groups:\r
+ g.write_to_file(out_file)\r
out_file.write('};\n\n')\r
out_file.write("""\r
-size_t _PDCLIB_wcinfo_size = sizeof(_PDCLIB_wcinfo) / sizeof(_PDCLIB_wcinfo[0]);\r
+const size_t _PDCLIB_wcinfo_size = sizeof(_PDCLIB_wcinfo) / sizeof(_PDCLIB_wcinfo[0]);\r
#endif\r
\r
#ifdef TEST\r