Compress the Unicode data into runs of consecutive code points that share the same properties. This results in a significant reduction in the size of the generated table.

[pdclib.old] / functions / locale / UnicodeData.py
diff --git a/functions/locale/UnicodeData.py b/functions/locale/UnicodeData.py

index eb0f3c33137f249a71b6b2cd70b2fff480b5257e..4b9164a750afc2efd5abf0b701561421be6d69ec 100644 (file)
--- a/functions/locale/UnicodeData.py
+++ b/functions/locale/UnicodeData.py
@@ -13,6 +13,9 @@ and then run it. Both Python 2 and 3 are supported.
  \r
  Download the data from\r
      ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt\r
+\r
+We do some simple "run" compression, because characters in the Unicode Data file\r
+tend to come in groups with the same properties.\r
  """\r
  import os\r
  \r
@@ -91,9 +94,62 @@ special = {
      0x0066: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_LOWER,\r
  }\r
  \r
+class Group:  # one run: consecutive code points with the same flags and case deltas\r
+    def __init__(self, start, flags, upper_delta, lower_delta):\r
+        self.start = start  # first code point of the run\r
+        self.flags = flags  # property bits common to every member\r
+        self.upper_delta = upper_delta  # upper-case mapping minus the code point\r
+        self.lower_delta = lower_delta  # lower-case mapping minus the code point\r
+        self.chars = []  # (code point, name) pairs; its length is the run length\r
+\r
+    def add_char(self, num, label):  # record one member; the caller checks it belongs here\r
+        self.chars.append((num, label))\r
+\r
+    def write_to_file(self, f):  # write the member comments, then one table entry for the run\r
+        for char in self.chars:  # one "// <hex code point> <name>" line per member\r
+            f.write("// %x %s\n" % char)\r
+        f.write("    { 0x%X, \t0x%X, \t0x%X, \t%d, \t%d },\n" %\r
+            (self.start, len(self.chars), self.flags, self.lower_delta, self.upper_delta))  # lower before upper\r
+\r
+    def next(self):  # code point that would extend this run by one\r
+        return self.start + len(self.chars)\r
+\r
+groups = []  # Group objects in creation order; add_char only ever extends the last one\r
+\r
+def add_char(num, upper, lower, bits, label):  # add a code point to the last run or start a new run\r
+    upper_delta = upper - num  # case mappings are kept relative to the code point\r
+    lower_delta = lower - num\r
+\r
+    if len(groups) != 0:\r
+        cur = groups[-1]  # only the most recent run is a candidate for extension\r
+        if num == cur.next() and cur.flags == bits and \\r
+                cur.upper_delta == upper_delta and \\r
+                cur.lower_delta == lower_delta:  # contiguous, same flags, same deltas\r
+            cur.add_char(num, label)\r
+            return\r
+\r
+    g = Group(num, bits, upper_delta, lower_delta)  # no match: open a new run at num\r
+    g.add_char(num, label)\r
+    groups.append(g)\r
+\r
  in_file  = open('UnicodeData.txt', 'r')\r
  out_file = open('_PDCLIB_unicodedata.c', 'w')\r
  try:\r
+    for line in in_file:\r
+        (num_hex, name, category, combining_class, bidi_class, decomposition,\r
+         numeric_type, numeric_digit, numeric_value, mirrored, u1name, iso_com, \r
+         upper_case_hex, lower_case_hex, title_case_hex) = line.split(";")\r
+\r
+        num        = int(num_hex, 16)\r
+        upper_case = int(upper_case_hex, 16) if len(upper_case_hex) else num\r
+        lower_case = int(lower_case_hex, 16) if len(lower_case_hex) else num\r
+        bits = special.get(num, categories.get(category, 0))\r
+\r
+        if upper_case == 0 and lower_case == 0 and bits == 0:\r
+            continue\r
+\r
+        add_char(num, upper_case, lower_case, bits, name)\r
+\r
      out_file.write("""\r
  /* Unicode Character Information ** AUTOMATICALLY GENERATED FILE **\r
   *\r
@@ -110,23 +166,10 @@ try:
   #include <_PDCLIB_locale.h>\r
  \r
  const _PDCLIB_wcinfo_t _PDCLIB_wcinfo[] = {\r
-//   { value,\tflags,\tlower,\tupper\t}, // name\r
+//   { value, \tlength, \tflags,\tlower,\tupper\t}, // name\r
   """)\r
-    for line in in_file:\r
-        (num_hex, name, category, combining_class, bidi_class, decomposition,\r
-         numeric_type, numeric_digit, numeric_value, mirrored, u1name, iso_com, \r
-         upper_case_hex, lower_case_hex, title_case_hex) = line.split(";")\r
-\r
-        num       = int(num_hex, 16)\r
-        upper_case = int(upper_case_hex, 16) if len(upper_case_hex) else num\r
-        lower_case = int(lower_case_hex, 16) if len(lower_case_hex) else num\r
-        bits = special.get(num, categories.get(category, 0))\r
-\r
-        if upper_case == 0 and lower_case == 0 and bits == 0:\r
-            continue\r
-\r
-        out_file.write("    { 0x%X,\t0x%X,\t0x%X,\t0x%X }, // %s\n" % (\r
-            num, bits, lower_case, upper_case, name))\r
+    for g in groups:\r
+        g.write_to_file(out_file)\r
      out_file.write('};\n\n')\r
      out_file.write("""\r
  const size_t _PDCLIB_wcinfo_size = sizeof(_PDCLIB_wcinfo) / sizeof(_PDCLIB_wcinfo[0]);\r