X-Git-Url: https://pd.if.org/git/?p=pdclib.old;a=blobdiff_plain;f=functions%2Flocale%2FUnicodeData.py;h=4b9164a750afc2efd5abf0b701561421be6d69ec;hp=e31ec2e29f02fb2b5022a23a7cb0918273b20e40;hb=8208319e85b55e47e65de1c16d78681915057120;hpb=6d3cc0170165878b21ad1cacdb68bbde363581ab

diff --git a/functions/locale/UnicodeData.py b/functions/locale/UnicodeData.py
index e31ec2e..4b9164a 100644
--- a/functions/locale/UnicodeData.py
+++ b/functions/locale/UnicodeData.py
@@ -1,5 +1,5 @@
 #!/usr/bin/python
-# -*- coding: <encoding name> -*-
+# -*- coding: ascii -*-
 # Unicode Data Converter
 #
 # This file is part of the Public Domain C Library (PDCLib).
@@ -13,6 +13,9 @@ and then run it. Both Python 2 and 3 are supported.
 
 Download the data from
     ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
+
+We do some simple "run" compression, because characters in the Unicode Data file
+tend to come in groups with the same properties.
 """
 import os
 
@@ -49,14 +52,18 @@ categories = {
     'Sc': BIT_GRAPH,                            # Currency symbol
     'Sk': BIT_GRAPH,                            # Non-letterlike modifier symbol
     'So': BIT_GRAPH,                            # Other symbol
-    'Zs': BIT_SPACE | BIT_GRAPH | BIT_BLANK,    # Non-zero-width space character
-    'Zl': BIT_SPACE | BIT_GRAPH,                # Line separator
-    'Zp': BIT_SPACE | BIT_GRAPH,                # Paragraph separator
+    'Zs': BIT_SPACE,                            # Non-zero-width space character
+    'Zl': BIT_SPACE,                            # Line separator
+    'Zp': BIT_SPACE,                            # Paragraph separator
     'Cc': BIT_CNTRL,                            # C0/C1 control codes
 }
 
 # Characters with special properties
 special = {
+    # Blank characters
+    0x0020: BIT_SPACE | BIT_BLANK, # space
+    0x0009: BIT_SPACE | BIT_BLANK, # tab
+
     # Digits
     0x0030: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH,
     0x0031: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH,
@@ -87,9 +94,62 @@ special = {
     0x0066: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_LOWER,
 }
 
+class Group:
+    """Run of consecutive code points sharing flags and case-mapping deltas."""
+    def __init__(self, start, flags, upper_delta, lower_delta):
+        self.start, self.flags = start, flags
+        self.upper_delta, self.lower_delta = upper_delta, lower_delta
+        self.chars = []  # (code point, name) tuples, in insertion order
+
+    def add_char(self, num, label):
+        self.chars += [(num, label)]  # record one member of the run
+
+    def write_to_file(self, f):
+        # One "//" comment line per member, then the C initializer row.
+        f.writelines("// %x %s\n" % member for member in self.chars)
+        row = (self.start, len(self.chars), self.flags, self.lower_delta, self.upper_delta)
+        f.write("    { 0x%X, \t0x%X, \t0x%X, \t%d, \t%d },\n" % row)
+
+    def next(self):
+        # First code point past the end of this run.
+        return self.start + len(self.chars)
+
+groups = []
+
+def add_char(num, upper, lower, bits, label):
+    """Append code point num to the last group, or start a new group."""
+    to_upper, to_lower = upper - num, lower - num
+    last = groups[-1] if groups else None
+
+    # A run continues only when num is the next code point and its flags
+    # and both case deltas equal those of the group being extended.
+    extends = (last is not None and num == last.next() and
+               last.flags == bits and
+               last.upper_delta == to_upper and
+               last.lower_delta == to_lower)
+    if not extends:
+        last = Group(num, bits, to_upper, to_lower)
+        groups.append(last)
+    last.add_char(num, label)
+
 in_file  = open('UnicodeData.txt', 'r')
 out_file = open('_PDCLIB_unicodedata.c', 'w')
 try:
+    for line in in_file:
+        (num_hex, name, category, combining_class, bidi_class, decomposition,
+         numeric_type, numeric_digit, numeric_value, mirrored, u1name, iso_com,
+         upper_case_hex, lower_case_hex, title_case_hex) = line.split(";")
+        num        = int(num_hex, 16)
+        upper_case = int(upper_case_hex, 16) if upper_case_hex else num
+        lower_case = int(lower_case_hex, 16) if lower_case_hex else num
+        bits = special.get(num, categories.get(category, 0))
+        # Unmapped cases default to num, so an entry carries no information
+        # when both equal num and bits == 0; comparing with 0 missed these.
+        # NOTE(review): assumes lookups treat absent code points as plain.
+        if upper_case == num and lower_case == num and bits == 0:
+            continue
+        add_char(num, upper_case, lower_case, bits, name)
+
     out_file.write("""
 /* Unicode Character Information ** AUTOMATICALLY GENERATED FILE **
  *
@@ -105,27 +165,14 @@ try:
  #ifndef REGTEST
  #include <_PDCLIB_locale.h>
 
- _PDCLIB_wcinfo_t _PDCLIB_wcinfo[] = {
-//   { value,\tflags,\tlower,\tupper\t}, // name
+const _PDCLIB_wcinfo_t _PDCLIB_wcinfo[] = {
+//   { value, \tlength, \tflags,\tlower,\tupper\t}, // name
  """)
-    for line in in_file:
-        (num_hex, name, category, combining_class, bidi_class, decomposition,
-         numeric_type, numeric_digit, numeric_value, mirrored, u1name, iso_com, 
-         upper_case_hex, lower_case_hex, title_case_hex) = line.split(";")
-
-        num       = int(num_hex, 16)
-        upper_case = int(upper_case_hex, 16) if len(upper_case_hex) else num
-        lower_case = int(lower_case_hex, 16) if len(lower_case_hex) else num
-        bits = special.get(num, categories.get(category, 0))
-
-        if upper_case == 0 and lower_case == 0 and bits == 0:
-            continue
-
-        out_file.write("    { 0x%X,\t0x%X,\t0x%X,\t0x%X }, // %s\n" % (
-            num, bits, lower_case, upper_case, name))
+    for g in groups:
+        g.write_to_file(out_file)
     out_file.write('};\n\n')
     out_file.write("""
-size_t _PDCLIB_wcinfo_size = sizeof(_PDCLIB_wcinfo) / sizeof(_PDCLIB_wcinfo[0]);
+const size_t _PDCLIB_wcinfo_size = sizeof(_PDCLIB_wcinfo) / sizeof(_PDCLIB_wcinfo[0]);
 #endif
 
 #ifdef TEST