pd.if.org Git - pdclib/blob - functions/locale/UnicodeData.py

   1 #!/usr/bin/python
   2 # -*- coding: ascii -*-
   3 # Unicode Data Converter
   4 #
   5 # This file is part of the Public Domain C Library (PDCLib).
   6 # Permission is granted to use, modify, and / or redistribute at will.
   7 """
   8 Converts the character information provided by Unicode in the UnicodeData.txt
   9 file from the Unicode character database into a table for use by PDCLib.
  10
  11 Usage: Download the UnicodeData.txt file to the same directory as this script
  12 and then run it. Both Python 2 and 3 are supported.
  13
  14 Download the data from
  15     ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
  16
  17 We do some simple "run" compression, because characters in the Unicode Data file
  18 tend to come in groups with the same properties.
  19 """
  20 import os
  21
  22 # MUST BE KEPT SYNCHRONIZED WITH _PDCLIB_locale.h
  23 BIT_ALPHA =   1
  24 BIT_BLANK =   2
  25 BIT_CNTRL =   4
  26 BIT_GRAPH =   8
  27 BIT_PUNCT =  16
  28 BIT_SPACE =  32
  29 BIT_LOWER =  64
  30 BIT_UPPER = 128
  31 BIT_DIGIT = 256
  32 BIT_XDIGT = 512
  33
  34 # Category to bitfield mapping
  35 categories = {
  36     'Lu': BIT_ALPHA | BIT_GRAPH | BIT_UPPER,    # Uppercase
  37     'Ll': BIT_ALPHA | BIT_GRAPH | BIT_LOWER,    # Lowercase
  38     'Lt': BIT_ALPHA | BIT_GRAPH | BIT_UPPER,    # Title case. Upper?
  39     'Lm': BIT_ALPHA | BIT_GRAPH,                # Modifier. Case?
  40     'Lo': BIT_ALPHA | BIT_GRAPH,                # "Other" letter (e.g. Ideograph)
  41     'Nd': BIT_DIGIT | BIT_GRAPH,                # Decimal digit
  42     'Nl': BIT_GRAPH,                            # Letter-like numeric character
  43     'No': BIT_GRAPH,                            # Other numeric
  44     'Pc': BIT_PUNCT | BIT_GRAPH,                # Connecting punctuation
  45     'Pd': BIT_PUNCT | BIT_GRAPH,                # Dash punctuation
  46     'Ps': BIT_PUNCT | BIT_GRAPH,                # Opening punctuation
  47     'Pe': BIT_PUNCT | BIT_GRAPH,                # Closing punctuation
  48     'Pi': BIT_PUNCT | BIT_GRAPH,                # Opening quote
  49     'Pf': BIT_PUNCT | BIT_GRAPH,                # Closing quote
  50     'Po': BIT_PUNCT | BIT_GRAPH,                # Other punctuation
  51     'Sm': BIT_GRAPH,                            # Mathematical symbol
  52     'Sc': BIT_GRAPH,                            # Currency symbol
  53     'Sk': BIT_GRAPH,                            # Non-letterlike modifier symbol
  54     'So': BIT_GRAPH,                            # Other symbol
  55     'Zs': BIT_SPACE,                            # Non-zero-width space character
  56     'Zl': BIT_SPACE,                            # Line separator
  57     'Zp': BIT_SPACE,                            # Paragraph separator
  58     'Cc': BIT_CNTRL,                            # C0/C1 control codes
  59 }
  60
  61 # Characters with special properties
  62 special = {
  63     # Blank characters
  64     0x0020: BIT_SPACE | BIT_BLANK, # space
  65     0x0009: BIT_SPACE | BIT_BLANK, # tab
  66
  67     # Digits
  68     0x0030: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH,
  69     0x0031: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH,
  70     0x0032: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH,
  71     0x0033: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH,
  72     0x0034: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH,
  73     0x0035: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH,
  74     0x0036: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH,
  75     0x0037: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH,
  76     0x0038: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH,
  77     0x0039: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH,
  78
  79     # A-F (hex uppercase)
  80     0x0041: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_UPPER,
  81     0x0042: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_UPPER,
  82     0x0043: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_UPPER,
  83     0x0044: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_UPPER,
  84     0x0045: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_UPPER,
  85     0x0046: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_UPPER,
  86
  87
  88     # a-f (hex lowercase)
  89     0x0061: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_LOWER,
  90     0x0062: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_LOWER,
  91     0x0063: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_LOWER,
  92     0x0064: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_LOWER,
  93     0x0065: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_LOWER,
  94     0x0066: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_LOWER,
  95 }
  96
  97 class Group:
  98     def __init__(self, start, flags, upper_delta, lower_delta):
  99         self.start = start
 100         self.flags = flags
 101         self.upper_delta = upper_delta
 102         self.lower_delta = lower_delta
 103         self.chars = []
 104
 105     def add_char(self, num, label):
 106         self.chars.append((num, label))
 107
 108     def write_to_file(self, f):
 109         for char in self.chars:
 110             f.write("// %x %s\n" % char)
 111         f.write("    { 0x%X, \t0x%X, \t0x%X, \t%d, \t%d },\n" %
 112             (self.start, len(self.chars), self.flags, self.lower_delta, self.upper_delta))
 113
 114     def next(self):
 115         return self.start + len(self.chars)
 116
 117 groups = []
 118
 119 def add_char(num, upper, lower, bits, label):
 120     upper_delta = upper - num
 121     lower_delta = lower - num
 122
 123     if len(groups) != 0:
 124         cur = groups[-1]
 125         if num == cur.next() and cur.flags == bits and \
 126                 cur.upper_delta == upper_delta and \
 127                 cur.lower_delta == lower_delta:
 128             cur.add_char(num, label)
 129             return
 130
 131     g = Group(num, bits, upper_delta, lower_delta)
 132     g.add_char(num, label)
 133     groups.append(g)
 134
 135 in_file  = open('UnicodeData.txt', 'r')
 136 out_file = open('_PDCLIB_unicodedata.c', 'w')
 137 try:
 138     for line in in_file:
 139         (num_hex, name, category, combining_class, bidi_class, decomposition,
 140          numeric_type, numeric_digit, numeric_value, mirrored, u1name, iso_com,
 141          upper_case_hex, lower_case_hex, title_case_hex) = line.split(";")
 142
 143         num        = int(num_hex, 16)
 144         upper_case = int(upper_case_hex, 16) if len(upper_case_hex) else num
 145         lower_case = int(lower_case_hex, 16) if len(lower_case_hex) else num
 146         bits = special.get(num, categories.get(category, 0))
 147
 148         if upper_case == 0 and lower_case == 0 and bits == 0:
 149             continue
 150
 151         add_char(num, upper_case, lower_case, bits, name)
 152
 153     out_file.write("""
 154 /* Unicode Character Information ** AUTOMATICALLY GENERATED FILE **
 155  *
 156  * This file is part of the PDCLib public domain C Library, but is automatically
 157  * generated from the Unicode character data information file found at
 158  *   ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
 159  *
 160  * As a result, the licensing that applies to that file also applies to this
 161  * file. The licensing which applies to the Unicode character data can be found
 162  * in Exhibit 1 of the Unicode Terms of Use, found at
 163  *   http://www.unicode.org/copyright.html#Exhibit1
 164  */
 165  #ifndef REGTEST
 166  #include <_PDCLIB_locale.h>
 167
 168 const _PDCLIB_wcinfo_t _PDCLIB_wcinfo[] = {
 169 //   { value, \tlength, \tflags,\tlower,\tupper\t}, // name
 170  """)
 171     for g in groups:
 172         g.write_to_file(out_file)
 173     out_file.write('};\n\n')
 174     out_file.write("""
 175 const size_t _PDCLIB_wcinfo_size = sizeof(_PDCLIB_wcinfo) / sizeof(_PDCLIB_wcinfo[0]);
 176 #endif
 177
 178 #ifdef TEST
 179 #include "_PDCLIB_test.h"
 180 int main( void )
 181 {
 182     return TEST_RESULTS;
 183 }
 184 #endif
 185
 186 """)
 187 except:
 188     in_file.close()
 189     out_file.close()
 190     os.remove('_PDCLIB_unicodedata.c')
 191     raise
 192 else:
 193     in_file.close()
 194     out_file.close()