]> pd.if.org Git - pdclib/blob - functions/locale/UnicodeData.py
dos2unix
[pdclib] / functions / locale / UnicodeData.py
1 #!/usr/bin/python
2 # -*- coding: ascii -*-
3 # Unicode Data Converter
4 #
5 # This file is part of the Public Domain C Library (PDCLib).
6 # Permission is granted to use, modify, and / or redistribute at will.
7 """
8 Converts the character information provided by Unicode in the UnicodeData.txt
9 file from the Unicode character database into a table for use by PDCLib.
10
11 Usage: Download the UnicodeData.txt file to the same directory as this script 
12 and then run it. Both Python 2 and 3 are supported.
13
14 Download the data from
15     ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
16
17 We do some simple "run" compression, because characters in the Unicode Data file
18 tend to come in groups with the same properties.
19 """
20 import os
21
# Character class bits, one per <ctype.h>/<wctype.h> classification.
# MUST BE KEPT SYNCHRONIZED WITH _PDCLIB_locale.h
BIT_ALPHA =   1
BIT_BLANK =   2
BIT_CNTRL =   4
BIT_GRAPH =   8
BIT_PUNCT =  16
BIT_SPACE =  32
BIT_LOWER =  64
BIT_UPPER = 128
BIT_DIGIT = 256
BIT_XDIGT = 512

# Category to bitfield mapping.
# Keys are the two-letter General_Category values found in the third field of
# UnicodeData.txt; categories that are not listed here map to no bits at all.
categories = {
    'Lu': BIT_ALPHA | BIT_GRAPH | BIT_UPPER,    # Uppercase
    'Ll': BIT_ALPHA | BIT_GRAPH | BIT_LOWER,    # Lowercase
    'Lt': BIT_ALPHA | BIT_GRAPH | BIT_UPPER,    # Title case. Upper?
    'Lm': BIT_ALPHA | BIT_GRAPH,                # Modifier. Case?
    'Lo': BIT_ALPHA | BIT_GRAPH,                # "Other" letter (e.g. Ideograph)
    'Nd': BIT_DIGIT | BIT_GRAPH,                # Decimal digit
    'Nl': BIT_GRAPH,                            # Letter-like numeric character
    'No': BIT_GRAPH,                            # Other numeric
    'Pc': BIT_PUNCT | BIT_GRAPH,                # Connecting punctuation
    'Pd': BIT_PUNCT | BIT_GRAPH,                # Dash punctuation
    'Ps': BIT_PUNCT | BIT_GRAPH,                # Opening punctuation
    'Pe': BIT_PUNCT | BIT_GRAPH,                # Closing punctuation
    'Pi': BIT_PUNCT | BIT_GRAPH,                # Opening quote
    'Pf': BIT_PUNCT | BIT_GRAPH,                # Closing quote
    'Po': BIT_PUNCT | BIT_GRAPH,                # Other punctuation
    # Symbols count as punctuation: in the "C" locale ispunct() is true for
    # every printing character that is neither a space nor alphanumeric, and
    # ASCII characters such as '+', '$', '^' and '|' live in these categories.
    'Sm': BIT_PUNCT | BIT_GRAPH,                # Mathematical symbol
    'Sc': BIT_PUNCT | BIT_GRAPH,                # Currency symbol
    'Sk': BIT_PUNCT | BIT_GRAPH,                # Non-letterlike modifier symbol
    'So': BIT_PUNCT | BIT_GRAPH,                # Other symbol
    'Zs': BIT_SPACE,                            # Non-zero-width space character
    'Zl': BIT_SPACE,                            # Line separator
    'Zp': BIT_SPACE,                            # Paragraph separator
    'Cc': BIT_CNTRL,                            # C0/C1 control codes
}

# Characters with special properties.
# An entry here REPLACES (it is not OR-ed with) the category mapping above, so
# every entry has to spell out the complete set of bits for its character.
special = {
    # Blank characters
    0x0020: BIT_SPACE | BIT_BLANK,              # space
    0x0009: BIT_SPACE | BIT_BLANK | BIT_CNTRL,  # tab (still a control code)

    # Remaining standard C white-space characters. Their category is Cc, which
    # on its own would make them control characters but not white-space.
    0x000A: BIT_SPACE | BIT_CNTRL,              # line feed
    0x000B: BIT_SPACE | BIT_CNTRL,              # vertical tab
    0x000C: BIT_SPACE | BIT_CNTRL,              # form feed
    0x000D: BIT_SPACE | BIT_CNTRL,              # carriage return

    # Digits
    0x0030: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH,
    0x0031: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH,
    0x0032: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH,
    0x0033: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH,
    0x0034: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH,
    0x0035: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH,
    0x0036: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH,
    0x0037: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH,
    0x0038: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH,
    0x0039: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH,

    # A-F (hex uppercase)
    0x0041: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_UPPER,
    0x0042: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_UPPER,
    0x0043: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_UPPER,
    0x0044: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_UPPER,
    0x0045: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_UPPER,
    0x0046: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_UPPER,

    # a-f (hex lowercase)
    0x0061: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_LOWER,
    0x0062: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_LOWER,
    0x0063: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_LOWER,
    0x0064: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_LOWER,
    0x0065: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_LOWER,
    0x0066: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_LOWER,
}
96
class Group:
    """One row of the generated table.

    A group records a starting code point, the flag bits and the case-mapping
    deltas that all of its members share, and the (code point, name) pairs
    that were added to it.
    """

    def __init__(self, start, flags, upper_delta, lower_delta):
        """Create an empty group beginning at code point `start`."""
        self.chars = []
        self.start, self.flags = start, flags
        self.upper_delta, self.lower_delta = upper_delta, lower_delta

    def add_char(self, num, label):
        """Remember code point `num` and its name `label` as a member."""
        self.chars += [(num, label)]

    def write_to_file(self, f):
        """Write the group to `f`.

        Emits one `//` comment line per member followed by a single C
        initializer holding start, member count, flags, lower-case delta and
        upper-case delta, in that order.
        """
        for num, label in self.chars:
            f.write("// %x %s\n" % (num, label))
        row = (self.start, len(self.chars), self.flags,
               self.lower_delta, self.upper_delta)
        f.write("    { 0x%X, \t0x%X, \t0x%X, \t%d, \t%d },\n" % row)

    def next(self):
        """Return the start plus the number of members added so far."""
        return len(self.chars) + self.start
116
# Every Group created so far, in the order the characters were added.
groups = []

def add_char(num, upper, lower, bits, label):
    """Record one character, extending the most recent group when possible.

    `upper` and `lower` are the code points of the character's case mappings;
    they are stored relative to `num`. The character joins the last group only
    if it is the code point that group expects next and it has the same flag
    bits and the same two deltas. Otherwise a new group is started.
    """
    delta_up = upper - num
    delta_low = lower - num

    tail = groups[-1] if groups else None
    extends_tail = (
        tail is not None
        and tail.next() == num
        and tail.flags == bits
        and tail.upper_delta == delta_up
        and tail.lower_delta == delta_low
    )

    if not extends_tail:
        tail = Group(num, bits, delta_up, delta_low)
        groups.append(tail)

    tail.add_char(num, label)
134
# Read UnicodeData.txt from the current directory, collect its characters into
# groups and write the table to _PDCLIB_unicodedata.c. If anything fails, the
# (incomplete) output file is removed and the error is re-raised.
with open('UnicodeData.txt', 'r') as in_file:
    out_file = open('_PDCLIB_unicodedata.c', 'w')
    try:
        # Start of a pending "<..., First>" / "<..., Last>" range, if any.
        range_first = None

        for line in in_file:
            if not line.strip():
                # Tolerate blank lines (e.g. at the end of the file).
                continue

            (num_hex, name, category, combining_class, bidi_class, decomposition,
             numeric_type, numeric_digit, numeric_value, mirrored, u1name, iso_com,
             upper_case_hex, lower_case_hex, title_case_hex) = line.split(";")

            num        = int(num_hex, 16)
            upper_case = int(upper_case_hex, 16) if len(upper_case_hex) else num
            lower_case = int(lower_case_hex, 16) if len(lower_case_hex) else num
            bits = special.get(num, categories.get(category, 0))

            # Large blocks (CJK ideographs, Hangul syllables, ...) are not
            # listed character by character. They appear as a pair of lines
            # named "<Block, First>" and "<Block, Last>" which stand for every
            # code point in between.
            if name.endswith(', First>'):
                range_first = num
                continue

            if name.endswith(', Last>'):
                if range_first is None:
                    raise ValueError(
                        "range end %s without a matching start" % num_hex)
                # Ranges without any flags (e.g. private use) carry no
                # information and are left out. The case mapping fields of
                # range lines are not consulted, so each member maps to itself.
                if bits != 0:
                    label = name[1:-len(', Last>')]
                    for code in range(range_first, num + 1):
                        add_char(code, code, code, bits, label)
                range_first = None
                continue

            # Skip characters that have no flags and no case mapping. A missing
            # mapping is represented by the character itself (see above), so
            # the comparison has to be against num, not against zero.
            if upper_case == num and lower_case == num and bits == 0:
                continue

            add_char(num, upper_case, lower_case, bits, name)

        out_file.write("""
/* Unicode Character Information ** AUTOMATICALLY GENERATED FILE **
 *
 * This file is part of the PDCLib public domain C Library, but is automatically
 * generated from the Unicode character data information file found at
 *   ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
 * 
 * As a result, the licensing that applies to that file also applies to this 
 * file. The licensing which applies to the Unicode character data can be found 
 * in Exhibit 1 of the Unicode Terms of Use, found at
 *   http://www.unicode.org/copyright.html#Exhibit1
 */
 #ifndef REGTEST
 #include <_PDCLIB_locale.h>

const _PDCLIB_wcinfo_t _PDCLIB_wcinfo[] = {
//   { value, \tlength, \tflags,\tlower,\tupper\t}, // name
 """)
        for g in groups:
            g.write_to_file(out_file)
        out_file.write('};\n\n')
        out_file.write("""
const size_t _PDCLIB_wcinfo_size = sizeof(_PDCLIB_wcinfo) / sizeof(_PDCLIB_wcinfo[0]);
#endif

#ifdef TEST
#include "_PDCLIB_test.h"
int main( void )
{
    return TEST_RESULTS;
}
#endif

""")
    except BaseException:
        # Never leave a truncated table behind, not even on Ctrl-C. The file
        # has to be closed before it can be removed on some platforms.
        out_file.close()
        os.remove('_PDCLIB_unicodedata.c')
        raise
    else:
        out_file.close()