2 # -*- coding: ascii -*-
\r
3 # Unicode Data Converter
\r
5 # This file is part of the Public Domain C Library (PDCLib).
\r
6 # Permission is granted to use, modify, and / or redistribute at will.
\r
8 Converts the character information provided by Unicode in the UnicodeData.txt
\r
9 file from the Unicode character database into a table for use by PDCLib.
\r
11 Usage: Download the UnicodeData.txt file to the same directory as this script
\r
12 and then run it. Both Python 2 and 3 are supported.
\r
14 Download the data from
\r
15 ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
\r
17 We do some simple "run" compression, because characters in the Unicode Data file
\r
18 tend to come in groups with the same properties.
\r
22 # MUST BE KEPT SYNCHRONIZED WITH _PDCLIB_locale.h
\r
# Category to bitfield mapping.
#
# Keys are the two-letter general-category codes read from the third field of
# each UnicodeData.txt record; values are the BIT_* classification flags that
# category receives.  Categories that are not listed here are looked up with a
# default of 0 (no flags) by the parsing loop.
categories = {
    'Lu': BIT_ALPHA | BIT_GRAPH | BIT_UPPER,    # Uppercase
    'Ll': BIT_ALPHA | BIT_GRAPH | BIT_LOWER,    # Lowercase
    'Lt': BIT_ALPHA | BIT_GRAPH | BIT_UPPER,    # Title case. Upper?
    'Lm': BIT_ALPHA | BIT_GRAPH,                # Modifier. Case?
    'Lo': BIT_ALPHA | BIT_GRAPH,                # "Other" letter (e.g. Ideograph)
    'Nd': BIT_DIGIT | BIT_GRAPH,                # Decimal digit
    'Nl': BIT_GRAPH,                            # Letter-like numeric character
    'No': BIT_GRAPH,                            # Other numeric
    'Pc': BIT_PUNCT | BIT_GRAPH,                # Connecting punctuation
    'Pd': BIT_PUNCT | BIT_GRAPH,                # Dash punctuation
    'Ps': BIT_PUNCT | BIT_GRAPH,                # Opening punctuation
    'Pe': BIT_PUNCT | BIT_GRAPH,                # Closing punctuation
    'Pi': BIT_PUNCT | BIT_GRAPH,                # Opening quote
    'Pf': BIT_PUNCT | BIT_GRAPH,                # Closing quote
    'Po': BIT_PUNCT | BIT_GRAPH,                # Other punctuation
    'Sm': BIT_GRAPH,                            # Mathematical symbol
    'Sc': BIT_GRAPH,                            # Currency symbol
    'Sk': BIT_GRAPH,                            # Non-letterlike modifier symbol
    'So': BIT_GRAPH,                            # Other symbol
    'Zs': BIT_SPACE,                            # Non-zero-width space character
    'Zl': BIT_SPACE,                            # Line separator
    'Zp': BIT_SPACE,                            # Paragraph separator
    'Cc': BIT_CNTRL,                            # C0/C1 control codes
}
\r
# Characters with special properties.
#
# Per-code-point overrides.  When a code point appears here, the parsing loop
# uses this value INSTEAD of the category-derived flags (it is not OR-ed with
# them), so every entry spells out the complete flag set for that character.
special = {
    # Blank characters
    0x0020: BIT_SPACE | BIT_BLANK,              # space
    0x0009: BIT_SPACE | BIT_BLANK,              # tab

    # 0-9 (decimal digits, which are also hexadecimal digits)
    0x0030: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH,
    0x0031: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH,
    0x0032: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH,
    0x0033: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH,
    0x0034: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH,
    0x0035: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH,
    0x0036: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH,
    0x0037: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH,
    0x0038: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH,
    0x0039: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH,

    # A-F (hex uppercase)
    0x0041: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_UPPER,
    0x0042: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_UPPER,
    0x0043: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_UPPER,
    0x0044: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_UPPER,
    0x0045: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_UPPER,
    0x0046: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_UPPER,

    # a-f (hex lowercase)
    0x0061: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_LOWER,
    0x0062: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_LOWER,
    0x0063: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_LOWER,
    0x0064: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_LOWER,
    0x0065: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_LOWER,
    0x0066: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_LOWER,
}
\r
class Group(object):
    """A run of consecutive code points sharing flags and case deltas.

    Attributes:
        start: first code point of the run.
        flags: classification bit set shared by every member.
        upper_delta: offset from a member to its uppercase mapping.
        lower_delta: offset from a member to its lowercase mapping.
        chars: list of ``(code_point, label)`` tuples, in insertion order.
    """

    def __init__(self, start, flags, upper_delta, lower_delta):
        """Create an empty run beginning at code point *start*."""
        self.start = start
        self.flags = flags
        self.upper_delta = upper_delta
        self.lower_delta = lower_delta
        self.chars = []

    def add_char(self, num, label):
        """Record code point *num* (described by *label*) as a member."""
        self.chars.append((num, label))

    def write_to_file(self, f):
        """Write this run to *f* as one C table row preceded by comments.

        One ``// <hex code point> <label>`` line is written per member,
        followed by a single initializer row.  Note the column order of the
        row: start, length, flags, LOWER delta, then UPPER delta.
        """
        for char in self.chars:
            f.write("// %x %s\n" % char)
        f.write(" { 0x%X, \t0x%X, \t0x%X, \t%d, \t%d },\n" %
                (self.start, len(self.chars), self.flags,
                 self.lower_delta, self.upper_delta))

    def next(self):
        """Return the code point that would directly extend this run."""
        return self.start + len(self.chars)
\r
def add_char(num, upper, lower, bits, label):
    """Add one code point to the run-compressed table.

    The character joins an existing group when it directly follows that
    group's last member and has the same flags and the same upper/lower case
    deltas; otherwise a new group is started for it.

    Args:
        num: the code point.
        upper: code point of its uppercase mapping.
        lower: code point of its lowercase mapping.
        bits: classification flag set for the character.
        label: descriptive name, kept for the comment lines in the output.
    """
    # Deltas rather than absolute mappings are compared, so that e.g. a whole
    # alphabet whose members all map "+32" can collapse into a single run.
    upper_delta = upper - num
    lower_delta = lower - num

    if groups:
        # NOTE(review): the statement binding `cur` was lost from this copy of
        # the file; the most recently created group is assumed - confirm.
        cur = groups[-1]
        if (num == cur.next() and cur.flags == bits and
                cur.upper_delta == upper_delta and
                cur.lower_delta == lower_delta):
            cur.add_char(num, label)
            # NOTE(review): early exit restored; without it the character
            # would also open a new group below - confirm against upstream.
            return

    g = Group(num, bits, upper_delta, lower_delta)
    g.add_char(num, label)
    # NOTE(review): registration of the new group restored - confirm.
    groups.append(g)
\r
# Open the Unicode character database for reading and the C source file that
# will receive the generated table for writing.
135 in_file = open('UnicodeData.txt', 'r')
\r
136 out_file = open('_PDCLIB_unicodedata.c', 'w')
\r
# Parse the database one record per line.  Each record is split on ';' and
# must yield exactly the 15 fields named below, otherwise the unpacking raises
# ValueError.  The line is not stripped first, so title_case_hex still carries
# the trailing newline; it is not used in the code visible here.
138 for line in in_file:
\r
139 (num_hex, name, category, combining_class, bidi_class, decomposition,
\r
140 numeric_type, numeric_digit, numeric_value, mirrored, u1name, iso_com,
\r
141 upper_case_hex, lower_case_hex, title_case_hex) = line.split(";")
\r
# Code points and case mappings are hexadecimal.  An empty case-mapping field
# falls back to the code point itself, which add_char() turns into a delta
# of 0.
143 num = int(num_hex, 16)
\r
144 upper_case = int(upper_case_hex, 16) if len(upper_case_hex) else num
\r
145 lower_case = int(lower_case_hex, 16) if len(lower_case_hex) else num
\r
# A per-code-point entry in `special` replaces the category-derived flags
# entirely; categories missing from `categories` yield 0 (no flags).
146 bits = special.get(num, categories.get(category, 0))
\r
# NOTE(review): the statement guarded by this test is not visible in this copy
# of the file - presumably it skips the record; confirm against upstream.
148 if upper_case == 0 and lower_case == 0 and bits == 0:
\r
# Hand the record to the run-compression step.
151 add_char(num, upper_case, lower_case, bits, name)
\r
154 /* Unicode Character Information ** AUTOMATICALLY GENERATED FILE **
\r
156 * This file is part of the PDCLib public domain C Library, but is automatically
\r
157 * generated from the Unicode character data information file found at
\r
158 * ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
\r
160 * As a result, the licensing that applies to that file also applies to this
\r
161 * file. The licensing which applies to the Unicode character data can be found
\r
162 * in Exhibit 1 of the Unicode Terms of Use, found at
\r
163 * http://www.unicode.org/copyright.html#Exhibit1
\r
166 #include <_PDCLIB_locale.h>
\r
168 const _PDCLIB_wcinfo_t _PDCLIB_wcinfo[] = {
\r
169 // { value, \tlength, \tflags,\tlower,\tupper\t}, // name
\r
172 g.write_to_file(out_file)
\r
173 out_file.write('};\n\n')
\r
175 const size_t _PDCLIB_wcinfo_size = sizeof(_PDCLIB_wcinfo) / sizeof(_PDCLIB_wcinfo[0]);
\r
179 #include <_PDCLIB_test.h>
\r
182 return TEST_RESULTS;
\r
190 os.remove('_PDCLIB_unicodedata.c')
\r