2 # -*- coding: utf-8 -*-
\r
3 # Unicode Data Converter
\r
5 # This file is part of the Public Domain C Library (PDCLib).
\r
6 # Permission is granted to use, modify, and / or redistribute at will.
\r
8 Converts the character information provided by Unicode in the UnicodeData.txt
\r
9 file from the Unicode character database into a table for use by PDCLib.
\r
13 # MUST BE KEPT SYNCHRONIZED WITH _PDCLIB_locale.h
\r
24 # Category to bitfield mapping
\r
26 'Lu': BIT_ALPHA | BIT_GRAPH | BIT_UPPER, # Uppercase
\r
27 'Ll': BIT_ALPHA | BIT_GRAPH | BIT_LOWER, # Lowercase
\r
28 'Lt': BIT_ALPHA | BIT_GRAPH | BIT_UPPER, # Title case. Upper?
\r
29 'Lm': BIT_ALPHA | BIT_GRAPH, # Modifier. Case?
\r
30 'Lo': BIT_ALPHA | BIT_GRAPH, # "Other" letter (e.g. Ideograph)
\r
31 'Nd': BIT_DIGIT | BIT_GRAPH, # Decimal digit
\r
32 'Nl': BIT_GRAPH, # Letter-like numeric character
\r
33 'No': BIT_GRAPH, # Other numeric
\r
34 'Pc': BIT_PUNCT | BIT_GRAPH, # Connecting punctuation
\r
35 'Pd': BIT_PUNCT | BIT_GRAPH, # Dash punctuation
\r
36 'Ps': BIT_PUNCT | BIT_GRAPH, # Opening punctuation
\r
37 'Pe': BIT_PUNCT | BIT_GRAPH, # Closing punctuation
\r
38 'Pi': BIT_PUNCT | BIT_GRAPH, # Opening quote
\r
39 'Pf': BIT_PUNCT | BIT_GRAPH, # Closing quote
\r
40 'Po': BIT_PUNCT | BIT_GRAPH, # Other punctuation
\r
41 'Sm': BIT_GRAPH, # Mathematical symbol
\r
42 'Sc': BIT_GRAPH, # Currency symbol
\r
43 'Sk': BIT_GRAPH, # Non-letterlike modifier symbol
\r
44 'So': BIT_GRAPH, # Other symbol
\r
45 'Zs': BIT_SPACE | BIT_GRAPH | BIT_BLANK, # Non-zero-width space character
\r
46 'Zl': BIT_SPACE | BIT_GRAPH, # Line separator
\r
47 'Zp': BIT_SPACE | BIT_GRAPH, # Paragraph separator
\r
48 'Cc': BIT_CNTRL, # C0/C1 control codes
\r
51 in_file = open('UnicodeData.txt', 'r')  # text-mode input; relative path, so resolved against the current working directory
\r
52 out_file = open('_PDCLIB_unicodedata.c', 'w')  # generated C source; mode 'w' truncates any existing file of that name
\r
55 /* Unicode Character Information ** AUTOMATICALLY GENERATED FILE **
\r
57 * This file is part of the PDCLib public domain C Library, but is automatically
\r
58 * generated from the Unicode character data information file found at
\r
59 * ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
\r
61 * As a result, the licensing that applies to that file also applies to this
\r
62 * file. The licensing which applies to the Unicode character data can be found
\r
63 * in Exhibit 1 of the Unicode Terms of Use, found at
\r
64 * http://www.unicode.org/copyright.html#Exhibit1
\r
66 #include <_PDCLIB_locale.h>
\r
68 _PDCLIB_wctype_t _PDCLIB_wctype[] = {
\r
69 // { value,\tflags,\tlower,\tupper\t}, // name
\r
71 for line in in_file:  # one semicolon-separated record per input line
\r
72 (num_hex, name, category, combining_class, bidi_class, decomposition,
\r
73 numeric_type, numeric_digit, numeric_value, mirrored, u1name, iso_com,
\r
74 upper_case_hex, lower_case_hex, title_case_hex) = line.split(";")  # needs exactly 15 fields, otherwise the unpack raises ValueError; the line is not stripped, so the last field keeps its line terminator
\r
76 num = int(num_hex, 16)  # code point, parsed from hexadecimal
\r
77 upper_case = int(upper_case_hex, 16) if len(upper_case_hex) else num  # empty mapping field: the character maps to itself
\r
78 lower_case = int(lower_case_hex, 16) if len(lower_case_hex) else num  # empty mapping field: the character maps to itself
\r
79 bits = categories.get(category, 0)  # flag bits for this general category; categories absent from the mapping get 0
\r
81 if upper_case == 0 and lower_case == 0 and bits == 0:  # NOTE(review): both case values default to num, so this looks reachable only for code point 0 (or an explicit 0000 mapping); if the intent is to filter characters with no case mapping and no flags, a comparison against num may have been meant - confirm. The body of this branch is not visible in this excerpt.
\r
84 out_file.write(" { 0x%X,\t0x%X,\t0x%X,\t0x%X }, // %s\n" % (
\r
85 num, bits, lower_case, upper_case, name))  # one table row: value, flags, lower, upper, with the character name as a trailing // comment
\r
86 out_file.write('};\n\n')  # close the array initializer
\r
87 out_file.write('size_t _PDCLIB_wctype_size = sizeof(_PDCLIB_wctype) / sizeof(_PDCLIB_wctype[0]);\n\n')  # emit the element count of the generated table
\r
91 os.remove('_PDCLIB_unicodedata.c')
\r