--- /dev/null
+#!/usr/bin/python\r
+# -*- coding: utf-8 -*-\r
+# Unicode Data Converter\r
+#\r
+# This file is part of the Public Domain C Library (PDCLib).\r
+# Permission is granted to use, modify, and / or redistribute at will.\r
+"""\r
+Converts the character information provided by Unicode in the UnicodeData.txt\r
+file from the Unicode character database into a table for use by PDCLib.\r
+"""\r
+import os\r
+\r
+# Character-class flag bits. Each constant is a distinct power of two, so\r
+# several classes can be OR-ed together into a single bitfield per code point.\r
+# MUST BE KEPT SYNCHRONIZED WITH _PDCLIB_locale.h\r
+BIT_ALPHA = 1\r
+BIT_BLANK = 2\r
+BIT_CNTRL = 4\r
+BIT_GRAPH = 8\r
+BIT_PUNCT = 16\r
+BIT_SPACE = 32\r
+BIT_LOWER = 64\r
+BIT_UPPER = 128\r
+BIT_DIGIT = 256\r
+\r
+# Category to bitfield mapping\r
+# Keys are the two-letter category codes taken from the third field of each\r
+# UnicodeData.txt record; values are the BIT_* flags written out for that\r
+# category. The conversion loop looks categories up with a default of 0, so\r
+# any category not listed here produces no flags.\r
+# NOTE(review): the separator categories (Zs/Zl/Zp) carry BIT_GRAPH here,\r
+# whereas C's isgraph() excludes the space character -- confirm this is\r
+# intended before relying on the graph bit.\r
+categories = {\r
+ 'Lu': BIT_ALPHA | BIT_GRAPH | BIT_UPPER, # Uppercase\r
+ 'Ll': BIT_ALPHA | BIT_GRAPH | BIT_LOWER, # Lowercase\r
+ 'Lt': BIT_ALPHA | BIT_GRAPH | BIT_UPPER, # Title case. Upper?\r
+ 'Lm': BIT_ALPHA | BIT_GRAPH, # Modifier. Case?\r
+ 'Lo': BIT_ALPHA | BIT_GRAPH, # "Other" letter (e.g. Ideograph)\r
+ 'Nd': BIT_DIGIT | BIT_GRAPH, # Decimal digit\r
+ 'Nl': BIT_GRAPH, # Letter-like numeric character\r
+ 'No': BIT_GRAPH, # Other numeric\r
+ 'Pc': BIT_PUNCT | BIT_GRAPH, # Connecting punctuation\r
+ 'Pd': BIT_PUNCT | BIT_GRAPH, # Dash punctuation\r
+ 'Ps': BIT_PUNCT | BIT_GRAPH, # Opening punctuation\r
+ 'Pe': BIT_PUNCT | BIT_GRAPH, # Closing punctuation\r
+ 'Pi': BIT_PUNCT | BIT_GRAPH, # Opening quote\r
+ 'Pf': BIT_PUNCT | BIT_GRAPH, # Closing quote\r
+ 'Po': BIT_PUNCT | BIT_GRAPH, # Other punctuation\r
+ 'Sm': BIT_GRAPH, # Mathematical symbol\r
+ 'Sc': BIT_GRAPH, # Currency symbol\r
+ 'Sk': BIT_GRAPH, # Non-letterlike modifier symbol\r
+ 'So': BIT_GRAPH, # Other symbol\r
+ 'Zs': BIT_SPACE | BIT_GRAPH | BIT_BLANK, # Non-zero-width space character\r
+ 'Zl': BIT_SPACE | BIT_GRAPH, # Line separator\r
+ 'Zp': BIT_SPACE | BIT_GRAPH, # Paragraph separator\r
+ 'Cc': BIT_CNTRL, # C0/C1 control codes\r
+}\r
+\r
+# Conversion driver: reads 'UnicodeData.txt' and writes the generated C table\r
+# to '_PDCLIB_unicodedata.c' (both paths are relative to the working\r
+# directory). If anything inside the try block fails, both files are closed,\r
+# the partially written output is deleted and the error is re-raised.\r
+# NOTE(review): both files are opened before the try block, so if the second\r
+# open() fails in_file is never explicitly closed.\r
+in_file = open('UnicodeData.txt', 'r')\r
+out_file = open('_PDCLIB_unicodedata.c', 'w')\r
+try:\r
+ # Fixed preamble of the generated file: licence note, the include, and the\r
+ # opening of the _PDCLIB_wctype array initializer.\r
+ out_file.write("""\r
+/* Unicode Character Information ** AUTOMATICALLY GENERATED FILE **\r
+ *\r
+ * This file is part of the PDCLib public domain C Library, but is automatically\r
+ * generated from the Unicode character data information file found at\r
+ * ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt\r
+ * \r
+ * As a result, the licensing that applies to that file also applies to this \r
+ * file. The licensing which applies to the Unicode character data can be found \r
+ * in Exhibit 1 of the Unicode Terms of Use, found at\r
+ * http://www.unicode.org/copyright.html#Exhibit1\r
+ */\r
+ #include <_PDCLIB_locale.h>\r
+\r
+ _PDCLIB_wctype_t _PDCLIB_wctype[] = {\r
+// { value,\tflags,\tlower,\tupper\t}, // name\r
+ """)\r
+ for line in in_file:\r
+ # The unpacking below requires exactly 15 ';'-separated fields per line;\r
+ # any other count raises ValueError and triggers the cleanup path.\r
+ # The line is not stripped, so the last field (title_case_hex) still\r
+ # carries the line terminator; it is not used afterwards.\r
+ # NOTE(review): records are emitted one-for-one, so if the input encodes\r
+ # large ranges as a pair of "First"/"Last" records only the two end\r
+ # points get table rows -- confirm whether ranges need expanding.\r
+ (num_hex, name, category, combining_class, bidi_class, decomposition,\r
+ numeric_type, numeric_digit, numeric_value, mirrored, u1name, iso_com, \r
+ upper_case_hex, lower_case_hex, title_case_hex) = line.split(";")\r
+\r
+ num = int(num_hex, 16)\r
+ # An empty case-mapping field means the code point maps to itself.\r
+ upper_case = int(upper_case_hex, 16) if len(upper_case_hex) else num\r
+ lower_case = int(lower_case_hex, 16) if len(lower_case_hex) else num\r
+ # Unlisted categories get no flag bits.\r
+ bits = categories.get(category, 0)\r
+\r
+ # NOTE(review): since empty mappings fall back to num above, this test is\r
+ # only true when both mappings resolve to 0 and the category is unmapped.\r
+ # It therefore does not drop every flag-less, caseless character --\r
+ # confirm whether comparing against num was intended.\r
+ if upper_case == 0 and lower_case == 0 and bits == 0:\r
+ continue\r
+\r
+ # One initializer row in hex: code point, flag bits, lowercase mapping,\r
+ # uppercase mapping, followed by the character name as a C comment.\r
+ out_file.write(" { 0x%X,\t0x%X,\t0x%X,\t0x%X }, // %s\n" % (\r
+ num, bits, lower_case, upper_case, name))\r
+ # Close the array initializer and emit its element count.\r
+ out_file.write('};\n\n')\r
+ out_file.write('size_t _PDCLIB_wctype_size = sizeof(_PDCLIB_wctype) / sizeof(_PDCLIB_wctype[0]);\n\n')\r
+# Bare except: cleanup runs for every exception type, including\r
+# KeyboardInterrupt and SystemExit; the exception is re-raised afterwards.\r
+except:\r
+ in_file.close()\r
+ out_file.close()\r
+ # Do not leave a truncated generated file behind.\r
+ os.remove('_PDCLIB_unicodedata.c')\r
+ raise\r
+# Success path: nothing to undo, just close both files.\r
+else:\r
+ in_file.close()\r
+ out_file.close()\r