pd.if.org Git - pdclib/blob - functions/locale/UnicodeData.py

   1 #!/usr/bin/python\r
   2 # -*- coding: <encoding name> -*-\r
   3 # Unicode Data Converter\r
   4 #\r
   5 # This file is part of the Public Domain C Library (PDCLib).\r
   6 # Permission is granted to use, modify, and / or redistribute at will.\r
   7 """\r
   8 Converts the character information provided by Unicode in the UnicodeData.txt\r
   9 file from the Unicode character database into a table for use by PDCLib.\r
  10 """\r
  11 import os\r
  12 \r
  13 # MUST BE KEPT SYNCHRONIZED WITH _PDCLIB_locale.h\r
  14 BIT_ALPHA =   1\r
  15 BIT_BLANK =   2\r
  16 BIT_CNTRL =   4\r
  17 BIT_GRAPH =   8\r
  18 BIT_PUNCT =  16\r
  19 BIT_SPACE =  32\r
  20 BIT_LOWER =  64\r
  21 BIT_UPPER = 128\r
  22 BIT_DIGIT = 256\r
  23 \r
  24 # Category to bitfield mapping\r
  25 categories = {\r
  26     'Lu': BIT_ALPHA | BIT_GRAPH | BIT_UPPER,    # Uppercase\r
  27     'Ll': BIT_ALPHA | BIT_GRAPH | BIT_LOWER,    # Lowercase\r
  28     'Lt': BIT_ALPHA | BIT_GRAPH | BIT_UPPER,    # Title case. Upper?\r
  29     'Lm': BIT_ALPHA | BIT_GRAPH,                # Modifier. Case?\r
  30     'Lo': BIT_ALPHA | BIT_GRAPH,                # "Other" letter (e.g. Ideograph)\r
  31     'Nd': BIT_DIGIT | BIT_GRAPH,                # Decimal digit\r
  32     'Nl': BIT_GRAPH,                            # Letter-like numeric character\r
  33     'No': BIT_GRAPH,                            # Other numeric\r
  34     'Pc': BIT_PUNCT | BIT_GRAPH,                # Connecting punctuation\r
  35     'Pd': BIT_PUNCT | BIT_GRAPH,                # Dash punctuation\r
  36     'Ps': BIT_PUNCT | BIT_GRAPH,                # Opening punctuation\r
  37     'Pe': BIT_PUNCT | BIT_GRAPH,                # Closing punctuation\r
  38     'Pi': BIT_PUNCT | BIT_GRAPH,                # Opening quote\r
  39     'Pf': BIT_PUNCT | BIT_GRAPH,                # Closing quote\r
  40     'Po': BIT_PUNCT | BIT_GRAPH,                # Other punctuation\r
  41     'Sm': BIT_GRAPH,                            # Mathematical symbol\r
  42     'Sc': BIT_GRAPH,                            # Currency symbol\r
  43     'Sk': BIT_GRAPH,                            # Non-letterlike modifier symbol\r
  44     'So': BIT_GRAPH,                            # Other symbol\r
  45     'Zs': BIT_SPACE | BIT_GRAPH | BIT_BLANK,    # Non-zero-width space character\r
  46     'Zl': BIT_SPACE | BIT_GRAPH,                # Line separator\r
  47     'Zp': BIT_SPACE | BIT_GRAPH,                # Paragraph separator\r
  48     'Cc': BIT_CNTRL,                            # C0/C1 control codes\r
  49 }\r
  50 \r
  51 in_file  = open('UnicodeData.txt', 'r')\r
  52 out_file = open('_PDCLIB_unicodedata.c', 'w')\r
  53 try:\r
  54     out_file.write("""\r
  55 /* Unicode Character Information ** AUTOMATICALLY GENERATED FILE **\r
  56  *\r
  57  * This file is part of the PDCLib public domain C Library, but is automatically\r
  58  * generated from the Unicode character data information file found at\r
  59  *   ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt\r
  60  * \r
  61  * As a result, the licensing that applies to that file also applies to this \r
  62  * file. The licensing which applies to the Unicode character data can be found \r
  63  * in Exhibit 1 of the Unicode Terms of Use, found at\r
  64  *   http://www.unicode.org/copyright.html#Exhibit1\r
  65  */\r
  66  #include <_PDCLIB_locale.h>\r
  67 \r
  68  _PDCLIB_wctype_t _PDCLIB_wctype[] = {\r
  69 //   { value,\tflags,\tlower,\tupper\t}, // name\r
  70  """)\r
  71     for line in in_file:\r
  72         (num_hex, name, category, combining_class, bidi_class, decomposition,\r
  73          numeric_type, numeric_digit, numeric_value, mirrored, u1name, iso_com, \r
  74          upper_case_hex, lower_case_hex, title_case_hex) = line.split(";")\r
  75 \r
  76         num       = int(num_hex, 16)\r
  77         upper_case = int(upper_case_hex, 16) if len(upper_case_hex) else num\r
  78         lower_case = int(lower_case_hex, 16) if len(lower_case_hex) else num\r
  79         bits = categories.get(category, 0)\r
  80 \r
  81         if upper_case == 0 and lower_case == 0 and bits == 0:\r
  82             continue\r
  83 \r
  84         out_file.write("    { 0x%X,\t0x%X,\t0x%X,\t0x%X }, // %s\n" % (\r
  85             num, bits, lower_case, upper_case, name))\r
  86     out_file.write('};\n\n')\r
  87     out_file.write('size_t _PDCLIB_wctype_size = sizeof(_PDCLIB_wctype) / sizeof(_PDCLIB_wctype[0]);\n\n')\r
  88 except:\r
  89     in_file.close()\r
  90     out_file.close()\r
  91     os.remove('_PDCLIB_unicodedata.c')\r
  92     raise\r
  93 else:\r
  94     in_file.close()\r
  95     out_file.close()\r