From 36aeca966a42f071466086ddaa4f50e9b9b51c3f Mon Sep 17 00:00:00 2001 From: Owen Shepherd Date: Sat, 16 Mar 2013 18:16:26 +0000 Subject: [PATCH] PDCLIB-3 Add XDIGIT to list of bits in Unicode character data --- functions/locale/UnicodeData.py | 35 +++++++++++++++++++- functions/locale/_PDCLIB_unicodedata.c | 44 +++++++++++++------------- 2 files changed, 56 insertions(+), 23 deletions(-) diff --git a/functions/locale/UnicodeData.py b/functions/locale/UnicodeData.py index 42a8f9f..e31ec2e 100644 --- a/functions/locale/UnicodeData.py +++ b/functions/locale/UnicodeData.py @@ -26,6 +26,7 @@ BIT_SPACE = 32 BIT_LOWER = 64 BIT_UPPER = 128 BIT_DIGIT = 256 +BIT_XDIGT = 512 # Category to bitfield mapping categories = { @@ -54,6 +55,38 @@ categories = { 'Cc': BIT_CNTRL, # C0/C1 control codes } +# Characters with special properties +special = { + # Digits + 0x0030: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH, + 0x0031: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH, + 0x0032: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH, + 0x0033: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH, + 0x0034: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH, + 0x0035: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH, + 0x0036: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH, + 0x0037: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH, + 0x0038: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH, + 0x0039: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH, + + # A-F (hex uppercase) + 0x0041: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_UPPER, + 0x0042: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_UPPER, + 0x0043: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_UPPER, + 0x0044: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_UPPER, + 0x0045: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_UPPER, + 0x0046: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_UPPER, + + + # a-f (hex lowercase) + 0x0061: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_LOWER, + 0x0062: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_LOWER, + 0x0063: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_LOWER, + 0x0064: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_LOWER, + 0x0065: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_LOWER, + 0x0066: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_LOWER, +} + in_file = open('UnicodeData.txt', 'r') out_file = open('_PDCLIB_unicodedata.c', 'w') try: @@ -83,7 +116,7 @@ try: num = int(num_hex, 16) upper_case = int(upper_case_hex, 16) if len(upper_case_hex) else num lower_case = int(lower_case_hex, 16) if len(lower_case_hex) else num - bits = categories.get(category, 0) + bits = special.get(num, categories.get(category, 0)) if upper_case == 0 and lower_case == 0 and bits == 0: continue diff --git a/functions/locale/_PDCLIB_unicodedata.c b/functions/locale/_PDCLIB_unicodedata.c index e4e9c83..928226c 100644 --- a/functions/locale/_PDCLIB_unicodedata.c +++ b/functions/locale/_PDCLIB_unicodedata.c @@ -63,16 +63,16 @@ { 0x2D, 0x18, 0x2D, 0x2D }, // HYPHEN-MINUS { 0x2E, 0x18, 0x2E, 0x2E }, // FULL STOP { 0x2F, 0x18, 0x2F, 0x2F }, // SOLIDUS - { 0x30, 0x108, 0x30, 0x30 }, // DIGIT ZERO - { 0x31, 0x108, 0x31, 0x31 }, // DIGIT ONE - { 0x32, 0x108, 0x32, 0x32 }, // DIGIT TWO - { 0x33, 0x108, 0x33, 0x33 }, // DIGIT THREE - { 0x34, 0x108, 0x34, 0x34 }, // DIGIT FOUR - { 0x35, 0x108, 0x35, 0x35 }, // DIGIT FIVE - { 0x36, 0x108, 0x36, 0x36 }, // DIGIT SIX - { 0x37, 0x108, 0x37, 0x37 }, // DIGIT SEVEN - { 0x38, 0x108, 0x38, 0x38 }, // DIGIT EIGHT - { 0x39, 0x108, 0x39, 0x39 }, // DIGIT NINE + { 0x30, 0x308, 0x30, 0x30 }, // DIGIT ZERO + { 0x31, 0x308, 0x31, 0x31 }, // DIGIT ONE + { 0x32, 0x308, 0x32, 0x32 }, // DIGIT TWO + { 0x33, 0x308, 0x33, 0x33 }, // DIGIT THREE + { 0x34, 0x308, 0x34, 0x34 }, // DIGIT FOUR + { 0x35, 0x308, 0x35, 0x35 }, // DIGIT FIVE + { 0x36, 0x308, 0x36, 0x36 }, // DIGIT SIX + { 0x37, 0x308, 0x37, 0x37 }, // DIGIT SEVEN + { 0x38, 0x308, 0x38, 0x38 }, // DIGIT EIGHT + { 0x39, 0x308, 0x39, 0x39 }, // DIGIT NINE { 0x3A, 0x18, 0x3A, 0x3A }, // COLON { 0x3B, 0x18, 0x3B, 0x3B }, // SEMICOLON { 0x3C, 0x8, 0x3C, 0x3C }, // LESS-THAN SIGN @@ -80,12 +80,12 @@ { 0x3E, 0x8, 0x3E, 0x3E }, // GREATER-THAN SIGN { 0x3F, 0x18, 0x3F, 0x3F }, // QUESTION MARK { 0x40, 0x18, 0x40, 0x40 }, // COMMERCIAL AT - { 0x41, 0x89, 0x61, 0x41 }, // LATIN CAPITAL LETTER A - { 0x42, 0x89, 0x62, 0x42 }, // LATIN CAPITAL LETTER B - { 0x43, 0x89, 0x63, 0x43 }, // LATIN CAPITAL LETTER C - { 0x44, 0x89, 0x64, 0x44 }, // LATIN CAPITAL LETTER D - { 0x45, 0x89, 0x65, 0x45 }, // LATIN CAPITAL LETTER E - { 0x46, 0x89, 0x66, 0x46 }, // LATIN CAPITAL LETTER F + { 0x41, 0x289, 0x61, 0x41 }, // LATIN CAPITAL LETTER A + { 0x42, 0x289, 0x62, 0x42 }, // LATIN CAPITAL LETTER B + { 0x43, 0x289, 0x63, 0x43 }, // LATIN CAPITAL LETTER C + { 0x44, 0x289, 0x64, 0x44 }, // LATIN CAPITAL LETTER D + { 0x45, 0x289, 0x65, 0x45 }, // LATIN CAPITAL LETTER E + { 0x46, 0x289, 0x66, 0x46 }, // LATIN CAPITAL LETTER F { 0x47, 0x89, 0x67, 0x47 }, // LATIN CAPITAL LETTER G { 0x48, 0x89, 0x68, 0x48 }, // LATIN CAPITAL LETTER H { 0x49, 0x89, 0x69, 0x49 }, // LATIN CAPITAL LETTER I @@ -112,12 +112,12 @@ { 0x5E, 0x8, 0x5E, 0x5E }, // CIRCUMFLEX ACCENT { 0x5F, 0x18, 0x5F, 0x5F }, // LOW LINE { 0x60, 0x8, 0x60, 0x60 }, // GRAVE ACCENT - { 0x61, 0x49, 0x61, 0x41 }, // LATIN SMALL LETTER A - { 0x62, 0x49, 0x62, 0x42 }, // LATIN SMALL LETTER B - { 0x63, 0x49, 0x63, 0x43 }, // LATIN SMALL LETTER C - { 0x64, 0x49, 0x64, 0x44 }, // LATIN SMALL LETTER D - { 0x65, 0x49, 0x65, 0x45 }, // LATIN SMALL LETTER E - { 0x66, 0x49, 0x66, 0x46 }, // LATIN SMALL LETTER F + { 0x61, 0x249, 0x61, 0x41 }, // LATIN SMALL LETTER A + { 0x62, 0x249, 0x62, 0x42 }, // LATIN SMALL LETTER B + { 0x63, 0x249, 0x63, 0x43 }, // LATIN SMALL LETTER C + { 0x64, 0x249, 0x64, 0x44 }, // LATIN SMALL LETTER D + { 0x65, 0x249, 0x65, 0x45 }, // LATIN SMALL LETTER E + { 0x66, 0x249, 0x66, 0x46 }, // LATIN SMALL LETTER F { 0x67, 0x49, 0x67, 0x47 }, // LATIN SMALL LETTER G { 0x68, 0x49, 0x68, 0x48 }, // LATIN SMALL LETTER H { 0x69, 0x49, 0x69, 0x49 }, // LATIN SMALL LETTER I -- 2.40.0