From c7496dafe5261c2f0d0c4becc036ac9c9e5f2b64 Mon Sep 17 00:00:00 2001 From: Owen Shepherd Date: Sat, 16 Mar 2013 18:54:03 +0000 Subject: [PATCH] PDCLIB-3 correct classification of space characters --- functions/locale/UnicodeData.py | 10 ++++-- functions/locale/_PDCLIB_unicodedata.c | 42 +++++++++++++------------- 2 files changed, 28 insertions(+), 24 deletions(-) diff --git a/functions/locale/UnicodeData.py b/functions/locale/UnicodeData.py index e31ec2e..6fe74cb 100644 --- a/functions/locale/UnicodeData.py +++ b/functions/locale/UnicodeData.py @@ -49,14 +49,18 @@ categories = { 'Sc': BIT_GRAPH, # Currency symbol 'Sk': BIT_GRAPH, # Non-letterlike modifier symbol 'So': BIT_GRAPH, # Other symbol - 'Zs': BIT_SPACE | BIT_GRAPH | BIT_BLANK, # Non-zero-width space character - 'Zl': BIT_SPACE | BIT_GRAPH, # Line separator - 'Zp': BIT_SPACE | BIT_GRAPH, # Paragraph separator + 'Zs': BIT_SPACE, # Non-zero-width space character + 'Zl': BIT_SPACE, # Line separator + 'Zp': BIT_SPACE, # Paragraph separator 'Cc': BIT_CNTRL, # C0/C1 control codes } # Characters with special properties special = { + # Blank characters + 0x0020: BIT_SPACE | BIT_BLANK, # space + 0x0009: BIT_SPACE | BIT_BLANK, # tab + # Digits 0x0030: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH, 0x0031: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH, diff --git a/functions/locale/_PDCLIB_unicodedata.c b/functions/locale/_PDCLIB_unicodedata.c index 928226c..dbfc02a 100644 --- a/functions/locale/_PDCLIB_unicodedata.c +++ b/functions/locale/_PDCLIB_unicodedata.c @@ -24,7 +24,7 @@ { 0x6, 0x4, 0x6, 0x6 }, // { 0x7, 0x4, 0x7, 0x7 }, // { 0x8, 0x4, 0x8, 0x8 }, // - { 0x9, 0x4, 0x9, 0x9 }, // + { 0x9, 0x22, 0x9, 0x9 }, // { 0xA, 0x4, 0xA, 0xA }, // { 0xB, 0x4, 0xB, 0xB }, // { 0xC, 0x4, 0xC, 0xC }, // @@ -47,7 +47,7 @@ { 0x1D, 0x4, 0x1D, 0x1D }, // { 0x1E, 0x4, 0x1E, 0x1E }, // { 0x1F, 0x4, 0x1F, 0x1F }, // - { 0x20, 0x2A, 0x20, 0x20 }, // SPACE + { 0x20, 0x22, 0x20, 0x20 }, // SPACE { 0x21, 0x18, 0x21, 0x21 }, // EXCLAMATION MARK { 0x22, 0x18, 0x22, 0x22 }, // QUOTATION MARK { 0x23, 0x18, 0x23, 0x23 }, // NUMBER SIGN @@ -175,7 +175,7 @@ { 0x9D, 0x4, 0x9D, 0x9D }, // { 0x9E, 0x4, 0x9E, 0x9E }, // { 0x9F, 0x4, 0x9F, 0x9F }, // - { 0xA0, 0x2A, 0xA0, 0xA0 }, // NO-BREAK SPACE + { 0xA0, 0x20, 0xA0, 0xA0 }, // NO-BREAK SPACE { 0xA1, 0x18, 0xA1, 0xA1 }, // INVERTED EXCLAMATION MARK { 0xA2, 0x8, 0xA2, 0xA2 }, // CENT SIGN { 0xA3, 0x8, 0xA3, 0xA3 }, // POUND SIGN @@ -4994,7 +4994,7 @@ { 0x167D, 0x9, 0x167D, 0x167D }, // CANADIAN SYLLABICS WOODS-CREE THWAA { 0x167E, 0x9, 0x167E, 0x167E }, // CANADIAN SYLLABICS WOODS-CREE FINAL TH { 0x167F, 0x9, 0x167F, 0x167F }, // CANADIAN SYLLABICS BLACKFOOT W - { 0x1680, 0x2A, 0x1680, 0x1680 }, // OGHAM SPACE MARK + { 0x1680, 0x20, 0x1680, 0x1680 }, // OGHAM SPACE MARK { 0x1681, 0x9, 0x1681, 0x1681 }, // OGHAM LETTER BEITH { 0x1682, 0x9, 0x1682, 0x1682 }, // OGHAM LETTER LUIS { 0x1683, 0x9, 0x1683, 0x1683 }, // OGHAM LETTER FEARN @@ -5313,7 +5313,7 @@ { 0x180B, 0x0, 0x180B, 0x180B }, // MONGOLIAN FREE VARIATION SELECTOR ONE { 0x180C, 0x0, 0x180C, 0x180C }, // MONGOLIAN FREE VARIATION SELECTOR TWO { 0x180D, 0x0, 0x180D, 0x180D }, // MONGOLIAN FREE VARIATION SELECTOR THREE - { 0x180E, 0x2A, 0x180E, 0x180E }, // MONGOLIAN VOWEL SEPARATOR + { 0x180E, 0x20, 0x180E, 0x180E }, // MONGOLIAN VOWEL SEPARATOR { 0x1810, 0x108, 0x1810, 0x1810 }, // MONGOLIAN DIGIT ZERO { 0x1811, 0x108, 0x1811, 0x1811 }, // MONGOLIAN DIGIT ONE { 0x1812, 0x108, 0x1812, 0x1812 }, // MONGOLIAN DIGIT TWO @@ -7032,17 +7032,17 @@ { 0x1FFC, 0x89, 0x1FF3, 0x1FFC }, // GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI { 0x1FFD, 0x8, 0x1FFD, 0x1FFD }, // GREEK OXIA { 0x1FFE, 0x8, 0x1FFE, 0x1FFE }, // GREEK DASIA - { 0x2000, 0x2A, 0x2000, 0x2000 }, // EN QUAD - { 0x2001, 0x2A, 0x2001, 0x2001 }, // EM QUAD - { 0x2002, 0x2A, 0x2002, 0x2002 }, // EN SPACE - { 0x2003, 0x2A, 0x2003, 0x2003 }, // EM SPACE - { 0x2004, 0x2A, 0x2004, 0x2004 }, // THREE-PER-EM SPACE - { 0x2005, 0x2A, 0x2005, 0x2005 }, // FOUR-PER-EM SPACE - { 0x2006, 0x2A, 0x2006, 0x2006 }, // SIX-PER-EM SPACE - { 0x2007, 0x2A, 0x2007, 0x2007 }, // FIGURE SPACE - { 0x2008, 0x2A, 0x2008, 0x2008 }, // PUNCTUATION SPACE - { 0x2009, 0x2A, 0x2009, 0x2009 }, // THIN SPACE - { 0x200A, 0x2A, 0x200A, 0x200A }, // HAIR SPACE + { 0x2000, 0x20, 0x2000, 0x2000 }, // EN QUAD + { 0x2001, 0x20, 0x2001, 0x2001 }, // EM QUAD + { 0x2002, 0x20, 0x2002, 0x2002 }, // EN SPACE + { 0x2003, 0x20, 0x2003, 0x2003 }, // EM SPACE + { 0x2004, 0x20, 0x2004, 0x2004 }, // THREE-PER-EM SPACE + { 0x2005, 0x20, 0x2005, 0x2005 }, // FOUR-PER-EM SPACE + { 0x2006, 0x20, 0x2006, 0x2006 }, // SIX-PER-EM SPACE + { 0x2007, 0x20, 0x2007, 0x2007 }, // FIGURE SPACE + { 0x2008, 0x20, 0x2008, 0x2008 }, // PUNCTUATION SPACE + { 0x2009, 0x20, 0x2009, 0x2009 }, // THIN SPACE + { 0x200A, 0x20, 0x200A, 0x200A }, // HAIR SPACE { 0x200B, 0x0, 0x200B, 0x200B }, // ZERO WIDTH SPACE { 0x200C, 0x0, 0x200C, 0x200C }, // ZERO WIDTH NON-JOINER { 0x200D, 0x0, 0x200D, 0x200D }, // ZERO WIDTH JOINER @@ -7072,14 +7072,14 @@ { 0x2025, 0x18, 0x2025, 0x2025 }, // TWO DOT LEADER { 0x2026, 0x18, 0x2026, 0x2026 }, // HORIZONTAL ELLIPSIS { 0x2027, 0x18, 0x2027, 0x2027 }, // HYPHENATION POINT - { 0x2028, 0x28, 0x2028, 0x2028 }, // LINE SEPARATOR - { 0x2029, 0x28, 0x2029, 0x2029 }, // PARAGRAPH SEPARATOR + { 0x2028, 0x20, 0x2028, 0x2028 }, // LINE SEPARATOR + { 0x2029, 0x20, 0x2029, 0x2029 }, // PARAGRAPH SEPARATOR { 0x202A, 0x0, 0x202A, 0x202A }, // LEFT-TO-RIGHT EMBEDDING { 0x202B, 0x0, 0x202B, 0x202B }, // RIGHT-TO-LEFT EMBEDDING { 0x202C, 0x0, 0x202C, 0x202C }, // POP DIRECTIONAL FORMATTING { 0x202D, 0x0, 0x202D, 0x202D }, // LEFT-TO-RIGHT OVERRIDE { 0x202E, 0x0, 0x202E, 0x202E }, // RIGHT-TO-LEFT OVERRIDE - { 0x202F, 0x2A, 0x202F, 0x202F }, // NARROW NO-BREAK SPACE + { 0x202F, 0x20, 0x202F, 0x202F }, // NARROW NO-BREAK SPACE { 0x2030, 0x18, 0x2030, 0x2030 }, // PER MILLE SIGN { 0x2031, 0x18, 0x2031, 0x2031 }, // PER TEN THOUSAND SIGN { 0x2032, 0x18, 0x2032, 0x2032 }, // PRIME @@ -7127,7 +7127,7 @@ { 0x205C, 0x18, 0x205C, 0x205C }, // DOTTED CROSS { 0x205D, 0x18, 0x205D, 0x205D }, // TRICOLON { 0x205E, 0x18, 0x205E, 0x205E }, // VERTICAL FOUR DOTS - { 0x205F, 0x2A, 0x205F, 0x205F }, // MEDIUM MATHEMATICAL SPACE + { 0x205F, 0x20, 0x205F, 0x205F }, // MEDIUM MATHEMATICAL SPACE { 0x2060, 0x0, 0x2060, 0x2060 }, // WORD JOINER { 0x2061, 0x0, 0x2061, 0x2061 }, // FUNCTION APPLICATION { 0x2062, 0x0, 0x2062, 0x2062 }, // INVISIBLE TIMES @@ -10683,7 +10683,7 @@ { 0x2FF9, 0x8, 0x2FF9, 0x2FF9 }, // IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM UPPER RIGHT { 0x2FFA, 0x8, 0x2FFA, 0x2FFA }, // IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM LOWER LEFT { 0x2FFB, 0x8, 0x2FFB, 0x2FFB }, // IDEOGRAPHIC DESCRIPTION CHARACTER OVERLAID - { 0x3000, 0x2A, 0x3000, 0x3000 }, // IDEOGRAPHIC SPACE + { 0x3000, 0x20, 0x3000, 0x3000 }, // IDEOGRAPHIC SPACE { 0x3001, 0x18, 0x3001, 0x3001 }, // IDEOGRAPHIC COMMA { 0x3002, 0x18, 0x3002, 0x3002 }, // IDEOGRAPHIC FULL STOP { 0x3003, 0x18, 0x3003, 0x3003 }, // DITTO MARK -- 2.40.0