error detected at """ ch in unicode.C """ Library error?

Charles Hixson via Digitalmars-d-learn digitalmars-d-learn at puremagic.com
Thu Oct 22 18:44:13 PDT 2015



On 10/21/2015 06:21 PM, Charles Hixson via Digitalmars-d-learn wrote:
> To me this looks like a library error, but I'm not sure.  Any suggestions
> import    std.uni;
>
> char    gcCat1    (dchar ch)
> {  if    (ch in unicode.L)        return    'L';        // Letter
>     if    (ch in unicode.M)        return    'M';        // Mask
>     if    (ch in unicode.C)        return    'C';        // Control    
> <<== error here!
>     if    (ch in unicode.N)        return    'N';        // Numeric
>     if    (ch in unicode.P)        return    'P';        // Punctuation
>     if    (ch in unicode.S)        return    'S';        // Symbol
>     if    (ch in unicode.Z)        return    'Z';        // Separator
>
>     return    '?';
> }
>
> $ rdmd --main -unittest test2.d
> /usr/include/dmd/phobos/std/uni.d(6220): Error: slice [0..2] exceeds 
> array bounds [0..1]
> /usr/include/dmd/phobos/std/uni.d(6220):        called from here: 
> comparePropertyName(name[0..2], "In")
> /usr/include/dmd/phobos/std/uni.d(6119):        called from here: 
> findAny("C")
> /usr/include/dmd/phobos/std/uni.d(6122): Error: static assert  "No 
> unicode set by name C was found."
> test2.d(7):        instantiated from here: opDispatch!"C"
> Failed: ["dmd", "-unittest", "-v", "-o-", "test2.d", "-I."]
>
>
> $ dmd
> DMD64 D Compiler v2.068.2
> Copyright (c) 1999-2015 by Digital Mars written by Walter Bright
> Documentation: http://dlang.org/
> Config file: /etc/dmd.conf
> ...
>
Here is a routine for general character category that works except for 
control characters, which appear to be mishandled by the library.  (If 
there's another explanation, I'd like to hear it.)
Note:
The unittests are excessively verbose, but the way to silence them is 
pretty obvious, except in the cases that fail.  This is NOT a 
comprehensive unittest, so there may be errors not currently detected.

Also, if there's a better way to do this, I'd like to hear it.

import   std.uni;

/**   Return a character approximately equivalent to the first character
  *    of the Unicode General Category Classification.
  * Warning:  This method has been specialized for English and related
  * languages.  (That's what category Q is about.  It has also been
  * adapted to consider the characters often used in SSI#s, phone #s,
  * dates, and times as embeddable.)
  * Params:  ch = the code point to classify.
  * Returns: L, Z, C, P, or ?.  Numeric has been combined with Alpha, and
  *    ? is not otherwise classifiable.  Q (embeddable chars) is reserved:
  *    the check that would produce it is currently commented out. */
char  charCat (dchar ch)
{
    // Letters and numbers are deliberately not distinguished.
    if (isAlpha (ch) || isNumber (ch) )   return   'L';
    // Whitespace is tested before control characters, so any character
    // for which both isWhite and isControl hold is reported as 'Z'.
    if (isWhite (ch) )         return   'Z';
    if (isControl (ch) )       return   'C';
    // Not a unicode grouping (disabled):
    // if ("'-+.,@/’‘:/".indexOf (ch) >= 0)         return   'Q';
    if (isPunctuation (ch) )   return   'P';
    return   '?';  // Includes not a character
}

/**   Get the one letter Unicode general category of a code point.
  * Params:  ch = the code point to classify.
  * Returns: L, M, C, N, P, S, or Z, or ? when ch is in none of the sets
  *    tested below. */
char  gcCat1   (dchar ch)
{  if (ch in unicode.L)    return   'L';     // Letter
    if (ch in unicode.M)    return   'M';     // Mark
    // `unicode.C` fails to compile on the toolchain this was written for
    // ("No unicode set by name C was found"), and isControl() alone does
    // not accept format, private use, surrogate or unassigned code
    // points.  Test each subcategory of C explicitly instead.
    if (ch in unicode.Cc)   return   'C';     // Control
    if (ch in unicode.Cf)   return   'C';     // Format
    if (ch in unicode.Co)   return   'C';     // Private_Use
    if (ch in unicode.Cs)   return   'C';     // Surrogate
    if (ch in unicode.Cn)   return   'C';     // Unassigned
    if (ch in unicode.N)    return   'N';     // Numeric
    if (ch in unicode.P)    return   'P';     // Punctuation
    if (ch in unicode.S)    return   'S';     // Symbol
    if (ch in unicode.Z)    return   'Z';     // Separator

    return   '?';
}
/**   Get the two letter general character category.
  * The major class is taken from gcCat1; the code point is then looked up
  * in each subcategory set of that class.
  * Params:  ch = the code point to classify.
  * Returns: the two letter category name (e.g. "Lu", "Nd", "Zs"); the
  *    class letter followed by '?' (e.g. "C?", "Z?") when the class is
  *    known but none of its subcategories tested here contains ch; or
  *    "??" when gcCat1 reports a class not handled here. */
string   gcCat2 (dchar ch)
{  char  kind  =  gcCat1(ch);
    switch (kind)
    {  case  'C':     // C  Other
          if (ch in unicode.Cc)               return   "Cc";  // Control
          if (ch in unicode.Cf)               return   "Cf";  // Format
          if (ch in unicode.Cn)               return   "Cn";  // Unassigned
          if (ch in unicode.Co)               return   "Co";  // Private_Use
          if (ch in unicode.Cs)               return   "Cs";  // Surrogate
          return   "C?";                                      // Unexpected value
       case  'L':     // L  Letter
          if (ch in unicode.Ll)               return   "Ll";  // Lowercase_Letter
          if (ch in unicode.Lm)               return   "Lm";  // Modifier_Letter
          if (ch in unicode.Lo)               return   "Lo";  // Other_Letter
          if (ch in unicode.Lt)               return   "Lt";  // Titlecase_Letter
          if (ch in unicode.Lu)               return   "Lu";  // Uppercase_Letter
          return   "L?";                                      // Unexpected Letter
       case  'M':     // M  Mark
          if (ch in unicode.Mc)               return   "Mc";  // Spacing_Mark
          if (ch in unicode.Me)               return   "Me";  // Enclosing_Mark
          if (ch in unicode.Mn)               return   "Mn";  // Nonspacing_Mark
          return   "M?";                                      // Unexpected Mark
       case  'N':     // N  Number
          if (ch in unicode.Nd)               return   "Nd";  // Decimal_Number
          if (ch in unicode.Nl)               return   "Nl";  // Letter_Number
          if (ch in unicode.No)               return   "No";  // Other_Number
          return   "N?";                                      // Unexpected Number
       case  'P':     // P  Punctuation
          if (ch in unicode.Pc)               return   "Pc";  // Connector_Punctuation
          if (ch in unicode.Pd)               return   "Pd";  // Dash_Punctuation
          if (ch in unicode.Pe)               return   "Pe";  // Close_Punctuation
          if (ch in unicode.Pf)               return   "Pf";  // Final_Punctuation
          if (ch in unicode.Pi)               return   "Pi";  // Initial_Punctuation
          if (ch in unicode.Po)               return   "Po";  // Other_Punctuation
          if (ch in unicode.Ps)               return   "Ps";  // Open_Punctuation
          return   "P?";                                      // Unexpected Punctuation
       case  'S':     // S  Symbol
          if (ch in unicode.Sc)               return   "Sc";  // Currency_Symbol
          if (ch in unicode.Sk)               return   "Sk";  // Modifier_Symbol
          if (ch in unicode.Sm)               return   "Sm";  // Math_Symbol
          if (ch in unicode.So)               return   "So";  // Other_Symbol
          return   "S?";                                      // Unexpected Symbol
       case  'Z':     // Z  Separator
          if (ch in unicode.Zl)               return   "Zl";  // Line_Separator
          if (ch in unicode.Zp)               return   "Zp";  // Paragraph_Separator
          if (ch in unicode.Zs)               return   "Zs";  // Space_Separator
          // Upper case 'Z', matching the other "X?" fallbacks.
          return   "Z?";                                      // Unexpected Separator
       default:
          // Unexpected Kind
          return   "??";
    }
}  // string   gcCat2 (dchar ch)
/* Verbose, non-comprehensive spot check of gcCat2: one or more sample code
 * points per two letter category.  Every result is printed; a mismatch is
 * flagged on its own line instead of being asserted, so a single run shows
 * all failures. */
unittest
{
    import std.stdio : writeln;

    // Print "label == category" and append a FAIL note only when the
    // computed category differs from the expected one.
    void  show (string label, dchar ch, string expected)
    {  string   actual   =  gcCat2 (ch);
        if (actual == expected)
            writeln (label, " == ", actual);
        else
            writeln (label, " == ", actual,
                     " <<== FAIL, should be \"", expected, "\"");
    }

    show ("\\a", '\a', "Cc");
    show ("\\n", '\n', "Cc");
    show ("\\r", '\r', "Cc");
    show ("\\t", '\t', "Cc");
    show ("\\b", '\b', "Cc");
    show ("u00AD (SHY)", '\u00AD', "Cf");
    show ("u0600", '\u0600', "Cf");
    show ("U000E007F", '\U000E007F', "Cf");
    // U+D800 and U+DB7F are surrogate code points, i.e. Cs rather than Co.
    show ("uD800", 0xD800, "Cs");
    show ("uDB7F", 0xDB7F, "Cs");
    show ("a", 'a', "Ll");
    show ("ʰ", 'ʰ', "Lm");
    show ("ª", 'ª', "Lo");
    show ("Dž", 'Dž', "Lt");
    show ("A", 'A', "Lu");
    show (" ः", 'ः', "Mc");
    show ("   ⃤", '⃤', "Me");
    show ("u065e", 0x65e, "Mn");
    show ("۶", '۶', "Nd");
    show ("0", '0', "Nd");
    show ("ᛯ", 'ᛯ', "Nl");
    show ("¼", '¼', "No");
    show ("_", '_', "Pc");
    show ("-", '-', "Pd");
    show (")", ')', "Pe");
    show ("]", ']', "Pe");
    show ("}", '}', "Pe");
    show ("»", '»', "Pf");
    show ("«", '«', "Pi");
    show ("@", '@', "Po");
    show (".", '.', "Po");
    show ("\"", '"', "Po");
    show ("{", '{', "Ps");
    show ("[", '[', "Ps");
    show ("(", '(', "Ps");
    show ("$", '$', "Sc");
    show ("^", '^', "Sk");
    show ("+", '+', "Sm");
    show ("~", '~', "Sm");
    show ("=", '=', "Sm");
    show ("©", '©', "So");
    show ("u2028", 0x2028, "Zl");
    show ("u2029", 0x2029, "Zp");
    show (" ", ' ', "Zs");
}



More information about the Digitalmars-d-learn mailing list