Reducing the cost of autodecoding

Patrick Schluter via Digitalmars-d digitalmars-d at puremagic.com
Sun Oct 16 00:59:16 PDT 2016


Here is my version. It's probably not the shortest (100 lines of 
assembly with LDC) but it is correct and has the following properties:
- Performance proportional to the encoding length
- Detects Invalid byte sequences
- Detects Overlong encodings
- Detects Invalid code points

I throw an exception to be comparable to the other routines, but 
Unicode recommends not aborting on encoding errors (to avoid 
denial-of-service attacks).

/**
 * Decodes the UTF-8 sequence at the start of `str` and returns its code
 * point. The slice itself is not modified (nothing is popped).
 *
 * The work done is proportional to the length of the encoding: each
 * further byte is only examined once the shorter forms have been ruled out.
 *
 * Rejected input:
 *  - an empty slice;
 *  - a continuation byte (0x80..0xBF) or 0xC0, 0xC1, 0xF5..0xFF as first byte;
 *  - a truncated sequence or a non-continuation trailing byte;
 *  - overlong encodings;
 *  - UTF-16 surrogates (U+D800..U+DFFF) and values above U+10FFFF.
 *
 * Example: the bytes C2 A2 decode to U+00A2.
 *
 * Params:
 *   str = UTF-8 code units; only the first 1 to 4 are read.
 * Returns: the decoded code point.
 * Throws: Exception if the slice does not start with a well-formed sequence.
 */
dchar myFront2(ref char[] str)
{
   // .ptr[] bypasses bounds checking, so the empty case must be excluded
   // before the first byte is touched.
   if(str.length == 0)
     throw new Exception("yadayada");

   // Work in uint: the raw 4-byte combination can exceed dchar's range
   // before it has been validated.
   const uint c0 = str.ptr[0];
   if(c0 < 0x80)
     return cast(dchar) c0;

   // 0x80..0xBF are continuation bytes and 0xC0/0xC1 could only start an
   // overlong 2-byte form, so the smallest legal lead byte is 0xC2.
   if(c0 >= 0xC2 && str.length > 1) {
     const uint c1 = str.ptr[1];
     if((c1 & 0xC0) == 0x80) {
       if(c0 < 0xE0) {
         // Lead >= 0xC2 guarantees a result >= 0x80: no overlong check needed.
         return cast(dchar)(((c0 & 0x1F) << 6)|(c1 & 0x3F));
       }
       if(str.length > 2) {
         const uint c2 = str.ptr[2];
         if((c2 & 0xC0) == 0x80) {
           if(c0 < 0xF0) {
             const uint cp = ((c0 & 0x0F) << 12)|((c1 & 0x3F) << 6)|(c2 & 0x3F);
             // Reject overlong forms and the surrogate range.
             if(cp >= 0x800 && (cp < 0xD800 || cp > 0xDFFF))
               return cast(dchar) cp;
           }
           else if(c0 < 0xF5 && str.length > 3) {
             const uint c3 = str.ptr[3];
             if((c3 & 0xC0) == 0x80) {
               // The three payload bits of the lead byte sit above
               // 3 * 6 = 18 bits of continuation payload.
               const uint cp = ((c0 & 0x07) << 18)|((c1 & 0x3F) << 12)
                              |((c2 & 0x3F) << 6)|(c3 & 0x3F);
               // Reject overlong forms and anything past U+10FFFF.
               if(cp >= 0x10000 && cp <= 0x10FFFF)
                 return cast(dchar) cp;
             }
           }
         }
       }
     }
   }

   // Every path that did not return a validated code point ends up here.
   throw new Exception("yadayada");
}




More information about the Digitalmars-d mailing list