Reducing the cost of autodecoding

Sat Oct 15 09:42:24 PDT 2016

On Saturday, 15 October 2016 at 00:50:08 UTC, Stefan Koch wrote:
> On Friday, 14 October 2016 at 20:47:39 UTC, Stefan Koch wrote:
>> On Thursday, 13 October 2016 at 21:49:22 UTC, safety0ff wrote:
>>>> Bad benchmark! Bad! -- Andrei
>>>
>>> Also, I suspect a benchmark with a larger loop body might not 
>>> benefit as significantly from branch hints as this one.
>>
>> I disagree in longer loops code compactness is as important as 
>> in small ones.
>>
>> This is about the smallest inline version of decode I could 
>> come up with :
>>
>> __gshared static immutable ubyte[] charWidthTab = [
>>             2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
>>             2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
>>             3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
>>             4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1
>> ];
>>
>> dchar myFront(ref char[] str) pure nothrow
>> {
>>     dchar c = cast(dchar) str[0];
>>     if ((c & 128))
>>     {
>>         if (c & 64)
>>         	final switch(charWidthTab[c - 192])
>>         {
>>             case 2 :
>>                 c |= ((str[1] & 0x80) >> 5);
>>             break;
>>             case 3 :
>>                c |= ((str[1] & 0x80) >> 4);
>>                c |= ((str[2] & 0x80) >> 10);
>>             break;
>>             case 4 :
>>                c |= ((str[1] & 0x80) >> 3);
>>                c |= ((str[2] & 0x80) >> 9);
>>                c |= ((str[3] & 0x80) >> 15);
>>             break;
>>             case 5,6,1 :
>>               goto Linvalid;
>>         }
>>         else
>>         Linvalid :
>>         	c = dchar.init;
>>
>>     }
>> 	return c;
>> }
>
> Disregard all that code.
> It is horribly wrong!
>
> This is more correct: (Though for some reason it does not pass 
> the unittests)
>
> dchar myFront(ref char[] str) pure
> {
>     dchar c = cast(dchar) str.ptr[0];
>     if (c & 128)
>     {
>         if (c & 64)
>         {
>             auto l = charWidthTab.ptr[c - 192];
>             if (str.length < l)
>                 goto Linvalid;
>
>             final switch (l)
>             {
>             case 2:
>                 c = ((c & ~(64 | 128)) << 6);
>                 c |= (str.ptr[1] & ~0x80);
>                 break;
>             case 3:
>                 c = ((c & ~(32 | 64 | 128)) << 12);
>                 c |= ((str.ptr[1] & ~0x80) << 6);
>                 c |= ((str.ptr[2] & ~0x80));
>                 break;
>             case 4:
>                 c = ((c & ~(16 | 32 | 64 | 128)) << 18);
>                 c |= ((str.ptr[1] & ~0x80) << 12);
>                 c |= ((str.ptr[2] & ~0x80) << 6);
>                 c |= ((str.ptr[3] & ~0x80));
>                 break;
>             case 5, 6, 1:
>                 goto Linvalid;
>             }
>         }
>         else
>     Linvalid : throw new Exception("yadayada");
>
>     }
>     return c;
> }

Looks very verbose to me. I found a very clever UTF-8 conversion 
function in C in the BSD codebase; maybe it can be used here. 
Sorry that I cannot participate in the testing, as I don't have a 
proper compilation environment here at home. Here is the routine I 
use at work (it's in C); I'm putting it here for inspiration.

/*
 * Encode the wide character wc as a UTF-8 byte sequence in r and return
 * the number of bytes stored, which is utf8len(wc) (1 to 4).
 *
 * Bytes r[0] .. r[u8l-1] are written with no bounds check and no
 * terminator is appended.
 *
 * The switch is entered at the case matching the sequence length and
 * deliberately falls through to case 1. Each of cases 4, 3 and 2 stores
 * the low 6 bits of wc as a continuation byte (0x80 | bits), shifts those
 * bits out, and ORs a marker bit into wc. The later ">>= 6" steps move
 * the earlier markers down, so by the time case 1 runs the markers have
 * combined into the lead-byte prefix: 0xc0 for 2 bytes, 0xe0 for 3 bytes
 * and 0xf0 for 4 bytes.
 *
 * NOTE(review): the 4-byte path presumably requires a wchar_t wider than
 * 16 bits -- confirm for the target platform.
 */
DEFINE_INLINE uint_t xctomb(char *r, wchar_t wc)
{
uint_t u8l = utf8len(wc);

   switch(u8l) {
     /* Note: code falls through cases! */
     /* 0x10000 becomes 0x10 in the lead byte after two more 6-bit shifts. */
     case 4: r[3] = 0x80 | (wc & 0x3f); wc >>= 6; wc |= 0x10000;
     /* 0x800 becomes 0x20 in the lead byte after one more 6-bit shift. */
     case 3: r[2] = 0x80 | (wc & 0x3f); wc >>= 6; wc |= 0x800;
     /* 0xc0 is the base prefix shared by every multi-byte lead byte. */
     case 2: r[1] = 0x80 | (wc & 0x3f); wc >>= 6; wc |= 0xc0;
     /* Lead byte, or the whole character when the length is 1. */
     case 1: r[0] = wc;
   }
   return u8l;
}

with utf8len being:

/*
 * Return the length in bytes of the UTF-8 sequence used for wc:
 *   wc <  0x80     -> 1
 *   wc <  0x800    -> 2
 *   wc <  0x10000  -> 3
 *   otherwise      -> 4
 *
 * There is no upper range check: every value of 0x10000 and above
 * yields 4.
 */
DEFINE_INLINE uint_t utf8len(wchar_t wc)
{
   /* Test the thresholds from the widest encoding down. */
   if(wc >= 0x10000)
     return 4;
   if(wc >= 0x800)
     return 3;
   if(wc >= 0x80)
     return 2;
   return 1;
}

The code generated on SPARC with gcc 3.4.6 was really good. On 
x86_64 with gcc 5.1 it was also not bad. I have not tried a lot of 
alternatives, as UTF-8 coding is not a bottleneck in our project. 
There's also no check for lengths 5 and 6, as they are not possible 
on our system, but here it would have to be added. (The 
DEFINE_INLINE macro is either extern inline or inline depending 
on some macro magic that is not of importance here.)