Strange counter-performance in an alternative `decimalLength9` function

Basile B. b2.temp at gmx.com
Thu Feb 27 08:52:09 UTC 2020


On Thursday, 27 February 2020 at 04:44:56 UTC, Basile B. wrote:
> On Thursday, 27 February 2020 at 03:58:15 UTC, Bruce Carneal 
> wrote:
>>>
>>> Maybe you talked about another implementation of 
>>> decimalLength9 ?
>>
>> Yes.  It's one I wrote after I saw your post. Psuedo-code here:
>>
>>   auto d9_branchless(uint v) { return 1 + (v >= 10) + (v >= 
>> 100) ... }
>>
>> Using ldc to target an x86 with the above yields a series of 
>> cmpl, seta instruction pairs in the function body followed by 
>> a summing and a return.  No branching.
>>
>> Let me know if the above is unclear or insufficient.
>
> No thanks, it's crystal clear now.

although I don't see the performance gain. Now for me an hybrid 
version based on a LUT and  on the branchless one is the fatest 
(decimalLength9_5), although still slowest then the original.

updated program, incl your branchless version (decimalLength9_4):

---
#!ldmd -boundscheck=off -release -inline -O -mcpu=native 
-mattr=+sse2,+sse3,+sse4.1,+sse4.2,+fast-lzcnt,+avx,+avx2,+cmov,+bmi,+bmi2
import core.memory;
import core.bitop;
import std.stdio;
import std.range;
import std.algorithm;
import std.getopt;
import std.random;

ubyte decimalLength9_0(const uint v)
{
     if (v >= 100000000) { return 9; }
     if (v >= 10000000) { return 8; }
     if (v >= 1000000) { return 7; }
     if (v >= 100000) { return 6; }
     if (v >= 10000) { return 5; }
     if (v >= 1000) { return 4; }
     if (v >= 100) { return 3; }
     if (v >= 10) { return 2; }
     return 1;
}

ubyte decimalLength9_1(const uint v) pure nothrow
{
     if (v == 0) // BSR and LZCNT UB when input is 0
         return 1;
     const ubyte lzc = cast(ubyte) bsr(v);
     ubyte result;
     switch (lzc)
     {
         case 0 :
         case 1 :
         case 2 : result =  1; break;
         case 3 : result =  v >= 10 ? 2 : 1; break;
         case 4 :
         case 5 : result =  2; break;
         case 6 : result =  v >= 100 ? 3 : 2; break;
         case 7 :
         case 8 : result =  3; break;
         case 9 : result =  v >= 1000 ? 4 : 3; break;
         case 10:
         case 11:
         case 12: result =  4; break;
         case 13: result =  v >= 10000 ? 5 : 4; break;
         case 14:
         case 15: result =  5; break;
         case 16: result =  v >= 100000 ? 6 : 5; break;
         case 17:
         case 18: result =  6; break;
         case 19: result =  v >= 1000000 ? 7 : 6; break;
         case 20:
         case 21:
         case 22: result =  7; break;
         case 23: result =  v >= 10000000 ? 8 : 7; break;
         case 24:
         case 25: result =  8; break;
         case 26: result =  v >= 100000000 ? 9 : 8; break;
         case 27:
         case 28:
         case 29:
         case 30:
         case 31: result =  9; break;
         default: assert(false);
     }
     return result;
}

private ubyte decimalLength9_2(const uint v) pure nothrow
{
     if (v == 0) // BSR and LZCNT UB when input is 0
         return 1;
     const ubyte lzc = cast(ubyte) bsr(v);
     static immutable pure nothrow ubyte function(uint)[32] tbl =
     [
     0 : (uint a) => ubyte(1),
     1 : (uint a) => ubyte(1),
     2 : (uint a) => ubyte(1),
     3 : (uint a) => a >= 10  ? ubyte(2) : ubyte(1),
     4 : (uint a) => ubyte(2),
     5 : (uint a) => ubyte(2),
     6 : (uint a) => a >= 100  ? ubyte(3) : ubyte(2),
     7 : (uint a) => ubyte(3),
     8 : (uint a) => ubyte(3),
     9 : (uint a) => a >= 1000  ? ubyte(4) : ubyte(3),
     10: (uint a) => ubyte(4),
     11: (uint a) => ubyte(4),
     12: (uint a) => ubyte(4),
     13: (uint a) => a >= 10000  ? ubyte(5) : ubyte(4),
     14: (uint a) => ubyte(5),
     15: (uint a) => ubyte(5),
     16: (uint a) => a >= 100000  ? ubyte(6) : ubyte(5),
     17: (uint a) => ubyte(6),
     18: (uint a) => ubyte(6),
     19: (uint a) => a >= 1000000  ? ubyte(7) : ubyte(6),
     20: (uint a) => ubyte(7),
     21: (uint a) => ubyte(7),
     22: (uint a) => ubyte(7),
     23: (uint a) => a >= 10000000  ? ubyte(8) : ubyte(7),
     24: (uint a) => ubyte(8),
     25: (uint a) => ubyte(8),
     26: (uint a) => a >= 100000000  ? ubyte(9) : ubyte(8),
     27: (uint a) => ubyte(9),
     28: (uint a) => ubyte(9),
     29: (uint a) => ubyte(9),
     30: (uint a) => ubyte(9),
     31: (uint a) => ubyte(9),
     ];
     return tbl[lzc](v);
}

ubyte decimalLength9_3(const uint v) pure nothrow
{
     if (v == 0) // BSR and LZCNT UB when input is 0
         return 1;
     ubyte result;
     enum ubyte doSwitch = ubyte(0);
     const ubyte lzc = cast(ubyte) bsr(v);
     const ubyte[32] decimalLength9LUT =
     [
     0 : ubyte(1),
     1 : ubyte(1),
     2 : ubyte(1),
     3 : doSwitch,
     4 : ubyte(2),
     5 : ubyte(2),
     6 : doSwitch,
     7 : ubyte(3),
     8 : ubyte(3),
     9 : doSwitch,
     10: ubyte(4),
     11: ubyte(4),
     12: ubyte(4),
     13: doSwitch,
     14: ubyte(5),
     15: ubyte(5),
     16: doSwitch,
     17: ubyte(6),
     18: ubyte(6),
     19: doSwitch,
     20: ubyte(7),
     21: ubyte(7),
     22: ubyte(7),
     23: doSwitch,
     24: ubyte(8),
     25: ubyte(8),
     26: doSwitch,
     27: ubyte(9),
     28: ubyte(9),
     29: ubyte(9),
     30: ubyte(9),
     31: ubyte(9),
     ];
     ubyte resultOrSelector = decimalLength9LUT[lzc];
     if (resultOrSelector != doSwitch)
         result = resultOrSelector;
     else switch (lzc)
     {
         case 3 : result =  v >= 10 ? 2 : 1; break;
         case 6 : result =  v >= 100 ? 3 : 2; break;
         case 9 : result =  v >= 1000 ? 4 : 3; break;
         case 13: result =  v >= 10000 ? 5 : 4; break;
         case 16: result =  v >= 100000 ? 6 : 5; break;
         case 19: result =  v >= 1000000 ? 7 : 6; break;
         case 23: result =  v >= 10000000 ? 8 : 7; break;
         case 26: result =  v >= 100000000 ? 9 : 8; break;
         default: assert(false);
     }
     return result;
}

ubyte decimalLength9_4(const uint v) pure nothrow
{
     return 1 +  (v >= 10) +
                 (v >= 100) +
                 (v >= 1000) +
                 (v >= 10000) +
                 (v >= 100000) +
                 (v >= 1000000) +
                 (v >= 10000000) +
                 (v >= 100000000) ;
}

ubyte decimalLength9_5(const uint v) pure nothrow
{
     if (v == 0) // BSR and LZCNT UB when input is 0
         return 1;
     ubyte result;
     enum ubyte doBranchlessWay = ubyte(0);
     const ubyte lzc = cast(ubyte) bsr(v);
     const ubyte[32] decimalLength9LUT =
     [
     0 : ubyte(1),
     1 : ubyte(1),
     2 : ubyte(1),
     3 : doBranchlessWay,
     4 : ubyte(2),
     5 : ubyte(2),
     6 : doBranchlessWay,
     7 : ubyte(3),
     8 : ubyte(3),
     9 : doBranchlessWay,
     10: ubyte(4),
     11: ubyte(4),
     12: ubyte(4),
     13: doBranchlessWay,
     14: ubyte(5),
     15: ubyte(5),
     16: doBranchlessWay,
     17: ubyte(6),
     18: ubyte(6),
     19: doBranchlessWay,
     20: ubyte(7),
     21: ubyte(7),
     22: ubyte(7),
     23: doBranchlessWay,
     24: ubyte(8),
     25: ubyte(8),
     26: doBranchlessWay,
     27: ubyte(9),
     28: ubyte(9),
     29: ubyte(9),
     30: ubyte(9),
     31: ubyte(9),
     ];
     ubyte resultOrSelector = decimalLength9LUT[lzc];
     if (resultOrSelector != doBranchlessWay)
         result = resultOrSelector;
     else
     {
     result = 1 + (v >= 10) +
                  (v >= 100) +
                  (v >= 1000) +
                  (v >= 10000) +
                  (v >= 100000) +
                  (v >= 1000000) +
                  (v >= 10000000) +
                  (v >= 100000000) ;
     }
     return result;
}

void main(string[] args)
{
     uint    sum;
     ulong   count;
     uint    seed;
     ubyte   func;

     GC.disable;
     getopt(args, config.passThrough, "c|count", &count, "f|func", 
&func, "s|seed", &seed);
     static const funcs = [&decimalLength9_0, &decimalLength9_1, 
&decimalLength9_2, &decimalLength9_3, &decimalLength9_4, 
&decimalLength9_5];

     rndGen.seed(seed);
     foreach (const ulong i; 0 .. count)
     {
         sum += funcs[func](rndGen.front());
         rndGen.popFront();
     }
     writeln(sum);
}
---

Could you share your benchmarking method maybe ?


More information about the Digitalmars-d-learn mailing list