Strange counter-performance in an alternative `decimalLength9` function

Basile B. b2.temp at gmx.com
Wed Feb 26 13:50:11 UTC 2020


On Wednesday, 26 February 2020 at 00:50:35 UTC, Basile B. wrote:
> How is that possible?

It turns out that there was a problem with the benchmarking method. 
When the iteration count comes from a command-line argument, LLVM's 
optimization passes can no longer constant-fold the literal inputs 
away.
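
For illustration, here is a minimal sketch of the pitfall (the "work" 
function is hypothetical, not part of the benchmark below): when the 
argument is a literal, LLVM may evaluate the whole call at compile 
time, so the timing measures nothing.

---
import std.conv : to;
import std.stdio : writeln;

uint work(uint n)
{
    uint sum;
    foreach (i; 0 .. n)
        sum += i % 10;
    return sum;
}

void main(string[] args)
{
    // Literal argument: the optimizer can inline the call and may fold
    // the loop into a constant, so nothing is measured at run time.
    writeln(work(100_000_000));

    // Runtime argument: the value is opaque to the constant folder, so
    // the loop actually executes.
    writeln(work(args[1].to!uint));
}
---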

It appears that none of the alternatives based on the "most 
significant bit trick" are faster (at least when covering a 
decent range of numbers):

---
#!ldmd -boundscheck=off -release -inline -O -mcpu=native -mattr=+sse2,+sse3,+sse4.1,+sse4.2,+fast-lzcnt,+avx,+avx2,+cmov,+bmi,+bmi2
import core.memory;
import core.bitop;
import std.stdio;
import std.range;
import std.algorithm;
import std.getopt;

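// Baseline: a plain chain of comparisons against powers of 10.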
ubyte decimalLength9_0(const uint v)
{
     if (v >= 100000000) { return 9; }
     if (v >= 10000000) { return 8; }
     if (v >= 1000000) { return 7; }
     if (v >= 100000) { return 6; }
     if (v >= 10000) { return 5; }
     if (v >= 1000) { return 4; }
     if (v >= 100) { return 3; }
     if (v >= 10) { return 2; }
     return 1;
}

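// The "most significant bit" trick: bsr(v) yields the index b of the
// highest set bit, i.e. 2^b <= v < 2^(b+1). Since log10(2) ~ 0.30103,
// a given b pins the decimal length down to one of two adjacent
// values, and at most one comparison against a power of 10 settles
// it. For example, b == 13 covers 8192 .. 16383, so the length is 5
// when v >= 10000 and 4 otherwise.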
ubyte decimalLength9_1(const uint v) pure nothrow
{
     if (v == 0) // bsr is undefined when the input is 0
         return 1;
     const ubyte lzc = cast(ubyte) bsr(v); // MSB index, not a true leading-zero count
     ubyte result;
     switch (lzc)
     {
         case 0 :
         case 1 :
         case 2 : result =  1; break;
         case 3 : result =  v >= 10 ? 2 : 1; break;
         case 4 :
         case 5 : result =  2; break;
         case 6 : result =  v >= 100 ? 3 : 2; break;
         case 7 :
         case 8 : result =  3; break;
         case 9 : result =  v >= 1000 ? 4 : 3; break;
         case 10:
         case 11:
         case 12: result =  4; break;
         case 13: result =  v >= 10000 ? 5 : 4; break;
         case 14:
         case 15: result =  5; break;
         case 16: result =  v >= 100000 ? 6 : 5; break;
         case 17:
         case 18: result =  6; break;
         case 19: result =  v >= 1000000 ? 7 : 6; break;
         case 20:
         case 21:
         case 22: result =  7; break;
         case 23: result =  v >= 10000000 ? 8 : 7; break;
         case 24:
         case 25: result =  8; break;
         case 26: result =  v >= 100000000 ? 9 : 8; break;
         case 27:
         case 28:
         case 29:
         case 30:
         case 31: result =  9; break;

         default: assert(false);
     }
     return result;
}

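// Same trick, but the switch is replaced by a table of function
// pointers indexed by the bsr result; every lookup pays for an
// indirect call, which the optimizer is unlikely to inline away.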
private ubyte decimalLength9_2(const uint v) pure nothrow
{
     if (v == 0) // bsr is undefined when the input is 0
         return 1;
     const ubyte lzc = cast(ubyte) bsr(v);
     static immutable pure nothrow ubyte function(uint)[32] tbl =
     [
     0 : (uint a) => ubyte(1),
     1 : (uint a) => ubyte(1),
     2 : (uint a) => ubyte(1),
     3 : (uint a) => a >= 10  ? ubyte(2) : ubyte(1),
     4 : (uint a) => ubyte(2),
     5 : (uint a) => ubyte(2),
     6 : (uint a) => a >= 100  ? ubyte(3) : ubyte(2),
     7 : (uint a) => ubyte(3),
     8 : (uint a) => ubyte(3),
     9 : (uint a) => a >= 1000  ? ubyte(4) : ubyte(3),
     10: (uint a) => ubyte(4),
     11: (uint a) => ubyte(4),
     12: (uint a) => ubyte(4),
     13: (uint a) => a >= 10000  ? ubyte(5) : ubyte(4),
     14: (uint a) => ubyte(5),
     15: (uint a) => ubyte(5),
     16: (uint a) => a >= 100000  ? ubyte(6) : ubyte(5),
     17: (uint a) => ubyte(6),
     18: (uint a) => ubyte(6),
     19: (uint a) => a >= 1000000  ? ubyte(7) : ubyte(6),
     20: (uint a) => ubyte(7),
     21: (uint a) => ubyte(7),
     22: (uint a) => ubyte(7),
     23: (uint a) => a >= 10000000  ? ubyte(8) : ubyte(7),
     24: (uint a) => ubyte(8),
     25: (uint a) => ubyte(8),
     26: (uint a) => a >= 100000000  ? ubyte(9) : ubyte(8),
     27: (uint a) => ubyte(9),
     28: (uint a) => ubyte(9),
     29: (uint a) => ubyte(9),
     30: (uint a) => ubyte(9),
     31: (uint a) => ubyte(9),
     ];
     return tbl[lzc](v);
}

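// Same trick again, but data-driven: LUT values below 10 are final
// digit counts, while values of 10 and above merely flag the eight
// ambiguous bsr indices that still need one comparison (the exact
// selector value is never used, which is why 12 appears twice).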
ubyte decimalLength9_3(const uint v) pure nothrow
{
     if (v == 0) // bsr is undefined when the input is 0
         return 1;
     ubyte result;
     const ubyte lzc = cast(ubyte) bsr(v);
     const ubyte[32] decimalLength9LUT =
     [
     0 : ubyte(1),
     1 : ubyte(1),
     2 : ubyte(1),
     3 : ubyte(10),
     4 : ubyte(2),
     5 : ubyte(2),
     6 : ubyte(11),
     7 : ubyte(3),
     8 : ubyte(3),
     9 : ubyte(12),
     10: ubyte(4),
     11: ubyte(4),
     12: ubyte(4),
     13: ubyte(12),
     14: ubyte(5),
     15: ubyte(5),
     16: ubyte(13),
     17: ubyte(6),
     18: ubyte(6),
     19: ubyte(14),
     20: ubyte(7),
     21: ubyte(7),
     22: ubyte(7),
     23: ubyte(15),
     24: ubyte(8),
     25: ubyte(8),
     26: ubyte(16),
     27: ubyte(9),
     28: ubyte(9),
     29: ubyte(9),
     30: ubyte(9),
     31: ubyte(9),
     ];
     ubyte resultOrSelector = decimalLength9LUT[lzc];
     if (resultOrSelector < 10)
         result = resultOrSelector;
     else switch (lzc)
     {
         case 3 : result =  v >= 10 ? 2 : 1; break;
         case 6 : result =  v >= 100 ? 3 : 2; break;
         case 9 : result =  v >= 1000 ? 4 : 3; break;
         case 13: result =  v >= 10000 ? 5 : 4; break;
         case 16: result =  v >= 100000 ? 6 : 5; break;
         case 19: result =  v >= 1000000 ? 7 : 6; break;
         case 23: result =  v >= 10000000 ? 8 : 7; break;
         case 26: result =  v >= 100000000 ? 9 : 8; break;
         default: assert(false);
     }
     return result;
}

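// Benchmark harness: both the variant index and the iteration count
// come from the command line, so the optimizer can neither
// constant-fold the inputs nor drop the loop.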
void main(string[] args)
{
     uint    sum;
     uint    count;
     ubyte   func;

     GC.disable;
     getopt(args, config.passThrough, "c|count", &count, "f|func", &func);
     static const funcs = [&decimalLength9_0, &decimalLength9_1,
                           &decimalLength9_2, &decimalLength9_3];

     foreach (i; 0 .. count)
         sum += funcs[func](i);
     writeln(sum);
}
---
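
As a sanity check (a minimal sketch, not part of the timed program 
above), the variants can be cross-checked against the plain comparison 
chain over the benchmarked range:

---
// Paste into the module above and compile with -unittest: asserts
// that every variant agrees with decimalLength9_0.
unittest
{
    foreach (uint i; 0 .. 100_000_000)
    {
        assert(decimalLength9_1(i) == decimalLength9_0(i));
        assert(decimalLength9_2(i) == decimalLength9_0(i));
        assert(decimalLength9_3(i) == decimalLength9_0(i));
    }
}
---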

Results:

---
[mr_x at pc1 temp]$ time ./declen --count=100000000 --func=0
788888890

real    0m0,207s
user    0m0,203s
sys     0m0,003s
[mr_x at pc1 temp]$ time ./declen --count=100000000 --func=1
788888890

real    0m0,369s
user    0m0,366s
sys     0m0,002s
[mr_x at pc1 temp]$ time ./declen --count=100000000 --func=2
788888890

real    0m0,271s
user    0m0,264s
sys     0m0,003s
[mr_x at pc1 temp]$ time ./declen --count=100000000 --func=3
788888890

real    0m0,253s
user    0m0,250s
sys     0m0,002s
---

That's pathetic ;)
