Strange counter-performance in an alternative `decimalLength9` function
Basile B.
b2.temp at gmx.com
Thu Feb 27 08:52:09 UTC 2020
On Thursday, 27 February 2020 at 04:44:56 UTC, Basile B. wrote:
> On Thursday, 27 February 2020 at 03:58:15 UTC, Bruce Carneal
> wrote:
>>>
>>> Maybe you talked about another implementation of
>>> decimalLength9 ?
>>
>> Yes. It's one I wrote after I saw your post. Psuedo-code here:
>>
>> auto d9_branchless(uint v) { return 1 + (v >= 10) + (v >=
>> 100) ... }
>>
>> Using ldc to target an x86 with the above yields a series of
>> cmpl, seta instruction pairs in the function body followed by
>> a summing and a return. No branching.
>>
>> Let me know if the above is unclear or insufficient.
>
> No thanks, it's crystal clear now.
although I don't see the performance gain. Now for me an hybrid
version based on a LUT and on the branchless one is the fatest
(decimalLength9_5), although still slowest then the original.
updated program, incl your branchless version (decimalLength9_4):
---
#!ldmd -boundscheck=off -release -inline -O -mcpu=native
-mattr=+sse2,+sse3,+sse4.1,+sse4.2,+fast-lzcnt,+avx,+avx2,+cmov,+bmi,+bmi2
import core.memory;
import core.bitop;
import std.stdio;
import std.range;
import std.algorithm;
import std.getopt;
import std.random;
ubyte decimalLength9_0(const uint v)
{
if (v >= 100000000) { return 9; }
if (v >= 10000000) { return 8; }
if (v >= 1000000) { return 7; }
if (v >= 100000) { return 6; }
if (v >= 10000) { return 5; }
if (v >= 1000) { return 4; }
if (v >= 100) { return 3; }
if (v >= 10) { return 2; }
return 1;
}
ubyte decimalLength9_1(const uint v) pure nothrow
{
if (v == 0) // BSR and LZCNT UB when input is 0
return 1;
const ubyte lzc = cast(ubyte) bsr(v);
ubyte result;
switch (lzc)
{
case 0 :
case 1 :
case 2 : result = 1; break;
case 3 : result = v >= 10 ? 2 : 1; break;
case 4 :
case 5 : result = 2; break;
case 6 : result = v >= 100 ? 3 : 2; break;
case 7 :
case 8 : result = 3; break;
case 9 : result = v >= 1000 ? 4 : 3; break;
case 10:
case 11:
case 12: result = 4; break;
case 13: result = v >= 10000 ? 5 : 4; break;
case 14:
case 15: result = 5; break;
case 16: result = v >= 100000 ? 6 : 5; break;
case 17:
case 18: result = 6; break;
case 19: result = v >= 1000000 ? 7 : 6; break;
case 20:
case 21:
case 22: result = 7; break;
case 23: result = v >= 10000000 ? 8 : 7; break;
case 24:
case 25: result = 8; break;
case 26: result = v >= 100000000 ? 9 : 8; break;
case 27:
case 28:
case 29:
case 30:
case 31: result = 9; break;
default: assert(false);
}
return result;
}
private ubyte decimalLength9_2(const uint v) pure nothrow
{
if (v == 0) // BSR and LZCNT UB when input is 0
return 1;
const ubyte lzc = cast(ubyte) bsr(v);
static immutable pure nothrow ubyte function(uint)[32] tbl =
[
0 : (uint a) => ubyte(1),
1 : (uint a) => ubyte(1),
2 : (uint a) => ubyte(1),
3 : (uint a) => a >= 10 ? ubyte(2) : ubyte(1),
4 : (uint a) => ubyte(2),
5 : (uint a) => ubyte(2),
6 : (uint a) => a >= 100 ? ubyte(3) : ubyte(2),
7 : (uint a) => ubyte(3),
8 : (uint a) => ubyte(3),
9 : (uint a) => a >= 1000 ? ubyte(4) : ubyte(3),
10: (uint a) => ubyte(4),
11: (uint a) => ubyte(4),
12: (uint a) => ubyte(4),
13: (uint a) => a >= 10000 ? ubyte(5) : ubyte(4),
14: (uint a) => ubyte(5),
15: (uint a) => ubyte(5),
16: (uint a) => a >= 100000 ? ubyte(6) : ubyte(5),
17: (uint a) => ubyte(6),
18: (uint a) => ubyte(6),
19: (uint a) => a >= 1000000 ? ubyte(7) : ubyte(6),
20: (uint a) => ubyte(7),
21: (uint a) => ubyte(7),
22: (uint a) => ubyte(7),
23: (uint a) => a >= 10000000 ? ubyte(8) : ubyte(7),
24: (uint a) => ubyte(8),
25: (uint a) => ubyte(8),
26: (uint a) => a >= 100000000 ? ubyte(9) : ubyte(8),
27: (uint a) => ubyte(9),
28: (uint a) => ubyte(9),
29: (uint a) => ubyte(9),
30: (uint a) => ubyte(9),
31: (uint a) => ubyte(9),
];
return tbl[lzc](v);
}
ubyte decimalLength9_3(const uint v) pure nothrow
{
if (v == 0) // BSR and LZCNT UB when input is 0
return 1;
ubyte result;
enum ubyte doSwitch = ubyte(0);
const ubyte lzc = cast(ubyte) bsr(v);
const ubyte[32] decimalLength9LUT =
[
0 : ubyte(1),
1 : ubyte(1),
2 : ubyte(1),
3 : doSwitch,
4 : ubyte(2),
5 : ubyte(2),
6 : doSwitch,
7 : ubyte(3),
8 : ubyte(3),
9 : doSwitch,
10: ubyte(4),
11: ubyte(4),
12: ubyte(4),
13: doSwitch,
14: ubyte(5),
15: ubyte(5),
16: doSwitch,
17: ubyte(6),
18: ubyte(6),
19: doSwitch,
20: ubyte(7),
21: ubyte(7),
22: ubyte(7),
23: doSwitch,
24: ubyte(8),
25: ubyte(8),
26: doSwitch,
27: ubyte(9),
28: ubyte(9),
29: ubyte(9),
30: ubyte(9),
31: ubyte(9),
];
ubyte resultOrSelector = decimalLength9LUT[lzc];
if (resultOrSelector != doSwitch)
result = resultOrSelector;
else switch (lzc)
{
case 3 : result = v >= 10 ? 2 : 1; break;
case 6 : result = v >= 100 ? 3 : 2; break;
case 9 : result = v >= 1000 ? 4 : 3; break;
case 13: result = v >= 10000 ? 5 : 4; break;
case 16: result = v >= 100000 ? 6 : 5; break;
case 19: result = v >= 1000000 ? 7 : 6; break;
case 23: result = v >= 10000000 ? 8 : 7; break;
case 26: result = v >= 100000000 ? 9 : 8; break;
default: assert(false);
}
return result;
}
ubyte decimalLength9_4(const uint v) pure nothrow
{
return 1 + (v >= 10) +
(v >= 100) +
(v >= 1000) +
(v >= 10000) +
(v >= 100000) +
(v >= 1000000) +
(v >= 10000000) +
(v >= 100000000) ;
}
ubyte decimalLength9_5(const uint v) pure nothrow
{
if (v == 0) // BSR and LZCNT UB when input is 0
return 1;
ubyte result;
enum ubyte doBranchlessWay = ubyte(0);
const ubyte lzc = cast(ubyte) bsr(v);
const ubyte[32] decimalLength9LUT =
[
0 : ubyte(1),
1 : ubyte(1),
2 : ubyte(1),
3 : doBranchlessWay,
4 : ubyte(2),
5 : ubyte(2),
6 : doBranchlessWay,
7 : ubyte(3),
8 : ubyte(3),
9 : doBranchlessWay,
10: ubyte(4),
11: ubyte(4),
12: ubyte(4),
13: doBranchlessWay,
14: ubyte(5),
15: ubyte(5),
16: doBranchlessWay,
17: ubyte(6),
18: ubyte(6),
19: doBranchlessWay,
20: ubyte(7),
21: ubyte(7),
22: ubyte(7),
23: doBranchlessWay,
24: ubyte(8),
25: ubyte(8),
26: doBranchlessWay,
27: ubyte(9),
28: ubyte(9),
29: ubyte(9),
30: ubyte(9),
31: ubyte(9),
];
ubyte resultOrSelector = decimalLength9LUT[lzc];
if (resultOrSelector != doBranchlessWay)
result = resultOrSelector;
else
{
result = 1 + (v >= 10) +
(v >= 100) +
(v >= 1000) +
(v >= 10000) +
(v >= 100000) +
(v >= 1000000) +
(v >= 10000000) +
(v >= 100000000) ;
}
return result;
}
void main(string[] args)
{
uint sum;
ulong count;
uint seed;
ubyte func;
GC.disable;
getopt(args, config.passThrough, "c|count", &count, "f|func",
&func, "s|seed", &seed);
static const funcs = [&decimalLength9_0, &decimalLength9_1,
&decimalLength9_2, &decimalLength9_3, &decimalLength9_4,
&decimalLength9_5];
rndGen.seed(seed);
foreach (const ulong i; 0 .. count)
{
sum += funcs[func](rndGen.front());
rndGen.popFront();
}
writeln(sum);
}
---
Could you share your benchmarking method maybe ?
More information about the Digitalmars-d-learn
mailing list