Counting bits in a ulong
H. S. Teoh
hsteoh at quickfur.ath.cx
Fri Jun 5 20:03:07 UTC 2020
On Fri, Jun 05, 2020 at 12:45:36PM -0700, H. S. Teoh via Digitalmars-d wrote:
[...]
> You might be running into alignment issues, possibly. For one thing, why
> does LOOKUP_1 have 257 elements instead of 256? That last byte is never
> accessed. Similarly, LOOKUP_2 doesn't need 65537 elements, that last
> element is redundant.
[...]
> Disassembling shows that the code was *not* duplicated 100 times;
> apparently LDC's aggressive optimizer noticed that `count` is reassigned
> every unrolled iteration, so all except the last are no-ops. You need to
> do something with `count` that isn't immediately killed by the next
> unrolled iteration, otherwise your painstakingly copy-n-pasted
> iterations will all be elided by the optimizer. :-P
>
> OR'ing `count` into ACCUMULATOR after every count is probably what you
> need to do here.
[...]
So I made the above changes, and here are the new measurements:
Starting tests, which take about 10 seconds each
Naive: 25272530 iter/sec
Parallel: 133259870 iter/sec
Kernighan: 36447830 iter/sec
Lookup 8-bit: 133837870 iter/sec
Lookup 16-bit: 283343300 iter/sec
Now the 16-bit lookup makes a lot more sense! :-D (I also confirmed via
disassembly that the code is indeed unrolled 100 times, and not just
elided.)
Interestingly, the parallel version seems to perform almost just as well
as the 8-bit lookup.
Here's the fixed code (it's a lot shorter :-P):
------------------------------------------------------
import std.stdio;
import std.datetime.systime;
ubyte [257] LOOKUP_1; // 8-bit lookup table
ubyte [65537] LOOKUP_2; // 16-bit lookup table
const ulong STEPPER = 0x110D0B0705030201; // val increment step, to minimize cache effects
ulong ACCUMULATOR = 0; // might keep optimizer from noticing we never use count and tossing it
double TEST_DURATION = 10.0;
// High-resolution time()-like, copied from VS_Common_D:
double now() {
double tm = Clock.currStdTime();
return (tm / 10000000.0) - 62135596800.0;
}
ubyte popcount_naive(ulong val) {
ubyte count;
foreach (i; 0 .. 64) if (val & (1L << i)) count++;
return count;
}
ulong microbench_naive() {
ulong iter_count = 0;
ulong val = 0;
uint count = 0;
double elapsed;
double t0 = now();
while((elapsed = now() - t0) < TEST_DURATION) {
// unrolling 100x should be enough to amortize overhead of calling now():
static foreach (_; 0 .. 100) {
foreach (i; 0 .. 64) if (val & (1L << i)) count++; val += STEPPER;
ACCUMULATOR ^= count;
}
iter_count += 100;
}
return cast(ulong)(iter_count / elapsed); // iterations per second
}
ulong microbench_parallel() {
ulong iter_count = 0;
ulong val = 0;
uint count = 0;
uint v1;
uint v2;
double elapsed;
double t0 = now();
while((elapsed = now() - t0) < TEST_DURATION) {
// unrolling 100x should be enough to amortize overhead of calling now():
static foreach (_; 0 .. 100) {
v1 = cast(uint)(val >> 32);
v2 = cast(uint)val & 0xFFFF_FFFF;
v1 = v1 - ((v1 >> 1) & 0x5555_5555);
v2 = v2 - ((v2 >> 1) & 0x5555_5555);
v1 = ((v1 >> 2) & 0x3333_3333) + (v1 & 0x3333_3333);
v2 = ((v2 >> 2) & 0x3333_3333) + (v2 & 0x3333_3333);
v1 = ((v1 >> 4) & 0x0F0F_0F0F) + (v1 & 0x0F0F_0F0F);
v2 = ((v2 >> 4) & 0x0F0F_0F0F) + (v2 & 0x0F0F_0F0F);
v1 = ((v1 >> 8) & 0x00FF_00FF) + (v1 & 0x00FF_00FF);
v2 = ((v2 >> 8) & 0x00FF_00FF) + (v2 & 0x00FF_00FF);
v1 = ((v1 >> 16) & 0x0000_FFFF) + (v1 & 0x0000_FFFF);
v2 = ((v2 >> 16) & 0x0000_FFFF) + (v2 & 0x0000_FFFF);
count += cast(ulong)v1 + cast(ulong)v2;
val += STEPPER;
ACCUMULATOR ^= count;
}
iter_count += 100;
}
ACCUMULATOR += count;
return cast(ulong)(iter_count / elapsed); // iterations per second
}
ulong microbench_kernighan() {
ulong iter_count = 0;
ulong val = 0;
ulong tval = 0;
uint count = 0;
double elapsed;
double t0 = now();
while((elapsed = now() - t0) < TEST_DURATION) {
// unrolling 100x should be enough to amortize overhead of calling now():
static foreach (_; 0 .. 100) {
for (count = 0; val; count++) val &= val - 1; val = tval += STEPPER;
ACCUMULATOR ^= count;
}
iter_count += 100;
}
ACCUMULATOR += count;
return cast(ulong)(iter_count / elapsed); // iterations per second
}
ulong microbench_lookup_8bit() {
ulong iter_count = 0;
ulong val = 0;
ubyte *av = cast(ubyte*) &val;
uint count = 0;
double elapsed;
double t0 = now();
while((elapsed = now() - t0) < TEST_DURATION) {
// unrolling 100x should be enough to amortize overhead of calling now():
// writeln("top: iter = ", iter_count, ", val = ", val, ", av = ", av, ", av0 = ", av[0], ", av1 = ", av[1], ", av2 = ", av[2], ", av3 = ", av[3], ", av4 = ", av[4], ", av5 = ", av[5], ", av6 = ", av[6], ", av7 = ", av[7]);
static foreach (_; 0 .. 100) {
count = LOOKUP_1[av[0]] + LOOKUP_1[av[1]] + LOOKUP_1[av[2]] + LOOKUP_1[av[3]] + LOOKUP_1[av[4]] + LOOKUP_1[av[5]] + LOOKUP_1[av[6]] + LOOKUP_1[av[7]];
val += STEPPER;
ACCUMULATOR ^= count;
}
iter_count += 100;
}
ACCUMULATOR += count;
return cast(ulong)(iter_count / elapsed); // iterations per second
}
ulong microbench_lookup_16bit() {
ulong iter_count = 0;
ulong val = 0;
ushort *av = cast(ushort*) &val;
uint count = 0;
double elapsed;
double t0 = now();
while((elapsed = now() - t0) < TEST_DURATION) {
// unrolling 100x should be enough to amortize overhead of calling now():
static foreach (_; 0 .. 100) {
count = LOOKUP_2[av[0]] + LOOKUP_2[av[1]] + LOOKUP_2[av[2]] + LOOKUP_2[av[3]];
val += STEPPER;
ACCUMULATOR ^= count;
}
iter_count += 100;
}
ACCUMULATOR += count;
return cast(ulong)(iter_count / elapsed); // iterations per second
}
int main() {
// initialize the lookup tables:
foreach (i; 0 .. 256) LOOKUP_1[i] = popcount_naive(i);
foreach (i; 0 .. 65536) LOOKUP_2[i] = popcount_naive(i);
writeln("Starting tests, which take about ", TEST_DURATION, " seconds each");
writeln("Naive: ", microbench_naive(), " iter/sec");
writeln("Parallel: ", microbench_parallel(), " iter/sec");
writeln("Kernighan: ", microbench_kernighan(), " iter/sec");
writeln("Lookup 8-bit: ", microbench_lookup_8bit(), " iter/sec");
writeln("Lookup 16-bit: ", microbench_lookup_16bit(), " iter/sec");
// trick optimizer into thinking ACCUMULATOR, and thus count, is important:
return ACCUMULATOR / STEPPER;
}
------------------------------------------------------
T
--
In a world without fences, who needs Windows and Gates? -- Christian Surchi
More information about the Digitalmars-d
mailing list