Counting bits in a ulong

H. S. Teoh hsteoh at quickfur.ath.cx
Fri Jun 5 20:03:07 UTC 2020


On Fri, Jun 05, 2020 at 12:45:36PM -0700, H. S. Teoh via Digitalmars-d wrote:
[...]
> You might be running into alignment issues, possibly. For one thing, why
> does LOOKUP_1 have 257 elements instead of 256? That last byte is never
> accessed. Similarly, LOOKUP_2 doesn't need 65537 elements, that last
> element is redundant.
[...]
> Disassembling shows that the code was *not* duplicated 100 times;
> apparently LDC's aggressive optimizer noticed that `count` is reassigned
> every unrolled iteration, so all except the last are no-ops. You need to
> do something with `count` that isn't immediately killed by the next
> unrolled iteration, otherwise your painstakingly copy-n-pasted
> iterations will all be elided by the optimizer. :-P
> 
> OR'ing `count` into ACCUMULATOR after every count is probably what you
> need to do here.
[...]

So I made the above changes, and here are the new measurements:

	Starting tests, which take about 10 seconds each
	Naive:         25272530 iter/sec
	Parallel:      133259870 iter/sec
	Kernighan:     36447830 iter/sec
	Lookup 8-bit:  133837870 iter/sec
	Lookup 16-bit: 283343300 iter/sec

Now the 16-bit lookup makes a lot more sense! :-D  (I also confirmed via
disassembly that the code is indeed unrolled 100 times, and not just
elided.)

Interestingly, the parallel version seems to perform almost just as well
as the 8-bit lookup.


Here's the fixed code (it's a lot shorter :-P):

------------------------------------------------------
import std.stdio;
import std.datetime.systime;

ubyte [257]   LOOKUP_1; // 8-bit lookup table
ubyte [65537] LOOKUP_2; // 16-bit lookup table
const ulong STEPPER = 0x110D0B0705030201;  // val increment step, to minimize cache effects
ulong ACCUMULATOR = 0;  // might keep optimizer from noticing we never use count and tossing it
double TEST_DURATION = 10.0;

// High-resolution time()-like, copied from VS_Common_D:
double now() {
    double tm = Clock.currStdTime();
    return (tm / 10000000.0) - 62135596800.0;
}

ubyte popcount_naive(ulong val) {
    ubyte count;
    foreach (i; 0 .. 64) if (val & (1L << i)) count++;
    return count;
}

ulong microbench_naive() {
    ulong iter_count = 0;
    ulong val = 0;
    uint  count = 0;
    double elapsed;
    double t0 = now();
    while((elapsed = now() - t0) < TEST_DURATION) {
	// unrolling 100x should be enough to amortize overhead of calling now():
	static foreach (_; 0 .. 100) {
	    foreach (i; 0 .. 64) if (val & (1L << i)) count++; val += STEPPER;
	    ACCUMULATOR ^= count;
	}
	iter_count += 100;
    }
    return cast(ulong)(iter_count / elapsed);  // iterations per second
}

ulong microbench_parallel() {
    ulong iter_count = 0;
    ulong val = 0;
    uint  count = 0;
    uint  v1;
    uint  v2;
    double elapsed;
    double t0 = now();
    while((elapsed = now() - t0) < TEST_DURATION) {
	// unrolling 100x should be enough to amortize overhead of calling now():
	static foreach (_; 0 .. 100) {
	    v1 = cast(uint)(val >> 32);
	    v2 = cast(uint)val & 0xFFFF_FFFF;
	    v1 = v1 - ((v1 >> 1) & 0x5555_5555);
	    v2 = v2 - ((v2 >> 1) & 0x5555_5555);
	    v1 = ((v1 >> 2) & 0x3333_3333) + (v1 & 0x3333_3333);
	    v2 = ((v2 >> 2) & 0x3333_3333) + (v2 & 0x3333_3333);
	    v1 = ((v1 >> 4) & 0x0F0F_0F0F) + (v1 & 0x0F0F_0F0F);
	    v2 = ((v2 >> 4) & 0x0F0F_0F0F) + (v2 & 0x0F0F_0F0F);
	    v1 = ((v1 >> 8) & 0x00FF_00FF) + (v1 & 0x00FF_00FF);
	    v2 = ((v2 >> 8) & 0x00FF_00FF) + (v2 & 0x00FF_00FF);
	    v1 = ((v1 >> 16) & 0x0000_FFFF) + (v1 & 0x0000_FFFF);
	    v2 = ((v2 >> 16) & 0x0000_FFFF) + (v2 & 0x0000_FFFF);
	    count += cast(ulong)v1 + cast(ulong)v2;
	    val += STEPPER;
	    ACCUMULATOR ^= count;
	}

	iter_count += 100;
    }
    ACCUMULATOR += count;
    return cast(ulong)(iter_count / elapsed);  // iterations per second
}

ulong microbench_kernighan() {
    ulong iter_count = 0;
    ulong val = 0;
    ulong tval = 0;
    uint  count = 0;
    double elapsed;
    double t0 = now();
    while((elapsed = now() - t0) < TEST_DURATION) {
	// unrolling 100x should be enough to amortize overhead of calling now():
	static foreach (_; 0 .. 100) {
	    for (count = 0; val; count++) val &= val - 1;  val = tval += STEPPER;
	    ACCUMULATOR ^= count;
	}

	iter_count += 100;
    }
    ACCUMULATOR += count;
    return cast(ulong)(iter_count / elapsed);  // iterations per second
}

ulong microbench_lookup_8bit() {
    ulong iter_count = 0;
    ulong val = 0;
    ubyte *av = cast(ubyte*) &val;
    uint  count = 0;
    double elapsed;
    double t0 = now();
    while((elapsed = now() - t0) < TEST_DURATION) {
	// unrolling 100x should be enough to amortize overhead of calling now():
	// writeln("top: iter = ", iter_count, ", val = ", val, ", av = ", av, ", av0 = ", av[0], ", av1 = ", av[1], ", av2 = ", av[2], ", av3 = ", av[3], ", av4 = ", av[4], ", av5 = ", av[5], ", av6 = ", av[6], ", av7 = ", av[7]);
	static foreach (_; 0 .. 100) {
	    count = LOOKUP_1[av[0]] + LOOKUP_1[av[1]] + LOOKUP_1[av[2]] + LOOKUP_1[av[3]] + LOOKUP_1[av[4]] + LOOKUP_1[av[5]] + LOOKUP_1[av[6]] + LOOKUP_1[av[7]];
	    val += STEPPER;
	    ACCUMULATOR ^= count;
	}

	iter_count += 100;
    }
    ACCUMULATOR += count;
    return cast(ulong)(iter_count / elapsed);  // iterations per second
}

ulong microbench_lookup_16bit() {
    ulong iter_count = 0;
    ulong val = 0;
    ushort *av = cast(ushort*) &val;
    uint  count = 0;
    double elapsed;
    double t0 = now();
    while((elapsed = now() - t0) < TEST_DURATION) {
	// unrolling 100x should be enough to amortize overhead of calling now():
	static foreach (_; 0 .. 100) {
	    count = LOOKUP_2[av[0]] + LOOKUP_2[av[1]] + LOOKUP_2[av[2]] + LOOKUP_2[av[3]];
	    val += STEPPER;
	    ACCUMULATOR ^= count;
	}

	iter_count += 100;
    }
    ACCUMULATOR += count;
    return cast(ulong)(iter_count / elapsed);  // iterations per second
}

int main() {
    // initialize the lookup tables:
    foreach (i; 0 ..   256) LOOKUP_1[i] = popcount_naive(i);
    foreach (i; 0 .. 65536) LOOKUP_2[i] = popcount_naive(i);

    writeln("Starting tests, which take about ", TEST_DURATION, " seconds each");
    writeln("Naive:         ", microbench_naive(),        " iter/sec");
    writeln("Parallel:      ", microbench_parallel(),     " iter/sec");
    writeln("Kernighan:     ", microbench_kernighan(),    " iter/sec");
    writeln("Lookup 8-bit:  ", microbench_lookup_8bit(),  " iter/sec");
    writeln("Lookup 16-bit: ", microbench_lookup_16bit(), " iter/sec");

    // trick optimizer into thinking ACCUMULATOR, and thus count, is important:
    return ACCUMULATOR / STEPPER;
}
------------------------------------------------------


T

-- 
In a world without fences, who needs Windows and Gates? -- Christian Surchi


More information about the Digitalmars-d mailing list