std.parallelism curious results

Sativa via Digitalmars-d-learn digitalmars-d-learn at puremagic.com
Sun Oct 5 14:40:52 PDT 2014


On Sunday, 5 October 2014 at 21:25:39 UTC, Ali Çehreli wrote:
import std.stdio, std.cstream, std.parallelism, std.datetime,
       std.range, core.atomic;

void main()
{
    StopWatch sw;
    shared ulong sum1 = 0;
    ulong sum2 = 0, sum3 = 0, time1, time2, time3;

    enum numThreads = 4; // If numThreads is a variable then it significantly slows down the process
    ulong iter = 1000000L;
    // Force iter to be a multiple of the number of threads so we can partition uniformly
    iter = numThreads * cast(ulong)(iter / numThreads);

    auto thds = parallel(iota(0, cast(uint)iter, cast(uint)(iter / numThreads)));

    sw.reset(); sw.start();
    foreach (i; thds)
    {
        ulong s = 0;
        for (ulong k = 0; k < iter / numThreads; k++) { s += k; }
        s += i * iter / numThreads;
        atomicOp!"+="(sum1, s);
    }
    sw.stop(); time1 = sw.peek().usecs;

    sw.reset(); sw.start();
    for (ulong i = 0; i < iter; ++i) { sum2 += i; }
    sw.stop(); time2 = sw.peek().usecs;

    writefln("parallel sum : %s, elapsed %s us", sum1, time1);
    writefln("single thread sum : %s, elapsed %s us", sum2, time2);
    if (time1 > 0) writefln("Efficiency : %s%%", 100 * time2 / time1);
    din.getc();
}

Playing around with the code above, it seems that whether numThreads is an enum or a run-time variable significantly affects the execution time: efficiency goes from < 100% when it is a variable to > 100% when it is an enum.
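
To be clear about what I mean by the slower variant: the only change I make is the declaration of numThreads, everything else stays the same. Roughly:

// Slower variant: numThreads as a run-time int instead of an enum
int numThreads = 4;                                  // was: enum numThreads = 4;
ulong iter = 1000000L;
iter = numThreads * cast(ulong)(iter / numThreads);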

Results on a 4-core laptop with release builds (numThreads as an enum):

parallel sum : 499999500000, elapsed 2469 us
single thread sum : 499999500000, elapsed 8054 us
Efficiency : 326%


When numThreads is an int variable:

parallel sum : 499999500000, elapsed 21762 us
single thread sum : 499999500000, elapsed 8033 us
Efficiency : 36%
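
One guess, not verified: with an enum, the divisor in the inner-loop condition (k < iter/numThreads) is a compile-time constant, so the compiler can strength-reduce or hoist the division, while with a run-time int it may end up doing a real division on every iteration. If that is the cause, hoisting the chunk size out of the loop by hand should make both versions behave the same, something like:

immutable chunk = iter / numThreads;   // hoist the division out of the inner loop
foreach (i; thds)
{
    ulong s = 0;
    for (ulong k = 0; k < chunk; k++) { s += k; }
    s += i * chunk;
    atomicOp!"+="(sum1, s);
}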

