A few experiments with partial unrolling
Andrei Alexandrescu
SeeWebsiteForEmail at erdani.org
Sat Dec 25 12:54:44 PST 2010
I was looking at http://d.puremagic.com/issues/show_bug.cgi?id=4725 and
thought I'd experiment a bit with a generic unrolling routine (code at
the end of this). So I wrote a general parameterized unroll function
that accepts a function, a number of times to unroll, and a
random-access range. The idea was to experiment with unroll and see how
it works for sums. I also wrote two baselines - a brute force one and
one that unrolls but is specialized for addition.
The results are interesting. First, partial unrolling does help a lot -
the brute force baseline takes 1.2s on my machine and unrollSum takes
only 0.2s.
Second, there is an issue with inlining: unroll takes 0.37s, almost
twice as much as the hand-unrolled version.
Third, it looks like larger unrolling limits is better - I only got a
plateau at 128!
Andrei
#!/usr/bin/env rdmd
import std.algorithm, std.conv, std.date, std.functional, std.range,
std.stdio, std.string, std.traits;
ElementType!R unroll(alias fun, size_t times, R)(R r) if
(isRandomAccessRange!R && hasLength!R)
{
static assert(times > 1 && !(times & (times - 1)));
static string repeat(string s, string indexPlaceholder, size_t n)
{
string r = "";
foreach (i; 0 .. n)
{
r ~= replace(s, indexPlaceholder, .to!string(i));
}
return r;
}
alias binaryFun!fun f;
typeof(return) result = 0;
immutable adjusted = r.length & ~cast(size_t) (times - 1);
size_t i = 0;
for (; i < adjusted; i += times)
{
ElementType!R[times - 1] temp = void;
mixin(repeat("temp[X] = f(r[i + (X + X)], r[i + (X + X +
1)]);", "X", times / 2));
mixin(repeat("temp[X + times / 2] = f(temp[X + X], temp[X + X +
1]);", "X", times / 2 - 1));
result = f(result, temp[$ - 1]);
}
for (; i != r.length; ++i)
{
result += r[i];
}
return result;
}
ElementType!R unrollSum(size_t times, R)(R r) if (isRandomAccessRange!R
&& hasLength!R)
{
static assert(times > 1 && !(times & (times - 1)));
static string repeat(string s, string indexPlaceholder, size_t n)
{
string r = "";
foreach (i; 0 .. n)
{
r ~= replace(s, indexPlaceholder, .to!string(i));
}
return r;
}
typeof(return) result = 0;
immutable adjusted = r.length & ~cast(size_t) (times - 1);
size_t i = 0;
for (; i < adjusted; i += times)
{
ElementType!R[times - 1] temp = void;
mixin(repeat("temp[X] = r[i + (X + X)] + r[i + (X + X + 1)];",
"X", times / 2));
mixin(repeat("temp[X + times / 2] = temp[X + X] + temp[X + X +
1];", "X", times / 2 - 1));
result += temp[$ - 1];
}
for (; i != r.length; ++i)
{
result += r[i];
}
return result;
}
ElementType!R sum0(R)(R r) if (isInputRange!R)
{
typeof(return) result = 0;
for (; !r.empty; r.popFront()) {
result += r.front;
}
return result;
}
void main(string[] args)
{
enum times = 8;
auto a = array(iota(100_000_001));
double t = getUTCtime();
if (args.length == 1) {
writeln("baseline");
writeln(sum0(a));
} else if (args[1] == "unroll") {
writeln(args[1]);
writeln(unroll!("a + b", times)(a));
} else if (args[1] == "unrollSum") {
writeln(args[1]);
writeln(unrollSum!(times)(a));
} else {
assert(0);
}
writeln((getUTCtime() - t) / ticksPerSecond);
}
More information about the Digitalmars-d
mailing list