A few experiments with partial unrolling

Andrei Alexandrescu SeeWebsiteForEmail at erdani.org
Sat Dec 25 12:54:44 PST 2010


I was looking at http://d.puremagic.com/issues/show_bug.cgi?id=4725 and 
thought I'd experiment a bit with a generic unrolling routine (code at 
the end of this). So I wrote a general parameterized unroll function 
that accepts a function, a number of times to unroll, and a 
random-access range. The idea was to experiment with unroll and see how 
it works for sums. I also wrote two baselines - a brute force one and 
one that unrolls but is specialized for addition.

The results are interesting. First, partial unrolling does help a lot - 
the brute force baseline takes 1.2s on my machine and unrollSum takes 
only 0.2s.

Second, there is an issue with inlining: unroll takes 0.37s, almost 
twice as much as the hand-unrolled version.

Third, it looks like larger unrolling limits is better - I only got a 
plateau at 128!


Andrei

#!/usr/bin/env rdmd
import std.algorithm, std.conv, std.date, std.functional, std.range, 
std.stdio, std.string, std.traits;

ElementType!R unroll(alias fun, size_t times, R)(R r) if 
(isRandomAccessRange!R && hasLength!R)
{
     static assert(times > 1 && !(times & (times - 1)));

     static string repeat(string s, string indexPlaceholder, size_t n)
     {
         string r = "";
         foreach (i; 0 .. n)
         {
             r ~= replace(s, indexPlaceholder, .to!string(i));
         }
         return r;
     }

     alias binaryFun!fun f;
     typeof(return) result = 0;
     immutable adjusted = r.length & ~cast(size_t) (times - 1);
     size_t i = 0;
     for (; i < adjusted; i += times)
     {
         ElementType!R[times - 1] temp = void;
         mixin(repeat("temp[X] = f(r[i + (X + X)], r[i + (X + X + 
1)]);", "X", times / 2));
         mixin(repeat("temp[X + times / 2] = f(temp[X + X], temp[X + X + 
1]);", "X", times / 2 - 1));
         result = f(result, temp[$ - 1]);
     }
     for (; i != r.length; ++i)
     {
         result += r[i];
     }
     return result;
}

ElementType!R unrollSum(size_t times, R)(R r) if (isRandomAccessRange!R 
&& hasLength!R)
{
     static assert(times > 1 && !(times & (times - 1)));

     static string repeat(string s, string indexPlaceholder, size_t n)
     {
         string r = "";
         foreach (i; 0 .. n)
         {
             r ~= replace(s, indexPlaceholder, .to!string(i));
         }
         return r;
     }

     typeof(return) result = 0;
     immutable adjusted = r.length & ~cast(size_t) (times - 1);
     size_t i = 0;
     for (; i < adjusted; i += times)
     {
         ElementType!R[times - 1] temp = void;
         mixin(repeat("temp[X] = r[i + (X + X)] + r[i + (X + X + 1)];", 
"X", times / 2));
         mixin(repeat("temp[X + times / 2] = temp[X + X] + temp[X + X + 
1];", "X", times / 2 - 1));
         result += temp[$ - 1];
     }
     for (; i != r.length; ++i)
     {
         result += r[i];
     }
     return result;
}

ElementType!R sum0(R)(R r) if (isInputRange!R)
{
     typeof(return) result = 0;
     for (; !r.empty; r.popFront()) {
         result += r.front;
     }
     return result;
}

void main(string[] args)
{
     enum times = 8;
     auto a = array(iota(100_000_001));
     double t = getUTCtime();
     if (args.length == 1) {
         writeln("baseline");
         writeln(sum0(a));
     } else if (args[1] == "unroll") {
         writeln(args[1]);
         writeln(unroll!("a + b", times)(a));
     } else if (args[1] == "unrollSum") {
         writeln(args[1]);
         writeln(unrollSum!(times)(a));
     } else {
         assert(0);
     }
     writeln((getUTCtime() - t) / ticksPerSecond);
}


More information about the Digitalmars-d mailing list