Inlining Code Test
Andrej Mitrovic
andrej.mitrovich at gmail.com
Sun Dec 12 16:09:43 PST 2010
My crappy old Pentium 4 has gone totally mad:
Sorting with Array.opIndex: 45080
Sorting with pointers: 45608
4.092e+16 percent faster (??)
Compiled with dmd -profile -O -release -inline -noboundscheck
On 12/13/10, Iain Buclaw <ibuclaw at ubuntu.com> wrote:
> == Quote from Craig Black (craigblack2 at cox.net)'s article
>> The following program illustrates the problems with inlining in the dmd
>> compiler. Perhaps with some more work I can reduce it to a smaller test
>> case. I was playing around with a simple Array template, and noticed that
>> similar C++ code performs much better. This is due, at least in part, to
>> opIndex not being properly inlined by dmd. There are two sort functions,
>> quickSort1 and quickSort2. quickSort1 indexes an Array data structure.
>> quickSort2 indexes raw pointers. quickSort2 is roughly 20% faster on my
>> core i7.
>> import std.stdio;
>> import std.date;
>> import std.random;
>> import std.conv;
>> import std.algorithm;
>> import std.range;
>> struct Array(T)
>> {
>> public:
>> this(int length) { resize(length); }
>> this(T[] a) { append(a); }
>> this(this)
>> {
>> if(!base.array) return;
>> ArrayBase!T old;
>> old = base;
>> base.array = null;
>> reserve(old.length(), old.length());
>> copyData(old);
>> old.array = null;
>> }
>> void clear() { base.clear(); }
>> void resize(int sz)
>> {
>> assert(sz >= 0);
>> if(sz == 0) return clear();
>> if(!base.array) reserve(sz, sz);
>> *base.lengthPtr() = sz;
>> }
>> void reserve(int capacity, int length)
>> {
>> assert(length <= capacity);
>> if(capacity == 0) return clear();
>> ArrayBase!T old;
>> if(base.array)
>> {
>> if(base.capacity() >= capacity) return;
>> old.array = base.array;
>> base.array = null;
>> }
>> base.array = cast(ubyte*)(new ubyte[capacity*T.sizeof+8]);
>> *base.lengthPtr() = length;
>> *base.capacityPtr() = capacity;
>> for(int i = 0; i < capacity; i++) emplace!T(base.data()+i);
>> if(old.array) copyData(old);
>> }
>> int length() const { return base.length(); }
>> int capacity() const { return base.capacity(); }
>> bool empty() const { return base.array == null; }
>> ref T at(int i)
>> {
>> assert(!empty(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i)
>> ~
>> " out of bounds of empty array");
>> assert(i >= 0 && i < length(), "Array of " ~ T.stringof ~ ": index " ~
>> to!string(i) ~ " out of bounds 0-" ~ to!string(length()-1));
>> return base.data()[i];
>> }
>> ref const T at(int i) const
>> {
>> assert(!empty(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i)
>> ~
>> " out of bounds of empty array");
>> assert(i >= 0 && i < length(), "Array of " ~ T.stringof ~ ": index " ~
>> to!string(i) ~ " out of bounds 0-" ~ to!string(length()-1));
>> return base.data()[i];
>> }
>> const ref T opIndex(int i) const { return at(i); }
>> void opIndexAssign(T t, int i) { at(i) = t; }
>> void opIndexAssign(ref T t, int i) { at(i) = t; }
>> void opAssign(ref const Array!T array)
>> {
>> copy(array);
>> }
>> void opAssign(T[] array)
>> {
>> int len = array.length;
>> resize(len);
>> for(int i = 0; i < len; i++) at(i) = array[i];
>> }
>> void copy(ref const Array!T array)
>> {
>> if(array.empty()) return clear();
>> int len = array.length();
>> reserve(len, len);
>> for(int i = 0; i < len; i++) at(i) = array[i];
>> }
>> void opOpAssign(string op, A...)(A a) if(op == "~")
>> {
>> appendComposite(a);
>> }
>> ref T front() { return at(0); }
>> const ref T front() const { return at(0); }
>> ref T back() { return at(length()-1); }
>> const ref T back() const { return at(length()-1); }
>> ref T appendOne()
>> {
>> int sz = length();
>> int sp = capacity();
>> if(sp > sz) (*base.lengthPtr())++;
>> else
>> {
>> sz++;
>> sp = max(2, sp+sp/2, sz);
>> reserve(sp, sz);
>> }
>> return back();
>> }
>> void append(A...)(A a)
>> {
>> static if(a.length == 1 && (is(typeof(a[0]) == Array!T) ||
>> is(typeof(a[0]) == T[])))
>> {
>> appendComposite(a[0]);
>> } else {
>> appendTuple(a);
>> }
>> }
>> void appendTuple(A...)(A a)
>> {
>> foreach(x; a) appendOne() = x;
>> }
>> void appendComposite(A)(A a)
>> {
>> int prevLen = length();
>> resize(prevLen + a.length);
>> for(int i = 0; i < a.length; i++) at(i+prevLen) = a[i];
>> }
>> private:
>> static struct ArrayBase(T)
>> {
>> public:
>> ~this() { clear(); }
>> void clear() { if(array) delete array; }
>> int length() const { return array ? *lengthPtr() : 0; }
>> int capacity() const { return array ? *capacityPtr() : 0; }
>> int* lengthPtr() const { return cast(int*)(array); }
>> int* capacityPtr() const { return cast(int*)(array+4); }
>> T* data() const { return cast(T*)(array+8); }
>> ubyte* array;
>> }
>> ArrayBase!T base;
>> void copyData(ref ArrayBase!T array)
>> {
>> int copyLen = min(length, array.length());
>> for(int i = 0; i < copyLen; i++) { at(i) = array.data()[i]; }
>> }
>> }
>> static bool less(T)(T a, T b) { return a < b; }
>> void insertionSort1(T, alias L = less!T)(ref Array!T a, int low, int high)
>> {
>> for(int i = low; i <= high; i++)
>> {
>> int min = i;
>> for(int j = i + 1; j <= high; j++)
>> if(L(a[j], a[min])) min = j;
>> swap(a[i], a[min]);
>> }
>> }
>> int partition1(T, alias L = less!T)(ref Array!T a, int p, int r)
>> {
>> T x = a[r];
>> int j = p - 1;
>> for (int i = p; i < r; i++)
>> {
>> if (L(x, a[i])) continue;
>> swap(a[i], a[++j]);
>> }
>> a[r] = a[j + 1];
>> a[j + 1] = x;
>> return j + 1;
>> }
>> void quickSort1(T, alias L = less!T)(ref Array!T a, int p, int r)
>> {
>> if(p + 7 > r) return insertionSort1!(T, L)(a, p, r);
>> if (p < r)
>> {
>> int q = partition1!(T, L)(a, p, r);
>> quickSort1!(T, L)(a, p, q - 1);
>> quickSort1!(T, L)(a, q + 1, r);
>> }
>> }
>> void sort1(T, alias L = less!T)(ref Array!T a) { quickSort1!(T, L)(a, 0,
>> a.length()-1); }
>> void insertionSort2(T, alias L = less!T)(T *a, int low, int high)
>> {
>> for(int i = low; i <= high; i++)
>> {
>> int min = i;
>> for(int j = i + 1; j <= high; j++)
>> if(L(a[j], a[min])) min = j;
>> swap(a[i], a[min]);
>> }
>> }
>> int partition2(T, alias L = less!T)(T *a, int p, int r)
>> {
>> T x = a[r];
>> int j = p - 1;
>> for (int i = p; i < r; i++)
>> {
>> if (L(x, a[i])) continue;
>> swap(a[i], a[++j]);
>> }
>> a[r] = a[j + 1];
>> a[j + 1] = x;
>> return j + 1;
>> }
>> void quickSort2(T, alias L = less!T)(T *a, int p, int r)
>> {
>> if(p + 7 > r) return insertionSort2!(T, L)(a, p, r);
>> if (p < r)
>> {
>> int q = partition2!(T, L)(a, p, r);
>> quickSort2!(T, L)(a, p, q - 1);
>> quickSort2!(T, L)(a, q + 1, r);
>> }
>> }
>> void sort2(T, alias L = less!T)(T *a, int length) { quickSort2!(T, L)(a,
>> 0,
>> length-1); }
>> double[] vals;
>> void bench1()
>> {
>> Array!double v;
>> for(int i = 0; i < 100; i++)
>> {
>> v = vals;
>> sort1(v);
>> }
>> }
>> void bench2()
>> {
>> Array!double v;
>> for(int i = 0; i < 100; i++)
>> {
>> v = vals;
>> sort2(&v[0], v.length);
>> }
>> }
>> void main()
>> {
>> Mt19937 gen;
>> vals.length = 1000;
>> for(int i = 0; i < 1000; i++) vals[i] = uniform(0.0,1000.0);
>> ulong[] times = [0, 0];
>> for(int i = 0; i < 100; i++)
>> {
>> auto times2 = benchmark!(bench1, bench2)(1);
>> times[0] += times2[0];
>> times[1] += times2[1];
>> }
>> writeln("Sorting with Array.opIndex: ", times[0]);
>> writeln("Sorting with pointers: ", times[1]);
>> writeln(100.0 * (times[0]-times[1]) / times[0], " percent faster");
>> }
>
> Testing on GDC with a Netbook, results from 3 consecutive runs are:
>
> Without -frelease
> -------
> Sorting with Array.opIndex: 27981
> Sorting with pointers: 5602
> 79.9793 percent faster
>
> Sorting with Array.opIndex: 25565
> Sorting with pointers: 5179
> 79.7418 percent faster
>
> Sorting with Array.opIndex: 28657
> Sorting with pointers: 5772
> 79.8583 percent faster
> -------
>
>
> With -frelease
> -------
> Sorting with Array.opIndex: 10591
> Sorting with pointers: 4771
> 54.9523 percent faster
>
> Sorting with Array.opIndex: 10289
> Sorting with pointers: 4710
> 54.223 percent faster
>
> Sorting with Array.opIndex: 11305
> Sorting with pointers: 5216
> 53.8611 percent faster
> -------
>
>
> With -frelease -fno-bounds-check
> -------
> Sorting with Array.opIndex: 11651
> Sorting with pointers: 5236
> 55.0597 percent faster
>
> Sorting with Array.opIndex: 9873
> Sorting with pointers: 4559
> 53.8236 percent faster
>
> Sorting with Array.opIndex: 10361
> Sorting with pointers: 4745
> 54.2033 percent faster
> -------
>
>
> GDC doesn't use DMD's FE inliner, but results from the GCC backend's
> inliner:
> -------
> Considering inline candidate check.
> Inlining check into fillUp.
> Merging blocks 9 and 10
> Merging blocks 9 and 11
>
> Considering inline candidate initialize.
> Inlining initialize into emplace.
> Merging blocks 2 and 3
> Merging blocks 2 and 4
>
> Considering inline candidate bench2.
> Not inlining: code size would grow by 77 insns.
> Considering inline candidate bench1.
> Not inlining: code size would grow by 49 insns.
> -------
>
>
> So there's _me_ seriously doubting that inlining has anything to do with the
> 50% increase.
>
> Regards
> Iain
>
More information about the Digitalmars-d
mailing list