Inlining Code Test
Iain Buclaw
ibuclaw at ubuntu.com
Sun Dec 12 15:57:04 PST 2010
== Quote from Craig Black (craigblack2 at cox.net)'s article
> The following program illustrates the problems with inlining in the dmd
> compiler. Perhaps with some more work I can reduce it to a smaller test
> case. I was playing around with a simple Array template, and noticed that
> similar C++ code performs much better. This is due, at least in part, to
> opIndex not being properly inlined by dmd. There are two sort functions,
> quickSort1 and quickSort2. quickSort1 indexes an Array data structure.
> quickSort2 indexes raw pointers. quickSort2 is roughly 20% faster on my
> core i7.
> import std.stdio;
> import std.date;
> import std.random;
> import std.conv;
> import std.algorithm;
> import std.range;
> struct Array(T)
> {
> public:
> this(int length) { resize(length); }
> this(T[] a) { append(a); }
> this(this)
> {
> if(!base.array) return;
> ArrayBase!T old;
> old = base;
> base.array = null;
> reserve(old.length(), old.length());
> copyData(old);
> old.array = null;
> }
> void clear() { base.clear(); }
> void resize(int sz)
> {
> assert(sz >= 0);
> if(sz == 0) return clear();
> if(!base.array) reserve(sz, sz);
> *base.lengthPtr() = sz;
> }
> void reserve(int capacity, int length)
> {
> assert(length <= capacity);
> if(capacity == 0) return clear();
> ArrayBase!T old;
> if(base.array)
> {
> if(base.capacity() >= capacity) return;
> old.array = base.array;
> base.array = null;
> }
> base.array = cast(ubyte*)(new ubyte[capacity*T.sizeof+8]);
> *base.lengthPtr() = length;
> *base.capacityPtr() = capacity;
> for(int i = 0; i < capacity; i++) emplace!T(base.data()+i);
> if(old.array) copyData(old);
> }
> int length() const { return base.length(); }
> int capacity() const { return base.capacity(); }
> bool empty() const { return base.array == null; }
> ref T at(int i)
> {
> assert(!empty(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i) ~
> " out of bounds of empty array");
> assert(i >= 0 && i < length(), "Array of " ~ T.stringof ~ ": index " ~
> to!string(i) ~ " out of bounds 0-" ~ to!string(length()-1));
> return base.data()[i];
> }
> ref const T at(int i) const
> {
> assert(!empty(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i) ~
> " out of bounds of empty array");
> assert(i >= 0 && i < length(), "Array of " ~ T.stringof ~ ": index " ~
> to!string(i) ~ " out of bounds 0-" ~ to!string(length()-1));
> return base.data()[i];
> }
> const ref T opIndex(int i) const { return at(i); }
> void opIndexAssign(T t, int i) { at(i) = t; }
> void opIndexAssign(ref T t, int i) { at(i) = t; }
> void opAssign(ref const Array!T array)
> {
> copy(array);
> }
> void opAssign(T[] array)
> {
> int len = array.length;
> resize(len);
> for(int i = 0; i < len; i++) at(i) = array[i];
> }
> void copy(ref const Array!T array)
> {
> if(array.empty()) return clear();
> int len = array.length();
> reserve(len, len);
> for(int i = 0; i < len; i++) at(i) = array[i];
> }
> void opOpAssign(string op, A...)(A a) if(op == "~")
> {
> appendComposite(a);
> }
> ref T front() { return at(0); }
> const ref T front() const { return at(0); }
> ref T back() { return at(length()-1); }
> const ref T back() const { return at(length()-1); }
> ref T appendOne()
> {
> int sz = length();
> int sp = capacity();
> if(sp > sz) (*base.lengthPtr())++;
> else
> {
> sz++;
> sp = max(2, sp+sp/2, sz);
> reserve(sp, sz);
> }
> return back();
> }
> void append(A...)(A a)
> {
> static if(a.length == 1 && (is(typeof(a[0]) == Array!T) ||
> is(typeof(a[0]) == T[])))
> {
> appendComposite(a[0]);
> } else {
> appendTuple(a);
> }
> }
> void appendTuple(A...)(A a)
> {
> foreach(x; a) appendOne() = x;
> }
> void appendComposite(A)(A a)
> {
> int prevLen = length();
> resize(prevLen + a.length);
> for(int i = 0; i < a.length; i++) at(i+prevLen) = a[i];
> }
> private:
> static struct ArrayBase(T)
> {
> public:
> ~this() { clear(); }
> void clear() { if(array) delete array; }
> int length() const { return array ? *lengthPtr() : 0; }
> int capacity() const { return array ? *capacityPtr() : 0; }
> int* lengthPtr() const { return cast(int*)(array); }
> int* capacityPtr() const { return cast(int*)(array+4); }
> T* data() const { return cast(T*)(array+8); }
> ubyte* array;
> }
> ArrayBase!T base;
> void copyData(ref ArrayBase!T array)
> {
> int copyLen = min(length, array.length());
> for(int i = 0; i < copyLen; i++) { at(i) = array.data()[i]; }
> }
> }
> static bool less(T)(T a, T b) { return a < b; }
> void insertionSort1(T, alias L = less!T)(ref Array!T a, int low, int high)
> {
> for(int i = low; i <= high; i++)
> {
> int min = i;
> for(int j = i + 1; j <= high; j++)
> if(L(a[j], a[min])) min = j;
> swap(a[i], a[min]);
> }
> }
> int partition1(T, alias L = less!T)(ref Array!T a, int p, int r)
> {
> T x = a[r];
> int j = p - 1;
> for (int i = p; i < r; i++)
> {
> if (L(x, a[i])) continue;
> swap(a[i], a[++j]);
> }
> a[r] = a[j + 1];
> a[j + 1] = x;
> return j + 1;
> }
> void quickSort1(T, alias L = less!T)(ref Array!T a, int p, int r)
> {
> if(p + 7 > r) return insertionSort1!(T, L)(a, p, r);
> if (p < r)
> {
> int q = partition1!(T, L)(a, p, r);
> quickSort1!(T, L)(a, p, q - 1);
> quickSort1!(T, L)(a, q + 1, r);
> }
> }
> void sort1(T, alias L = less!T)(ref Array!T a) { quickSort1!(T, L)(a, 0,
> a.length()-1); }
> void insertionSort2(T, alias L = less!T)(T *a, int low, int high)
> {
> for(int i = low; i <= high; i++)
> {
> int min = i;
> for(int j = i + 1; j <= high; j++)
> if(L(a[j], a[min])) min = j;
> swap(a[i], a[min]);
> }
> }
> int partition2(T, alias L = less!T)(T *a, int p, int r)
> {
> T x = a[r];
> int j = p - 1;
> for (int i = p; i < r; i++)
> {
> if (L(x, a[i])) continue;
> swap(a[i], a[++j]);
> }
> a[r] = a[j + 1];
> a[j + 1] = x;
> return j + 1;
> }
> void quickSort2(T, alias L = less!T)(T *a, int p, int r)
> {
> if(p + 7 > r) return insertionSort2!(T, L)(a, p, r);
> if (p < r)
> {
> int q = partition2!(T, L)(a, p, r);
> quickSort2!(T, L)(a, p, q - 1);
> quickSort2!(T, L)(a, q + 1, r);
> }
> }
> void sort2(T, alias L = less!T)(T *a, int length) { quickSort2!(T, L)(a, 0,
> length-1); }
> double[] vals;
> void bench1()
> {
> Array!double v;
> for(int i = 0; i < 100; i++)
> {
> v = vals;
> sort1(v);
> }
> }
> void bench2()
> {
> Array!double v;
> for(int i = 0; i < 100; i++)
> {
> v = vals;
> sort2(&v[0], v.length);
> }
> }
> void main()
> {
> Mt19937 gen;
> vals.length = 1000;
> for(int i = 0; i < 1000; i++) vals[i] = uniform(0.0,1000.0);
> ulong[] times = [0, 0];
> for(int i = 0; i < 100; i++)
> {
> auto times2 = benchmark!(bench1, bench2)(1);
> times[0] += times2[0];
> times[1] += times2[1];
> }
> writeln("Sorting with Array.opIndex: ", times[0]);
> writeln("Sorting with pointers: ", times[1]);
> writeln(100.0 * (times[0]-times[1]) / times[0], " percent faster");
> }
Testing on GDC with a Netbook, results from 3 consecutive runs are:
Without -frelease
-------
Sorting with Array.opIndex: 27981
Sorting with pointers: 5602
79.9793 percent faster
Sorting with Array.opIndex: 25565
Sorting with pointers: 5179
79.7418 percent faster
Sorting with Array.opIndex: 28657
Sorting with pointers: 5772
79.8583 percent faster
-------
With -frelease
-------
Sorting with Array.opIndex: 10591
Sorting with pointers: 4771
54.9523 percent faster
Sorting with Array.opIndex: 10289
Sorting with pointers: 4710
54.223 percent faster
Sorting with Array.opIndex: 11305
Sorting with pointers: 5216
53.8611 percent faster
-------
With -frelease -fno-bounds-check
-------
Sorting with Array.opIndex: 11651
Sorting with pointers: 5236
55.0597 percent faster
Sorting with Array.opIndex: 9873
Sorting with pointers: 4559
53.8236 percent faster
Sorting with Array.opIndex: 10361
Sorting with pointers: 4745
54.2033 percent faster
-------
GDC doesn't use DMD's FE inliner, but results from the GCC backend's inliner:
-------
Considering inline candidate check.
Inlining check into fillUp.
Merging blocks 9 and 10
Merging blocks 9 and 11
Considering inline candidate initialize.
Inlining initialize into emplace.
Merging blocks 2 and 3
Merging blocks 2 and 4
Considering inline candidate bench2.
Not inlining: code size would grow by 77 insns.
Considering inline candidate bench1.
Not inlining: code size would grow by 49 insns.
-------
So there's _me_ seriously doubting that inlining has anything to do with the 50% increase.
Regards
Iain
More information about the Digitalmars-d
mailing list