Inlining Code Test
Craig Black
craigblack2 at cox.net
Sun Dec 12 20:35:19 PST 2010
Try it without -profile. That tends to slow everything down to a halt.
-Craig
"Andrej Mitrovic" <andrej.mitrovich at gmail.com> wrote in message
news:mailman.948.1292198993.21107.digitalmars-d at puremagic.com...
> My crappy old Pentium 4 has gone totally mad:
>
> Sorting with Array.opIndex: 45080
> Sorting with pointers: 45608
> 4.092e+16 percent faster (??)
>
> Compiled with dmd -profile -O -release -inline -noboundscheck
>
> On 12/13/10, Iain Buclaw <ibuclaw at ubuntu.com> wrote:
>> == Quote from Craig Black (craigblack2 at cox.net)'s article
>>> The following program illustrates the problems with inlining in the dmd
>>> compiler. Perhaps with some more work I can reduce it to a smaller test
>>> case. I was playing around with a simple Array template, and noticed
>>> that
>>> similar C++ code performs much better. This is due, at least in part,
>>> to
>>> opIndex not being properly inlined by dmd. There are two sort
>>> functions,
>>> quickSort1 and quickSort2. quickSort1 indexes an Array data structure.
>>> quickSort2 indexes raw pointers. quickSort2 is roughly 20% faster on my
>>> core i7.
>>> import std.stdio;
>>> import std.date;
>>> import std.random;
>>> import std.conv;
>>> import std.algorithm;
>>> import std.range;
>>> struct Array(T)
>>> {
>>> public:
>>> this(int length) { resize(length); }
>>> this(T[] a) { append(a); }
>>> this(this)
>>> {
>>> if(!base.array) return;
>>> ArrayBase!T old;
>>> old = base;
>>> base.array = null;
>>> reserve(old.length(), old.length());
>>> copyData(old);
>>> old.array = null;
>>> }
>>> void clear() { base.clear(); }
>>> void resize(int sz)
>>> {
>>> assert(sz >= 0);
>>> if(sz == 0) return clear();
>>> if(!base.array) reserve(sz, sz);
>>> *base.lengthPtr() = sz;
>>> }
>>> void reserve(int capacity, int length)
>>> {
>>> assert(length <= capacity);
>>> if(capacity == 0) return clear();
>>> ArrayBase!T old;
>>> if(base.array)
>>> {
>>> if(base.capacity() >= capacity) return;
>>> old.array = base.array;
>>> base.array = null;
>>> }
>>> base.array = cast(ubyte*)(new ubyte[capacity*T.sizeof+8]);
>>> *base.lengthPtr() = length;
>>> *base.capacityPtr() = capacity;
>>> for(int i = 0; i < capacity; i++) emplace!T(base.data()+i);
>>> if(old.array) copyData(old);
>>> }
>>> int length() const { return base.length(); }
>>> int capacity() const { return base.capacity(); }
>>> bool empty() const { return base.array == null; }
>>> ref T at(int i)
>>> {
>>> assert(!empty(), "Array of " ~ T.stringof ~ ": index " ~
>>> to!string(i)
>>> ~
>>> " out of bounds of empty array");
>>> assert(i >= 0 && i < length(), "Array of " ~ T.stringof ~ ": index "
>>> ~
>>> to!string(i) ~ " out of bounds 0-" ~ to!string(length()-1));
>>> return base.data()[i];
>>> }
>>> ref const T at(int i) const
>>> {
>>> assert(!empty(), "Array of " ~ T.stringof ~ ": index " ~
>>> to!string(i)
>>> ~
>>> " out of bounds of empty array");
>>> assert(i >= 0 && i < length(), "Array of " ~ T.stringof ~ ": index "
>>> ~
>>> to!string(i) ~ " out of bounds 0-" ~ to!string(length()-1));
>>> return base.data()[i];
>>> }
>>> const ref T opIndex(int i) const { return at(i); }
>>> void opIndexAssign(T t, int i) { at(i) = t; }
>>> void opIndexAssign(ref T t, int i) { at(i) = t; }
>>> void opAssign(ref const Array!T array)
>>> {
>>> copy(array);
>>> }
>>> void opAssign(T[] array)
>>> {
>>> int len = array.length;
>>> resize(len);
>>> for(int i = 0; i < len; i++) at(i) = array[i];
>>> }
>>> void copy(ref const Array!T array)
>>> {
>>> if(array.empty()) return clear();
>>> int len = array.length();
>>> reserve(len, len);
>>> for(int i = 0; i < len; i++) at(i) = array[i];
>>> }
>>> void opOpAssign(string op, A...)(A a) if(op == "~")
>>> {
>>> appendComposite(a);
>>> }
>>> ref T front() { return at(0); }
>>> const ref T front() const { return at(0); }
>>> ref T back() { return at(length()-1); }
>>> const ref T back() const { return at(length()-1); }
>>> ref T appendOne()
>>> {
>>> int sz = length();
>>> int sp = capacity();
>>> if(sp > sz) (*base.lengthPtr())++;
>>> else
>>> {
>>> sz++;
>>> sp = max(2, sp+sp/2, sz);
>>> reserve(sp, sz);
>>> }
>>> return back();
>>> }
>>> void append(A...)(A a)
>>> {
>>> static if(a.length == 1 && (is(typeof(a[0]) == Array!T) ||
>>> is(typeof(a[0]) == T[])))
>>> {
>>> appendComposite(a[0]);
>>> } else {
>>> appendTuple(a);
>>> }
>>> }
>>> void appendTuple(A...)(A a)
>>> {
>>> foreach(x; a) appendOne() = x;
>>> }
>>> void appendComposite(A)(A a)
>>> {
>>> int prevLen = length();
>>> resize(prevLen + a.length);
>>> for(int i = 0; i < a.length; i++) at(i+prevLen) = a[i];
>>> }
>>> private:
>>> static struct ArrayBase(T)
>>> {
>>> public:
>>> ~this() { clear(); }
>>> void clear() { if(array) delete array; }
>>> int length() const { return array ? *lengthPtr() : 0; }
>>> int capacity() const { return array ? *capacityPtr() : 0; }
>>> int* lengthPtr() const { return cast(int*)(array); }
>>> int* capacityPtr() const { return cast(int*)(array+4); }
>>> T* data() const { return cast(T*)(array+8); }
>>> ubyte* array;
>>> }
>>> ArrayBase!T base;
>>> void copyData(ref ArrayBase!T array)
>>> {
>>> int copyLen = min(length, array.length());
>>> for(int i = 0; i < copyLen; i++) { at(i) = array.data()[i]; }
>>> }
>>> }
>>> static bool less(T)(T a, T b) { return a < b; }
>>> void insertionSort1(T, alias L = less!T)(ref Array!T a, int low, int
>>> high)
>>> {
>>> for(int i = low; i <= high; i++)
>>> {
>>> int min = i;
>>> for(int j = i + 1; j <= high; j++)
>>> if(L(a[j], a[min])) min = j;
>>> swap(a[i], a[min]);
>>> }
>>> }
>>> int partition1(T, alias L = less!T)(ref Array!T a, int p, int r)
>>> {
>>> T x = a[r];
>>> int j = p - 1;
>>> for (int i = p; i < r; i++)
>>> {
>>> if (L(x, a[i])) continue;
>>> swap(a[i], a[++j]);
>>> }
>>> a[r] = a[j + 1];
>>> a[j + 1] = x;
>>> return j + 1;
>>> }
>>> void quickSort1(T, alias L = less!T)(ref Array!T a, int p, int r)
>>> {
>>> if(p + 7 > r) return insertionSort1!(T, L)(a, p, r);
>>> if (p < r)
>>> {
>>> int q = partition1!(T, L)(a, p, r);
>>> quickSort1!(T, L)(a, p, q - 1);
>>> quickSort1!(T, L)(a, q + 1, r);
>>> }
>>> }
>>> void sort1(T, alias L = less!T)(ref Array!T a) { quickSort1!(T, L)(a, 0,
>>> a.length()-1); }
>>> void insertionSort2(T, alias L = less!T)(T *a, int low, int high)
>>> {
>>> for(int i = low; i <= high; i++)
>>> {
>>> int min = i;
>>> for(int j = i + 1; j <= high; j++)
>>> if(L(a[j], a[min])) min = j;
>>> swap(a[i], a[min]);
>>> }
>>> }
>>> int partition2(T, alias L = less!T)(T *a, int p, int r)
>>> {
>>> T x = a[r];
>>> int j = p - 1;
>>> for (int i = p; i < r; i++)
>>> {
>>> if (L(x, a[i])) continue;
>>> swap(a[i], a[++j]);
>>> }
>>> a[r] = a[j + 1];
>>> a[j + 1] = x;
>>> return j + 1;
>>> }
>>> void quickSort2(T, alias L = less!T)(T *a, int p, int r)
>>> {
>>> if(p + 7 > r) return insertionSort2!(T, L)(a, p, r);
>>> if (p < r)
>>> {
>>> int q = partition2!(T, L)(a, p, r);
>>> quickSort2!(T, L)(a, p, q - 1);
>>> quickSort2!(T, L)(a, q + 1, r);
>>> }
>>> }
>>> void sort2(T, alias L = less!T)(T *a, int length) { quickSort2!(T, L)(a,
>>> 0,
>>> length-1); }
>>> double[] vals;
>>> void bench1()
>>> {
>>> Array!double v;
>>> for(int i = 0; i < 100; i++)
>>> {
>>> v = vals;
>>> sort1(v);
>>> }
>>> }
>>> void bench2()
>>> {
>>> Array!double v;
>>> for(int i = 0; i < 100; i++)
>>> {
>>> v = vals;
>>> sort2(&v[0], v.length);
>>> }
>>> }
>>> void main()
>>> {
>>> Mt19937 gen;
>>> vals.length = 1000;
>>> for(int i = 0; i < 1000; i++) vals[i] = uniform(0.0,1000.0);
>>> ulong[] times = [0, 0];
>>> for(int i = 0; i < 100; i++)
>>> {
>>> auto times2 = benchmark!(bench1, bench2)(1);
>>> times[0] += times2[0];
>>> times[1] += times2[1];
>>> }
>>> writeln("Sorting with Array.opIndex: ", times[0]);
>>> writeln("Sorting with pointers: ", times[1]);
>>> writeln(100.0 * (times[0]-times[1]) / times[0], " percent faster");
>>> }
>>
>> Testing on GDC with a Netbook, results from 3 consecutive runs are:
>>
>> Without -frelease
>> -------
>> Sorting with Array.opIndex: 27981
>> Sorting with pointers: 5602
>> 79.9793 percent faster
>>
>> Sorting with Array.opIndex: 25565
>> Sorting with pointers: 5179
>> 79.7418 percent faster
>>
>> Sorting with Array.opIndex: 28657
>> Sorting with pointers: 5772
>> 79.8583 percent faster
>> -------
>>
>>
>> With -frelease
>> -------
>> Sorting with Array.opIndex: 10591
>> Sorting with pointers: 4771
>> 54.9523 percent faster
>>
>> Sorting with Array.opIndex: 10289
>> Sorting with pointers: 4710
>> 54.223 percent faster
>>
>> Sorting with Array.opIndex: 11305
>> Sorting with pointers: 5216
>> 53.8611 percent faster
>> -------
>>
>>
>> With -frelease -fno-bounds-check
>> -------
>> Sorting with Array.opIndex: 11651
>> Sorting with pointers: 5236
>> 55.0597 percent faster
>>
>> Sorting with Array.opIndex: 9873
>> Sorting with pointers: 4559
>> 53.8236 percent faster
>>
>> Sorting with Array.opIndex: 10361
>> Sorting with pointers: 4745
>> 54.2033 percent faster
>> -------
>>
>>
>> GDC doesn't use DMD's FE inliner, but results from the GCC backend's
>> inliner:
>> -------
>> Considering inline candidate check.
>> Inlining check into fillUp.
>> Merging blocks 9 and 10
>> Merging blocks 9 and 11
>>
>> Considering inline candidate initialize.
>> Inlining initialize into emplace.
>> Merging blocks 2 and 3
>> Merging blocks 2 and 4
>>
>> Considering inline candidate bench2.
>> Not inlining: code size would grow by 77 insns.
>> Considering inline candidate bench1.
>> Not inlining: code size would grow by 49 insns.
>> -------
>>
>>
>> So there's _me_ seriously doubting that inlining has anything to do with
>> the
>> 50% increase.
>>
>> Regards
>> Iain
>>
More information about the Digitalmars-d
mailing list