Inlining Code Test
Andrej Mitrovic
andrej.mitrovich at gmail.com
Sun Dec 12 20:47:34 PST 2010
Ye that's better:
Sorting with Array.opIndex: 2859
Sorting with pointers: 1765
38.2651 percent faster
And it only took 2 seconds. With profile it takes a full minute.
On 12/13/10, Craig Black <craigblack2 at cox.net> wrote:
> Try it without -profile. That tends to slow everything down to a halt.
>
> -Craig
>
> "Andrej Mitrovic" <andrej.mitrovich at gmail.com> wrote in message
> news:mailman.948.1292198993.21107.digitalmars-d at puremagic.com...
>> My crappy old Pentium 4 has gone totally mad:
>>
>> Sorting with Array.opIndex: 45080
>> Sorting with pointers: 45608
>> 4.092e+16 percent faster (??)
>>
>> Compiled with dmd -profile -O -release -inline -noboundscheck
>>
>> On 12/13/10, Iain Buclaw <ibuclaw at ubuntu.com> wrote:
>>> == Quote from Craig Black (craigblack2 at cox.net)'s article
>>>> The following program illustrates the problems with inlining in the dmd
>>>> compiler. Perhaps with some more work I can reduce it to a smaller test
>>>> case. I was playing around with a simple Array template, and noticed
>>>> that
>>>> similar C++ code performs much better. This is due, at least in part,
>>>> to
>>>> opIndex not being properly inlined by dmd. There are two sort
>>>> functions,
>>>> quickSort1 and quickSort2. quickSort1 indexes an Array data structure.
>>>> quickSort2 indexes raw pointers. quickSort2 is roughly 20% faster on my
>>>> core i7.
>>>> import std.stdio;
>>>> import std.date;
>>>> import std.random;
>>>> import std.conv;
>>>> import std.algorithm;
>>>> import std.range;
>>>> struct Array(T)
>>>> {
>>>> public:
>>>> this(int length) { resize(length); }
>>>> this(T[] a) { append(a); }
>>>> this(this)
>>>> {
>>>> if(!base.array) return;
>>>> ArrayBase!T old;
>>>> old = base;
>>>> base.array = null;
>>>> reserve(old.length(), old.length());
>>>> copyData(old);
>>>> old.array = null;
>>>> }
>>>> void clear() { base.clear(); }
>>>> void resize(int sz)
>>>> {
>>>> assert(sz >= 0);
>>>> if(sz == 0) return clear();
>>>> if(!base.array) reserve(sz, sz);
>>>> *base.lengthPtr() = sz;
>>>> }
>>>> void reserve(int capacity, int length)
>>>> {
>>>> assert(length <= capacity);
>>>> if(capacity == 0) return clear();
>>>> ArrayBase!T old;
>>>> if(base.array)
>>>> {
>>>> if(base.capacity() >= capacity) return;
>>>> old.array = base.array;
>>>> base.array = null;
>>>> }
>>>> base.array = cast(ubyte*)(new ubyte[capacity*T.sizeof+8]);
>>>> *base.lengthPtr() = length;
>>>> *base.capacityPtr() = capacity;
>>>> for(int i = 0; i < capacity; i++) emplace!T(base.data()+i);
>>>> if(old.array) copyData(old);
>>>> }
>>>> int length() const { return base.length(); }
>>>> int capacity() const { return base.capacity(); }
>>>> bool empty() const { return base.array == null; }
>>>> ref T at(int i)
>>>> {
>>>> assert(!empty(), "Array of " ~ T.stringof ~ ": index " ~
>>>> to!string(i)
>>>> ~
>>>> " out of bounds of empty array");
>>>> assert(i >= 0 && i < length(), "Array of " ~ T.stringof ~ ": index "
>>>>
>>>> ~
>>>> to!string(i) ~ " out of bounds 0-" ~ to!string(length()-1));
>>>> return base.data()[i];
>>>> }
>>>> ref const T at(int i) const
>>>> {
>>>> assert(!empty(), "Array of " ~ T.stringof ~ ": index " ~
>>>> to!string(i)
>>>> ~
>>>> " out of bounds of empty array");
>>>> assert(i >= 0 && i < length(), "Array of " ~ T.stringof ~ ": index "
>>>>
>>>> ~
>>>> to!string(i) ~ " out of bounds 0-" ~ to!string(length()-1));
>>>> return base.data()[i];
>>>> }
>>>> const ref T opIndex(int i) const { return at(i); }
>>>> void opIndexAssign(T t, int i) { at(i) = t; }
>>>> void opIndexAssign(ref T t, int i) { at(i) = t; }
>>>> void opAssign(ref const Array!T array)
>>>> {
>>>> copy(array);
>>>> }
>>>> void opAssign(T[] array)
>>>> {
>>>> int len = array.length;
>>>> resize(len);
>>>> for(int i = 0; i < len; i++) at(i) = array[i];
>>>> }
>>>> void copy(ref const Array!T array)
>>>> {
>>>> if(array.empty()) return clear();
>>>> int len = array.length();
>>>> reserve(len, len);
>>>> for(int i = 0; i < len; i++) at(i) = array[i];
>>>> }
>>>> void opOpAssign(string op, A...)(A a) if(op == "~")
>>>> {
>>>> appendComposite(a);
>>>> }
>>>> ref T front() { return at(0); }
>>>> const ref T front() const { return at(0); }
>>>> ref T back() { return at(length()-1); }
>>>> const ref T back() const { return at(length()-1); }
>>>> ref T appendOne()
>>>> {
>>>> int sz = length();
>>>> int sp = capacity();
>>>> if(sp > sz) (*base.lengthPtr())++;
>>>> else
>>>> {
>>>> sz++;
>>>> sp = max(2, sp+sp/2, sz);
>>>> reserve(sp, sz);
>>>> }
>>>> return back();
>>>> }
>>>> void append(A...)(A a)
>>>> {
>>>> static if(a.length == 1 && (is(typeof(a[0]) == Array!T) ||
>>>> is(typeof(a[0]) == T[])))
>>>> {
>>>> appendComposite(a[0]);
>>>> } else {
>>>> appendTuple(a);
>>>> }
>>>> }
>>>> void appendTuple(A...)(A a)
>>>> {
>>>> foreach(x; a) appendOne() = x;
>>>> }
>>>> void appendComposite(A)(A a)
>>>> {
>>>> int prevLen = length();
>>>> resize(prevLen + a.length);
>>>> for(int i = 0; i < a.length; i++) at(i+prevLen) = a[i];
>>>> }
>>>> private:
>>>> static struct ArrayBase(T)
>>>> {
>>>> public:
>>>> ~this() { clear(); }
>>>> void clear() { if(array) delete array; }
>>>> int length() const { return array ? *lengthPtr() : 0; }
>>>> int capacity() const { return array ? *capacityPtr() : 0; }
>>>> int* lengthPtr() const { return cast(int*)(array); }
>>>> int* capacityPtr() const { return cast(int*)(array+4); }
>>>> T* data() const { return cast(T*)(array+8); }
>>>> ubyte* array;
>>>> }
>>>> ArrayBase!T base;
>>>> void copyData(ref ArrayBase!T array)
>>>> {
>>>> int copyLen = min(length, array.length());
>>>> for(int i = 0; i < copyLen; i++) { at(i) = array.data()[i]; }
>>>> }
>>>> }
>>>> static bool less(T)(T a, T b) { return a < b; }
>>>> void insertionSort1(T, alias L = less!T)(ref Array!T a, int low, int
>>>> high)
>>>> {
>>>> for(int i = low; i <= high; i++)
>>>> {
>>>> int min = i;
>>>> for(int j = i + 1; j <= high; j++)
>>>> if(L(a[j], a[min])) min = j;
>>>> swap(a[i], a[min]);
>>>> }
>>>> }
>>>> int partition1(T, alias L = less!T)(ref Array!T a, int p, int r)
>>>> {
>>>> T x = a[r];
>>>> int j = p - 1;
>>>> for (int i = p; i < r; i++)
>>>> {
>>>> if (L(x, a[i])) continue;
>>>> swap(a[i], a[++j]);
>>>> }
>>>> a[r] = a[j + 1];
>>>> a[j + 1] = x;
>>>> return j + 1;
>>>> }
>>>> void quickSort1(T, alias L = less!T)(ref Array!T a, int p, int r)
>>>> {
>>>> if(p + 7 > r) return insertionSort1!(T, L)(a, p, r);
>>>> if (p < r)
>>>> {
>>>> int q = partition1!(T, L)(a, p, r);
>>>> quickSort1!(T, L)(a, p, q - 1);
>>>> quickSort1!(T, L)(a, q + 1, r);
>>>> }
>>>> }
>>>> void sort1(T, alias L = less!T)(ref Array!T a) { quickSort1!(T, L)(a, 0,
>>>> a.length()-1); }
>>>> void insertionSort2(T, alias L = less!T)(T *a, int low, int high)
>>>> {
>>>> for(int i = low; i <= high; i++)
>>>> {
>>>> int min = i;
>>>> for(int j = i + 1; j <= high; j++)
>>>> if(L(a[j], a[min])) min = j;
>>>> swap(a[i], a[min]);
>>>> }
>>>> }
>>>> int partition2(T, alias L = less!T)(T *a, int p, int r)
>>>> {
>>>> T x = a[r];
>>>> int j = p - 1;
>>>> for (int i = p; i < r; i++)
>>>> {
>>>> if (L(x, a[i])) continue;
>>>> swap(a[i], a[++j]);
>>>> }
>>>> a[r] = a[j + 1];
>>>> a[j + 1] = x;
>>>> return j + 1;
>>>> }
>>>> void quickSort2(T, alias L = less!T)(T *a, int p, int r)
>>>> {
>>>> if(p + 7 > r) return insertionSort2!(T, L)(a, p, r);
>>>> if (p < r)
>>>> {
>>>> int q = partition2!(T, L)(a, p, r);
>>>> quickSort2!(T, L)(a, p, q - 1);
>>>> quickSort2!(T, L)(a, q + 1, r);
>>>> }
>>>> }
>>>> void sort2(T, alias L = less!T)(T *a, int length) { quickSort2!(T, L)(a,
>>>> 0,
>>>> length-1); }
>>>> double[] vals;
>>>> void bench1()
>>>> {
>>>> Array!double v;
>>>> for(int i = 0; i < 100; i++)
>>>> {
>>>> v = vals;
>>>> sort1(v);
>>>> }
>>>> }
>>>> void bench2()
>>>> {
>>>> Array!double v;
>>>> for(int i = 0; i < 100; i++)
>>>> {
>>>> v = vals;
>>>> sort2(&v[0], v.length);
>>>> }
>>>> }
>>>> void main()
>>>> {
>>>> Mt19937 gen;
>>>> vals.length = 1000;
>>>> for(int i = 0; i < 1000; i++) vals[i] = uniform(0.0,1000.0);
>>>> ulong[] times = [0, 0];
>>>> for(int i = 0; i < 100; i++)
>>>> {
>>>> auto times2 = benchmark!(bench1, bench2)(1);
>>>> times[0] += times2[0];
>>>> times[1] += times2[1];
>>>> }
>>>> writeln("Sorting with Array.opIndex: ", times[0]);
>>>> writeln("Sorting with pointers: ", times[1]);
>>>> writeln(100.0 * (times[0]-times[1]) / times[0], " percent faster");
>>>> }
>>>
>>> Testing on GDC with a Netbook, results from 3 consecutive runs are:
>>>
>>> Without -frelease
>>> -------
>>> Sorting with Array.opIndex: 27981
>>> Sorting with pointers: 5602
>>> 79.9793 percent faster
>>>
>>> Sorting with Array.opIndex: 25565
>>> Sorting with pointers: 5179
>>> 79.7418 percent faster
>>>
>>> Sorting with Array.opIndex: 28657
>>> Sorting with pointers: 5772
>>> 79.8583 percent faster
>>> -------
>>>
>>>
>>> With -frelease
>>> -------
>>> Sorting with Array.opIndex: 10591
>>> Sorting with pointers: 4771
>>> 54.9523 percent faster
>>>
>>> Sorting with Array.opIndex: 10289
>>> Sorting with pointers: 4710
>>> 54.223 percent faster
>>>
>>> Sorting with Array.opIndex: 11305
>>> Sorting with pointers: 5216
>>> 53.8611 percent faster
>>> -------
>>>
>>>
>>> With -frelease -fno-bounds-check
>>> -------
>>> Sorting with Array.opIndex: 11651
>>> Sorting with pointers: 5236
>>> 55.0597 percent faster
>>>
>>> Sorting with Array.opIndex: 9873
>>> Sorting with pointers: 4559
>>> 53.8236 percent faster
>>>
>>> Sorting with Array.opIndex: 10361
>>> Sorting with pointers: 4745
>>> 54.2033 percent faster
>>> -------
>>>
>>>
>>> GDC doesn't use DMD's FE inliner, but results from the GCC backend's
>>> inliner:
>>> -------
>>> Considering inline candidate check.
>>> Inlining check into fillUp.
>>> Merging blocks 9 and 10
>>> Merging blocks 9 and 11
>>>
>>> Considering inline candidate initialize.
>>> Inlining initialize into emplace.
>>> Merging blocks 2 and 3
>>> Merging blocks 2 and 4
>>>
>>> Considering inline candidate bench2.
>>> Not inlining: code size would grow by 77 insns.
>>> Considering inline candidate bench1.
>>> Not inlining: code size would grow by 49 insns.
>>> -------
>>>
>>>
>>> So there's _me_ seriously doubting that inlining has anything to do with
>>> the
>>> 50% increase.
>>>
>>> Regards
>>> Iain
>>>
>
>
More information about the Digitalmars-d
mailing list