Inlining Code Test
Iain Buclaw
ibuclaw at ubuntu.com
Mon Dec 13 04:29:34 PST 2010
== Quote from Craig Black (craigblack2 at cox.net)'s article
> If the problem is not inlining, then why does the same code in C++ does not
> suffer from this performance limitation (when compiled with Visual C++
> 2008)?
> #include <stdio.h>
> #include <assert.h>
> #include <stdlib.h>
> template <class T>
> T min(T a, T b) { return a < b ? a : b; }
> template <class T>
> T max(T a, T b) { return a > b ? a : b; }
> inline void *operator new(size_t, void *ptr) { return ptr; }
> inline void operator delete(void *, void *) {}
> template <class T>
> class Array
> {
> public:
> Array() {}
> Array(int length) { resize(length); }
> Array(const Array<T> &a) { append(a); }
> void clear() { base.clear(); }
> void resize(int sz)
> {
> assert(sz >= 0);
> if(sz == 0) return clear();
> if(!base.array) reserve(sz, sz);
> *base.lengthPtr() = sz;
> }
> void reserve(int capacity, int length)
> {
> assert(length <= capacity);
> if(capacity == 0) return clear();
> ArrayBase old;
> if(base.array)
> {
> if(base.capacity() >= capacity) return;
> old.array = base.array;
> base.array = 0;
> }
> base.array = (char*)(new char[capacity*sizeof(T)+8]);
> *base.lengthPtr() = length;
> *base.capacityPtr() = capacity;
> for(int i = 0; i < capacity; i++) new(base.data()+i)T;
> if(old.array) copyData(old);
> }
> int length() const { return base.length(); }
> int capacity() const { return base.capacity(); }
> bool empty() const { return base.array == 0; }
> T &at(int i)
> {
> assert(!empty());
> assert(i >= 0 && i < length());
> return base.data()[i];
> }
> const T &at(int i) const
> {
> assert(!empty());
> assert(i >= 0 && i < length());
> return base.data()[i];
> }
> T &operator [](int i) { return at(i); }
> const T &operator [](int i) const { return at(i); }
> void operator = (const Array &array)
> {
> copy(array);
> }
> void copy(const Array &array)
> {
> if(array.empty()) return clear();
> int len = array.length();
> reserve(len, len);
> for(int i = 0; i < len; i++) at(i) = array[i];
> }
> T &front() { return at(0); }
> const T &front() const { return at(0); }
> T &back() { return at(length()-1); }
> const T &back() const { return at(length()-1); }
> T &append()
> {
> int sz = length();
> int sp = capacity();
> if(sp > sz) (*base.lengthPtr())++;
> else
> {
> sz++;
> sp = max(max(2, sp+sp/2), sz);
> reserve(sp, sz);
> }
> return back();
> }
> void append(const Array<T> &a)
> {
> int prevLen = length();
> resize(prevLen + a.length());
> for(int i = 0; i < a.length(); i++) at(i+prevLen) = a[i];
> }
> private:
> class ArrayBase
> {
> public:
> ArrayBase() { array = 0; }
> ~ArrayBase() { clear(); }
> void clear() { if(array) delete array; }
> int length() const { return array ? *lengthPtr() : 0; }
> int capacity() const { return array ? *capacityPtr() : 0; }
> int* lengthPtr() const { return (int*)array; }
> int* capacityPtr() const { return (int*)(array+4); }
> T* data() const { return (T*)(array+8); }
> char* array;
> };
> ArrayBase base;
> void copyData(ArrayBase &array)
> {
> int copyLen = min(length(), array.length());
> for(int i = 0; i < copyLen; i++) { at(i) = array.data()[i]; }
> }
> };
> template <class T>
> struct Less
> {
> bool operator() (T a, T b) { return a < b; }
> };
> template <class T>
> void swap(T &a, T &b)
> {
> T temp = a;
> a = b;
> b = temp;
> }
> template <class T, class L>
> void insertionSort1(Array<T> &a, int low, int high, L less)
> {
> for(int i = low; i <= high; i++)
> {
> int min = i;
> for(int j = i + 1; j <= high; j++)
> if(less(a[j], a[min])) min = j;
> swap(a[i], a[min]);
> }
> }
> template<class T, class L>
> int partition1(Array<T> &a, int p, int r, L less)
> {
> T x = a[r];
> int j = p - 1;
> for (int i = p; i < r; i++)
> {
> if (less(x, a[i])) continue;
> swap(a[i], a[++j]);
> }
> a[r] = a[j + 1];
> a[j + 1] = x;
> return j + 1;
> }
> template<class T, class L>
> void quickSort1(Array<T> &a, int p, int r, L less)
> {
> if(p + 7 > r) return insertionSort1(a, p, r, less);
> if (p < r)
> {
> int q = partition1(a, p, r, less);
> quickSort1(a, p, q - 1, less);
> quickSort1(a, q + 1, r, less);
> }
> }
> template <class T, class L>
> void sort1(Array<T> &a) { quickSort1(a, 0, a.length()-1, Less<T>()); }
> template <class T, class L>
> void insertionSort2(T *a, int low, int high, L less)
> {
> for(int i = low; i <= high; i++)
> {
> int min = i;
> for(int j = i + 1; j <= high; j++)
> if(less(a[j], a[min])) min = j;
> swap(a[i], a[min]);
> }
> }
> template<class T, class L>
> int partition2(T *a, int p, int r, L less)
> {
> T x = a[r];
> int j = p - 1;
> for (int i = p; i < r; i++)
> {
> if (less(x, a[i])) continue;
> swap(a[i], a[++j]);
> }
> a[r] = a[j + 1];
> a[j + 1] = x;
> return j + 1;
> }
> template<class T, class L>
> void quickSort2(T *a, int p, int r, L less)
> {
> if(p + 7 > r) return insertionSort2(a, p, r, less);
> if (p < r)
> {
> int q = partition2(a, p, r, less);
> quickSort2(a, p, q - 1, less);
> quickSort2(a, q + 1, r, less);
> }
> }
> template <class T, class L>
> void sort2(Array<T> &a) { quickSort2(&a[0], 0, a.length()-1, Less<T>()); }
> typedef unsigned long long uint64;
> uint64 getCycle() {
> __asm { RDTSC }
> }
> Array<double> vals;
> uint64 bench1()
> {
> uint64 startTime = getCycle();
> Array<double> v;
> for(int i = 0; i < 100; i++)
> {
> v = vals;
> sort1<double, Less<double> >(v);
> }
> return getCycle()-startTime;
> }
> uint64 bench2()
> {
> uint64 startTime = getCycle();
> Array<double> v;
> for(int i = 0; i < 100; i++)
> {
> v = vals;
> sort2<double, Less<double> >(v);
> }
> return getCycle()-startTime;
> }
> void main()
> {
> vals.resize(1000);
> for(int i = 0; i < 1000; i++) vals[i] = (rand() % 1000000) / 1000.0;
> uint64 time1 = 0;
> uint64 time2 = 0;
> for(int i = 0; i < 100; i++)
> {
> time1 += bench1();
> time2 += bench2();
> }
> printf("Sorting with Array: %f\n", time1/100000.0);
> printf("Sorting with pointers: %f\n", time2/100000.0);
> printf("%f percent faster\n", 100.0 * (time1-time2) / time1);
> }
D can use inline asm in much the same way, but that's deterring from your point. =)
import std.c.stdio;
import std.random;
import std.conv;
import std.algorithm;
import std.range;
struct Array(T)
{
public:
this(int length) { resize(length); }
this(T[] a) { append(a); }
this(this)
{
if(!base.array) return;
ArrayBase!T old;
old = base;
base.array = null;
reserve(old.length(), old.length());
copyData(old);
old.array = null;
}
void clear() { base.clear(); }
void resize(int sz)
{
assert(sz >= 0);
if(sz == 0) return clear();
if(!base.array) reserve(sz, sz);
*base.lengthPtr() = sz;
}
void reserve(int capacity, int length)
{
assert(length <= capacity);
if(capacity == 0) return clear();
ArrayBase!T old;
if(base.array)
{
if(base.capacity() >= capacity) return;
old.array = base.array;
base.array = null;
}
base.array = cast(ubyte*)(new ubyte[capacity*T.sizeof+8]);
*base.lengthPtr() = length;
*base.capacityPtr() = capacity;
for(int i = 0; i < capacity; i++) emplace!T(base.data()+i);
if(old.array) copyData(old);
}
int length() const { return base.length(); }
int capacity() const { return base.capacity(); }
bool empty() const { return base.array == null; }
ref T at(int i)
{
assert(!empty(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i) ~ " out of bounds of empty array");
assert(i >= 0 && i < length(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i) ~ " out of bounds 0-" ~ to!string(length()-
1));
return base.data()[i];
}
ref const T at(int i) const
{
assert(!empty(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i) ~ " out of bounds of empty array");
assert(i >= 0 && i < length(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i) ~ " out of bounds 0-" ~ to!string(length()-
1));
return base.data()[i];
}
const ref T opIndex(int i) const { return at(i); }
void opIndexAssign(T t, int i) { at(i) = t; }
void opIndexAssign(ref T t, int i) { at(i) = t; }
void opAssign(ref const Array!T array)
{
copy(array);
}
void opAssign(T[] array)
{
int len = array.length;
resize(len);
for(int i = 0; i < len; i++) at(i) = array[i];
}
void copy(ref const Array!T array)
{
if(array.empty()) return clear();
int len = array.length();
reserve(len, len);
for(int i = 0; i < len; i++) at(i) = array[i];
}
void opOpAssign(string op, A...)(A a) if(op == "~")
{
appendComposite(a);
}
ref T front() { return at(0); }
const ref T front() const { return at(0); }
ref T back() { return at(length()-1); }
const ref T back() const { return at(length()-1); }
ref T appendOne()
{
int sz = length();
int sp = capacity();
if(sp > sz) (*base.lengthPtr())++;
else
{
sz++;
sp = max(2, sp+sp/2, sz);
reserve(sp, sz);
}
return back();
}
void append(A...)(A a)
{
static if(a.length == 1 && (is(typeof(a[0]) == Array!T) || is(typeof(a[0]) == T[])))
{
appendComposite(a[0]);
} else {
appendTuple(a);
}
}
void appendTuple(A...)(A a)
{
foreach(x; a) appendOne() = x;
}
void appendComposite(A)(A a)
{
int prevLen = length();
resize(prevLen + a.length);
for(int i = 0; i < a.length; i++) at(i+prevLen) = a[i];
}
private:
static struct ArrayBase(T)
{
public:
~this() { clear(); }
void clear() { if(array) delete array; }
int length() const { return array ? *lengthPtr() : 0; }
int capacity() const { return array ? *capacityPtr() : 0; }
int* lengthPtr() const { return cast(int*)(array); }
int* capacityPtr() const { return cast(int*)(array+4); }
T* data() const { return cast(T*)(array+8); }
ubyte* array;
}
ArrayBase!T base;
void copyData(ref ArrayBase!T array)
{
int copyLen = min(length, array.length());
for(int i = 0; i < copyLen; i++) { at(i) = array.data()[i]; }
}
}
static bool less(T)(T a, T b) { return a < b; }
void insertionSort1(T, alias L = less!T)(ref Array!T a, int low, int high)
{
for(int i = low; i <= high; i++)
{
int min = i;
for(int j = i + 1; j <= high; j++)
if(L(a[j], a[min])) min = j;
swap(a[i], a[min]);
}
}
int partition1(T, alias L = less!T)(ref Array!T a, int p, int r)
{
T x = a[r];
int j = p - 1;
for (int i = p; i < r; i++)
{
if (L(x, a[i])) continue;
swap(a[i], a[++j]);
}
a[r] = a[j + 1];
a[j + 1] = x;
return j + 1;
}
void quickSort1(T, alias L = less!T)(ref Array!T a, int p, int r)
{
if(p + 7 > r) return insertionSort1!(T, L)(a, p, r);
if (p < r)
{
int q = partition1!(T, L)(a, p, r);
quickSort1!(T, L)(a, p, q - 1);
quickSort1!(T, L)(a, q + 1, r);
}
}
void sort1(T, alias L = less!T)(ref Array!T a) { quickSort1!(T, L)(a, 0, a.length()-1); }
void insertionSort2(T, alias L = less!T)(T *a, int low, int high)
{
for(int i = low; i <= high; i++)
{
int min = i;
for(int j = i + 1; j <= high; j++)
if(L(a[j], a[min])) min = j;
swap(a[i], a[min]);
}
}
int partition2(T, alias L = less!T)(T *a, int p, int r)
{
T x = a[r];
int j = p - 1;
for (int i = p; i < r; i++)
{
if (L(x, a[i])) continue;
swap(a[i], a[++j]);
}
a[r] = a[j + 1];
a[j + 1] = x;
return j + 1;
}
void quickSort2(T, alias L = less!T)(T *a, int p, int r)
{
if(p + 7 > r) return insertionSort2!(T, L)(a, p, r);
if (p < r)
{
int q = partition2!(T, L)(a, p, r);
quickSort2!(T, L)(a, p, q - 1);
quickSort2!(T, L)(a, q + 1, r);
}
}
void sort2(T, alias L = less!T)(T *a, int length) { quickSort2!(T, L)(a, 0, length-1); }
ulong getCycle() {
asm { rdtsc; }
}
double[] vals;
ulong bench1()
{
ulong startTime = getCycle();
Array!double v;
for(int i = 0; i < 100; i++)
{
v = vals;
sort1(v);
}
return getCycle() - startTime;
}
ulong bench2()
{
ulong startTime = getCycle();
Array!double v;
for(int i = 0; i < 100; i++)
{
v = vals;
sort2(&v[0], v.length);
}
return getCycle() - startTime;
}
void main()
{
Mt19937 gen;
vals.length = 1000;
for(int i = 0; i < 1000; i++) vals[i] = uniform(0.0,1000.0);
ulong time1;
ulong time2;
for(int i = 0; i < 100; i++)
{
time1 += bench1();
time2 += bench2();
}
printf("Sorting with Array.opIndex: %f\n", time1/1e5);
printf("Sorting with pointers: %f\n", time2/1e5);
printf("%f percent faster\n", 100.0 * (time1-time2) / time1);
}
Testing your C++ program (altered getCycle() for GCC)
Times I get:
-------
Sorting with Array: 46869.159840
Sorting with pointers: 38688.937320
17.453316 percent faster
Sorting with Array: 46631.903760
Sorting with pointers: 38520.609360
17.394302 percent faster
Sorting with Array: 46674.330720
Sorting with pointers: 38545.202520
17.416700 percent faster
-------
On a , I thought I might try an older version of GCC for the D program.
Really surprisingly, I got:
-------
Sorting with Array.opIndex: 43075.059840
Sorting with pointers: 40019.701920
7.093102 percent faster
Sorting with Array.opIndex: 42940.085640
Sorting with pointers: 39594.089040
7.792245 percent faster
Sorting with Array.opIndex: 44389.127280
Sorting with pointers: 41159.016960
7.276805 percent faster
-------
This will need some thinking through as to just what happened between GDC-4.3 -> GDC-4.4 :~)
Regards
More information about the Digitalmars-d
mailing list