Inlining Code Test

Iain Buclaw ibuclaw at ubuntu.com
Mon Dec 13 04:29:34 PST 2010


== Quote from Craig Black (craigblack2 at cox.net)'s article
> If the problem is not inlining, then why does the same code in C++ does not
> suffer from this performance limitation (when compiled with Visual C++
> 2008)?
> #include <stdio.h>
> #include <assert.h>
> #include <stdlib.h>
> template <class T>
> T min(T a, T b) { return a < b ? a : b; }
> template <class T>
> T max(T a, T b) { return a > b ? a : b; }
> inline void *operator new(size_t, void *ptr) { return ptr; }
> inline void operator delete(void *, void *) {}
> template <class T>
> class Array
> {
> public:
>   Array() {}
>   Array(int length) { resize(length); }
>   Array(const Array<T> &a) { append(a); }
>   void clear() { base.clear(); }
>   void resize(int sz)
>   {
>     assert(sz >= 0);
>     if(sz == 0) return clear();
>     if(!base.array) reserve(sz, sz);
>     *base.lengthPtr() = sz;
>   }
>   void reserve(int capacity, int length)
>   {
>     assert(length <= capacity);
>     if(capacity == 0) return clear();
>     ArrayBase old;
>     if(base.array)
>     {
>       if(base.capacity() >= capacity) return;
>       old.array = base.array;
>       base.array = 0;
>     }
>     base.array = (char*)(new char[capacity*sizeof(T)+8]);
>     *base.lengthPtr() = length;
>     *base.capacityPtr() = capacity;
>     for(int i = 0; i < capacity; i++) new(base.data()+i)T;
>     if(old.array) copyData(old);
>   }
>   int length() const { return base.length(); }
>   int capacity() const { return base.capacity(); }
>   bool empty() const { return base.array == 0; }
>   T &at(int i)
>   {
>    assert(!empty());
>     assert(i >= 0 && i < length());
>     return base.data()[i];
>   }
>   const T &at(int i) const
>   {
>     assert(!empty());
>     assert(i >= 0 && i < length());
>     return base.data()[i];
>   }
>   T &operator [](int i) { return at(i); }
>   const T &operator [](int i) const { return at(i); }
>   void operator = (const Array &array)
>   {
>     copy(array);
>   }
>   void copy(const Array &array)
>   {
>     if(array.empty()) return clear();
>     int len = array.length();
>     reserve(len, len);
>     for(int i = 0; i < len; i++) at(i) = array[i];
>   }
>   T &front() { return at(0); }
>   const T &front() const { return at(0); }
>   T &back() { return at(length()-1); }
>   const T &back() const { return at(length()-1); }
>   T &append()
>   {
>     int sz = length();
>     int sp = capacity();
>     if(sp > sz) (*base.lengthPtr())++;
>     else
>     {
>       sz++;
>       sp = max(max(2, sp+sp/2), sz);
>       reserve(sp, sz);
>     }
>     return back();
>   }
>   void append(const Array<T> &a)
>   {
>     int prevLen = length();
>     resize(prevLen + a.length());
>     for(int i = 0; i < a.length(); i++) at(i+prevLen) = a[i];
>   }
> private:
>   class ArrayBase
>   {
>   public:
>     ArrayBase() { array = 0; }
>     ~ArrayBase() { clear(); }
>     void clear() { if(array) delete array; }
>     int length() const { return array ? *lengthPtr() : 0; }
>     int capacity() const { return array ? *capacityPtr() : 0; }
>     int* lengthPtr() const { return (int*)array; }
>     int* capacityPtr() const { return (int*)(array+4); }
>     T* data() const { return (T*)(array+8); }
>     char* array;
>   };
>   ArrayBase base;
>   void copyData(ArrayBase &array)
>   {
>     int copyLen = min(length(), array.length());
>     for(int i = 0; i < copyLen; i++) { at(i) = array.data()[i]; }
>   }
> };
> template <class T>
> struct Less
> {
>   bool operator() (T a, T b) { return a < b; }
> };
> template <class T>
> void swap(T &a, T &b)
> {
>       T temp = a;
>       a = b;
>       b = temp;
> }
> template <class T, class L>
> void insertionSort1(Array<T> &a, int low, int high, L less)
> {
>   for(int i = low; i <= high; i++)
>   {
>     int min = i;
>     for(int j = i + 1; j <= high; j++)
>       if(less(a[j], a[min])) min = j;
>     swap(a[i], a[min]);
>   }
> }
> template<class T, class L>
> int partition1(Array<T> &a, int p, int r, L less)
> {
>   T x = a[r];
>   int j = p - 1;
>   for (int i = p; i < r; i++)
>   {
>     if (less(x, a[i])) continue;
>     swap(a[i], a[++j]);
>   }
>   a[r] = a[j + 1];
>   a[j + 1] = x;
>   return j + 1;
> }
> template<class T, class L>
> void quickSort1(Array<T> &a, int p, int r, L less)
> {
>   if(p + 7 > r) return insertionSort1(a, p, r, less);
>   if (p < r)
>   {
>     int q = partition1(a, p, r, less);
>     quickSort1(a, p, q - 1, less);
>     quickSort1(a, q + 1, r, less);
>   }
> }
> template <class T, class L>
> void sort1(Array<T> &a) { quickSort1(a, 0, a.length()-1, Less<T>()); }
> template <class T, class L>
> void insertionSort2(T *a, int low, int high, L less)
> {
>   for(int i = low; i <= high; i++)
>   {
>     int min = i;
>     for(int j = i + 1; j <= high; j++)
>       if(less(a[j], a[min])) min = j;
>     swap(a[i], a[min]);
>   }
> }
> template<class T, class L>
> int partition2(T *a, int p, int r, L less)
> {
>   T x = a[r];
>   int j = p - 1;
>   for (int i = p; i < r; i++)
>   {
>     if (less(x, a[i])) continue;
>     swap(a[i], a[++j]);
>   }
>   a[r] = a[j + 1];
>   a[j + 1] = x;
>   return j + 1;
> }
> template<class T, class L>
> void quickSort2(T *a, int p, int r, L less)
> {
>   if(p + 7 > r) return insertionSort2(a, p, r, less);
>   if (p < r)
>   {
>     int q = partition2(a, p, r, less);
>     quickSort2(a, p, q - 1, less);
>     quickSort2(a, q + 1, r, less);
>   }
> }
> template <class T, class L>
> void sort2(Array<T> &a) { quickSort2(&a[0], 0, a.length()-1, Less<T>()); }
> typedef unsigned long long uint64;
> uint64 getCycle() {
>     __asm { RDTSC }
> }
> Array<double> vals;
> uint64 bench1()
> {
>       uint64 startTime = getCycle();
>   Array<double> v;
>   for(int i = 0; i < 100; i++)
>   {
>     v = vals;
>     sort1<double, Less<double> >(v);
>   }
>       return getCycle()-startTime;
> }
> uint64 bench2()
> {
>       uint64 startTime = getCycle();
>   Array<double> v;
>   for(int i = 0; i < 100; i++)
>   {
>     v = vals;
>     sort2<double, Less<double> >(v);
>   }
>       return getCycle()-startTime;
> }
> void main()
> {
>   vals.resize(1000);
>   for(int i = 0; i < 1000; i++) vals[i] = (rand() % 1000000) / 1000.0;
>   uint64 time1 = 0;
>   uint64 time2 = 0;
>   for(int i = 0; i < 100; i++)
>   {
>     time1 += bench1();
>     time2 += bench2();
>   }
>   printf("Sorting with Array: %f\n", time1/100000.0);
>   printf("Sorting with pointers: %f\n", time2/100000.0);
>   printf("%f percent faster\n", 100.0 * (time1-time2) / time1);
> }

D can use inline asm in much the same way, but that's deterring from your point. =)


import std.c.stdio;
import std.random;
import std.conv;
import std.algorithm;
import std.range;

struct Array(T)
{
public:

 this(int length) { resize(length); }
 this(T[] a) { append(a); }

 this(this)
 {
  if(!base.array) return;
  ArrayBase!T old;
  old = base;
  base.array = null;
  reserve(old.length(), old.length());
  copyData(old);
  old.array = null;
 }

 void clear() { base.clear(); }

 void resize(int sz)
 {
  assert(sz >= 0);
  if(sz == 0) return clear();
  if(!base.array) reserve(sz, sz);
  *base.lengthPtr() = sz;
 }

 void reserve(int capacity, int length)
 {
  assert(length <= capacity);
  if(capacity == 0) return clear();
  ArrayBase!T old;
  if(base.array)
  {
    if(base.capacity() >= capacity) return;
    old.array = base.array;
    base.array = null;
  }
  base.array = cast(ubyte*)(new ubyte[capacity*T.sizeof+8]);
  *base.lengthPtr() = length;
  *base.capacityPtr() = capacity;

  for(int i = 0; i < capacity; i++) emplace!T(base.data()+i);
  if(old.array) copyData(old);
 }

 int length() const { return base.length(); }
 int capacity() const { return base.capacity(); }

 bool empty() const { return base.array == null; }

 ref T at(int i)
 {
  assert(!empty(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i) ~ " out of bounds of empty array");
  assert(i >= 0 && i < length(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i) ~ " out of bounds 0-" ~ to!string(length()-
1));
  return base.data()[i];
 }

 ref const T at(int i) const
 {
  assert(!empty(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i) ~ " out of bounds of empty array");
  assert(i >= 0 && i < length(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i) ~ " out of bounds 0-" ~ to!string(length()-
1));
  return base.data()[i];
 }

 const ref T opIndex(int i) const { return at(i); }
 void opIndexAssign(T t, int i) { at(i) = t; }
 void opIndexAssign(ref T t, int i) { at(i) = t; }

 void opAssign(ref const Array!T array)
 {
  copy(array);
 }

 void opAssign(T[] array)
 {
  int len = array.length;
  resize(len);
  for(int i = 0; i < len; i++) at(i) = array[i];
 }

 void copy(ref const Array!T array)
 {
  if(array.empty()) return clear();
  int len = array.length();
  reserve(len, len);
  for(int i = 0; i < len; i++) at(i) = array[i];
 }

 void opOpAssign(string op, A...)(A a) if(op == "~")
 {
  appendComposite(a);
 }

 ref T front() { return at(0); }
 const ref T front() const { return at(0); }
 ref T back() { return at(length()-1); }
 const ref T back() const { return at(length()-1); }

 ref T appendOne()
 {
  int sz = length();
  int sp = capacity();
  if(sp > sz) (*base.lengthPtr())++;
  else
  {
    sz++;
    sp = max(2, sp+sp/2, sz);
    reserve(sp, sz);
  }
  return back();
 }

 void append(A...)(A a)
 {
  static if(a.length == 1 && (is(typeof(a[0]) == Array!T) || is(typeof(a[0]) == T[])))
  {
    appendComposite(a[0]);
  } else {
    appendTuple(a);
  }
 }

 void appendTuple(A...)(A a)
 {
  foreach(x; a) appendOne() = x;
 }

 void appendComposite(A)(A a)
 {
  int prevLen = length();
  resize(prevLen + a.length);
  for(int i = 0; i < a.length; i++) at(i+prevLen) = a[i];
 }

private:

    static struct ArrayBase(T)
    {
    public:

          ~this() { clear(); }
          void clear() { if(array) delete array; }

          int length() const { return array ? *lengthPtr() : 0; }
          int capacity() const { return array ? *capacityPtr() : 0; }
          int* lengthPtr() const { return cast(int*)(array); }
          int* capacityPtr() const { return cast(int*)(array+4); }
          T* data() const { return cast(T*)(array+8); }

          ubyte* array;
    }

 ArrayBase!T base;

 void copyData(ref ArrayBase!T array)
 {
  int copyLen = min(length, array.length());
  for(int i = 0; i < copyLen; i++) { at(i) = array.data()[i]; }
 }
}

static bool less(T)(T a, T b) { return a < b; }

void insertionSort1(T, alias L = less!T)(ref Array!T a, int low, int high)
{
 for(int i = low; i <= high; i++)
 {
  int min = i;
  for(int j = i + 1; j <= high; j++)
    if(L(a[j], a[min])) min = j;
  swap(a[i], a[min]);
 }
}

int partition1(T, alias L = less!T)(ref Array!T a, int p, int r)
{
    T x = a[r];
    int j = p - 1;
    for (int i = p; i < r; i++)
    {
          if (L(x, a[i])) continue;
          swap(a[i], a[++j]);
    }
    a[r] = a[j + 1];
    a[j + 1] = x;
    return j + 1;
}

void quickSort1(T, alias L = less!T)(ref Array!T a, int p, int r)
{
 if(p + 7 > r) return insertionSort1!(T, L)(a, p, r);
 if (p < r)
    {
          int q = partition1!(T, L)(a, p, r);
          quickSort1!(T, L)(a, p, q - 1);
          quickSort1!(T, L)(a, q + 1, r);
    }
}

void sort1(T, alias L = less!T)(ref Array!T a) { quickSort1!(T, L)(a, 0, a.length()-1); }

void insertionSort2(T, alias L = less!T)(T *a, int low, int high)
{
 for(int i = low; i <= high; i++)
 {
  int min = i;
  for(int j = i + 1; j <= high; j++)
    if(L(a[j], a[min])) min = j;
  swap(a[i], a[min]);
 }
}

int partition2(T, alias L = less!T)(T *a, int p, int r)
{
    T x = a[r];
    int j = p - 1;
    for (int i = p; i < r; i++)
    {
          if (L(x, a[i])) continue;
          swap(a[i], a[++j]);
    }
    a[r] = a[j + 1];
    a[j + 1] = x;
    return j + 1;
}

void quickSort2(T, alias L = less!T)(T *a, int p, int r)
{
 if(p + 7 > r) return insertionSort2!(T, L)(a, p, r);
 if (p < r)
    {
          int q = partition2!(T, L)(a, p, r);
          quickSort2!(T, L)(a, p, q - 1);
          quickSort2!(T, L)(a, q + 1, r);
    }
}

void sort2(T, alias L = less!T)(T *a, int length) { quickSort2!(T, L)(a, 0, length-1); }

ulong getCycle() {
 asm { rdtsc; }
}

double[] vals;

ulong bench1()
{
    ulong startTime = getCycle();

 Array!double v;
 for(int i = 0; i < 100; i++)
 {
  v = vals;
  sort1(v);
 }
    return getCycle() - startTime;
}

ulong bench2()
{
    ulong startTime = getCycle();

 Array!double v;
 for(int i = 0; i < 100; i++)
 {
  v = vals;
  sort2(&v[0], v.length);
 }
    return getCycle() - startTime;
}

void main()
{
 Mt19937 gen;
 vals.length = 1000;
 for(int i = 0; i < 1000; i++) vals[i] = uniform(0.0,1000.0);

 ulong time1;
 ulong time2;

 for(int i = 0; i < 100; i++)
 {
  time1 += bench1();
  time2 += bench2();
 }
 printf("Sorting with Array.opIndex: %f\n", time1/1e5);
 printf("Sorting with pointers: %f\n", time2/1e5);
 printf("%f percent faster\n", 100.0 * (time1-time2) / time1);
}


Testing your C++ program (altered getCycle() for GCC)

Times I get:
-------
Sorting with Array: 46869.159840
Sorting with pointers: 38688.937320
17.453316 percent faster

Sorting with Array: 46631.903760
Sorting with pointers: 38520.609360
17.394302 percent faster

Sorting with Array: 46674.330720
Sorting with pointers: 38545.202520
17.416700 percent faster
-------


On a , I thought I might try an older version of GCC for the D program.
Really surprisingly, I got:

-------
Sorting with Array.opIndex: 43075.059840
Sorting with pointers: 40019.701920
7.093102 percent faster

Sorting with Array.opIndex: 42940.085640
Sorting with pointers: 39594.089040
7.792245 percent faster

Sorting with Array.opIndex: 44389.127280
Sorting with pointers: 41159.016960
7.276805 percent faster
-------

This will need some thinking through as to just what happened between GDC-4.3 -> GDC-4.4 :~)

Regards


More information about the Digitalmars-d mailing list