One usage of GNU C
bearophile
bearophileHUGS at lycos.com
Wed Jul 7 10:04:27 PDT 2010
This post is about a recent situation where I have used C instead of D2. It's just a small example, feel free to ignore this post.
This is the reduced version of a small C program (I have written the same program in D2 too), the little foo() is a critical function for the performance of the whole program:
#define N 200
double m1[N][N];
double m2[N][N];
double foo(unsigned char* used, int current) {
double sum = 0.0;
int i;
for (i = 0; i < N; i++)
if (!used[i])
sum += m2[current][i] * (1.0 + m1[current][i]);
return sum;
}
int main() {
return 0;
}
Where about half of used[] bytes are zero.
This is the 32bit asm of the foo() loop, generated by GCC 4.5 with:
-O3 -fomit-frame-pointer -mfpmath=sse -msse2 -ffast-math
L3:
cmpb $0, (%ecx,%eax)
jne L2
movsd _m1(%edx,%eax,8), %xmm0
addsd %xmm2, %xmm0
mulsd _m2(%edx,%eax,8), %xmm0
addsd %xmm0, %xmm1
L2:
addl $1, %eax
cmpl $200, %eax
jne L3
You can speed up that code in D unrolling the loop once and using two sum, but the performance gain is not much. In D you can also rewrite that loop in assembly, but this loses the possibility to inline foo() (and in this program the inlining of foo() is important, I think dmd isn't doing it anyway), and I need too much time to write that in asm, it's not immediate at all.
In C + GCC extensions it's almost easy to use XMM registers:
#define N 200
#if (N % 2)
#define N_NEXT_EVEN (N+1)
#else
#define N_NEXT_EVEN N
#endif
double m1[N][N_NEXT_EVEN];
double m2[N][N_NEXT_EVEN];
typedef double double2 __attribute__ ((vector_size(16)));
#define first(pair) (((double*)&pair)[0])
#define second(pair) (((double*)&pair)[1])
double foo2(unsigned char used[N], int current) {
double2 *m2_current2 = (double2*)&(m2[current]);
double2 *m1_current2 = (double2*)&(m1[current]);
double2 one = { 1.0, 1.0 };
double sum = 0.0;
int ipair;
for (ipair = 0; ipair < (N / 2); ipair++)
if ((!used[2 * ipair]) || (!used[2 * ipair + 1])) {
double2 adder = m2_current2[ipair] * (one + m1_current2[ipair]);
if (!used[2 * ipair])
sum += first(adder);
if (!used[2 * ipair + 1])
sum += second(adder);
}
if ((N & 1) && (!used[N-1]))
sum += m2[current][N-1] * (1.0 + m1[current][N-1]);
return sum;
}
int main() {
if ((unsigned int)&m2[0][0] % 16 || (unsigned int)&m1[0][0] % 16)
return 1;
return 0;
}
This C code is not very clean, but it's far better than writing the same thing in asm (there are many other ways to write that code, but this seems efficient).
The 32bit asm of the foo2() loop:
L11:
cmpb $0, 1(%edx)
jne L4
movapd (%ecx,%eax), %xmm0
addpd %xmm4, %xmm0
mulpd (%ebx,%eax), %xmm0
L6:
unpckhpd %xmm0, %xmm0
addsd %xmm0, %xmm1
L4:
addl $16, %eax
addl $2, %edx
cmpl $1600, %eax
je L10
L5:
cmpb $0, (%edx)
jne L11
cmpb $0, 1(%edx)
movapd (%ecx,%eax), %xmm0
addpd %xmm3, %xmm0
mulpd (%ebx,%eax), %xmm0
movapd %xmm0, %xmm2
addsd %xmm2, %xmm1
je L6
addl $16, %eax
addl $2, %edx
cmpl $1600, %eax
jne L5
Writing this by hand, plus a bit of code to set up the loop, can require almost a day or so to me, because I'm slow and not expert in asm yet, while writing that C code has required a short enough time and I've created only one bug along the way.
D language is cuter and more handy because instead of writing something like:
#if (N % 2)
#define N_NEXT_EVEN (N+1)
#else
#define N_NEXT_EVEN N
#endif
I can write something nicer as:
static if (N % 2)
enum int N_NEXT_EVEN = N + 1;
else
enum int N_NEXT_EVEN = N;
Or probably even:
enum int N_NEXT_EVEN = N % 2 ? (N + 1) : N;
But for this program C+GCC give more power, so here C is better despite it looks a little worse.
Eventually I'd like to be able to write code like this in D2 (untested):
// this whole function will need to be inlined
double foo3(ubyte[N] used, int current) {
auto m1_current2 = cast(double[2][])m1[current];
auto m2_current2 = cast(double[2][])m2[current];
double[2] one = [0, 0]; // no heap allocations here
double sum = 0.0;
foreach (ipair; 0 .. N / 2)
if (!used[2 * ipair] || !used[2 * ipair + 1]) {
double[2][] adder = m2_current2[ipair][] * (one[] + m1_current2[ipair][]);
if (!used[2 * ipair])
sum += adder[0];
if (!used[2 * ipair + 1])
sum += adder[1];
}
if ((N & 1) && (!used[N-1]))
sum += m2[current][$-1] * (1.0 + m1[current][$-1]);
return sum;
}
In theory you can already write code like that in D, but the mapping to single SSE2 instructions is not present yet (I have an enhancement request on this for LLVM: http://llvm.org/bugs/show_bug.cgi?id=6956 ).
Bye,
bearophile
More information about the Digitalmars-d
mailing list