One usage of GNU C

Wed Jul 7 10:04:27 PDT 2010

This post is about a recent situation where I have used C instead of D2. It's just a small example, feel free to ignore this post.

This is the reduced version of a small C program (I have written the same program in D2 too), the little foo() is a critical function for the performance of the whole program:

#define N 200

double m1[N][N];
double m2[N][N];

double foo(unsigned char* used, int current) {
    double sum = 0.0;
    int i;
    for (i = 0; i < N; i++)
        if (!used[i])
            sum += m2[current][i] * (1.0 + m1[current][i]);
    return sum;
}

int main() {
    return 0;
}

Where about half of used[] bytes are zero.

This is the 32bit asm of the foo() loop, generated by GCC 4.5 with:
-O3 -fomit-frame-pointer -mfpmath=sse -msse2 -ffast-math

L3:
    cmpb    $0, (%ecx,%eax)
    jne L2
    movsd   _m1(%edx,%eax,8), %xmm0
    addsd   %xmm2, %xmm0
    mulsd   _m2(%edx,%eax,8), %xmm0
    addsd   %xmm0, %xmm1
L2:
    addl    $1, %eax
    cmpl    $200, %eax
    jne L3

You can speed up that code in D unrolling the loop once and using two sum, but the performance gain is not much. In D you can also rewrite that loop in assembly, but this loses the possibility to inline foo() (and in this program the inlining of foo() is important, I think dmd isn't doing it anyway), and I need too much time to write that in asm, it's not immediate at all.

In C + GCC extensions it's almost easy to use XMM registers:

#define N 200

#if (N % 2)
#define N_NEXT_EVEN (N+1)
#else
#define N_NEXT_EVEN N
#endif

double m1[N][N_NEXT_EVEN];
double m2[N][N_NEXT_EVEN];

typedef double double2 __attribute__ ((vector_size(16)));
#define first(pair) (((double*)&pair)[0])
#define second(pair) (((double*)&pair)[1])

double foo2(unsigned char used[N], int current) {
    double2 *m2_current2 = (double2*)&(m2[current]);
    double2 *m1_current2 = (double2*)&(m1[current]);
    double2 one = { 1.0, 1.0 };
    double sum = 0.0;

    int ipair;
    for (ipair = 0; ipair < (N / 2); ipair++)
        if ((!used[2 * ipair]) || (!used[2 * ipair + 1])) {
            double2 adder = m2_current2[ipair] * (one + m1_current2[ipair]);

            if (!used[2 * ipair])
                sum += first(adder);
            if (!used[2 * ipair + 1])
                sum += second(adder);
        }

    if ((N & 1) && (!used[N-1]))
        sum += m2[current][N-1] * (1.0 + m1[current][N-1]);

    return sum;
}

int main() {
    if ((unsigned int)&m2[0][0] % 16 || (unsigned int)&m1[0][0] % 16)
        return 1;
    return 0;
}

This C code is not very clean, but it's far better than writing the same thing in asm (there are many other ways to write that code, but this seems efficient).

The 32bit asm of the foo2() loop:

L11:
    cmpb    $0, 1(%edx)
    jne L4
    movapd  (%ecx,%eax), %xmm0
    addpd   %xmm4, %xmm0
    mulpd   (%ebx,%eax), %xmm0
L6:
    unpckhpd    %xmm0, %xmm0
    addsd   %xmm0, %xmm1
L4:
    addl    $16, %eax
    addl    $2, %edx
    cmpl    $1600, %eax
    je  L10
L5:
    cmpb    $0, (%edx)
    jne L11
    cmpb    $0, 1(%edx)
    movapd  (%ecx,%eax), %xmm0
    addpd   %xmm3, %xmm0
    mulpd   (%ebx,%eax), %xmm0
    movapd  %xmm0, %xmm2
    addsd   %xmm2, %xmm1
    je  L6
    addl    $16, %eax
    addl    $2, %edx
    cmpl    $1600, %eax
    jne L5

Writing this by hand, plus a bit of code to set up the loop, can require almost a day or so to me, because I'm slow and not expert in asm yet, while writing that C code has required a short enough time and I've created only one bug along the way.

D language is cuter and more handy because instead of writing something like:

#if (N % 2)
#define N_NEXT_EVEN (N+1)
#else
#define N_NEXT_EVEN N
#endif

I can write something nicer as:

static if (N % 2)
    enum int N_NEXT_EVEN = N + 1;
else
    enum int N_NEXT_EVEN = N;

Or probably even:

enum int N_NEXT_EVEN = N % 2 ? (N + 1) : N;

But for this program C+GCC give more power, so here C is better despite it looks a little worse.

Eventually I'd like to be able to write code like this in D2 (untested):

// this whole function will need to be inlined
double foo3(ubyte[N] used, int current) {
    auto m1_current2 = cast(double[2][])m1[current];
    auto m2_current2 = cast(double[2][])m2[current];
    double[2] one = [0, 0]; // no heap allocations here
    double sum = 0.0;

    foreach (ipair; 0 .. N / 2)
        if (!used[2 * ipair] || !used[2 * ipair + 1]) {
            double[2][] adder = m2_current2[ipair][] * (one[] + m1_current2[ipair][]);

            if (!used[2 * ipair])
                sum += adder[0];
            if (!used[2 * ipair + 1])
                sum += adder[1];
        }

    if ((N & 1) && (!used[N-1]))
        sum += m2[current][$-1] * (1.0 + m1[current][$-1]);

    return sum;
}

In theory you can already write code like that in D, but the mapping to single SSE2 instructions is not present yet (I have an enhancement request on this for LLVM: http://llvm.org/bugs/show_bug.cgi?id=6956 ).

Bye,
bearophile