Loop optimization

Fri May 14 03:22:54 PDT 2010

kai:

> I was evaluating using D for some numerical stuff.

For that evaluation you probably have to use the LDC compiler, that is able to optimize better.

> 	void main (string[] args)
> 	{
> 		double [] foo = new double [cast(int)1e6];
> 		for (int i=0;i<1e3;i++)
> 		{
> 			for (int j=0;j<1e6-1;j++)
> 			{
> 				foo[j]=foo[j]+foo[j+1];
> 			}
> 		}
> 	}

Using floating point for indexes and lengths is not a good practice. In D large numbers are written like 1_000_000.
Use -release too.

> Any ideas? Am I somehow not hitting a vital compiler optimization?

DMD compiler doesn't perform many optimizations, especially on floating point computations.
But the bigger problem in your code is that you are performing operations on NaNs (that's the default initalization of FP values in D), and operations on NaNs are usually quite slower.

Your code in C:

#include "stdio.h"
#include "stdlib.h"
#define N 1000000

int main() {
    double *foo = calloc(N, sizeof(double)); // malloc suffices here
    int i, j;
    for (j = 0; j < N; j++)
        foo[j] = 1.0;

    for (i = 0; i < 1000; i++)
        for (j = 0; j < N-1; j++)
            foo[j] = foo[j] + foo[j + 1];

    printf("%f", foo[N-1]);
    return 0;
}

/*
gcc -O3 -s -Wall test.c -o test
Timings, outer loop=1_000 times: 7.72

------------------

gcc -Wall -O3 -fomit-frame-pointer -msse3 -march=native test.c -o test
(Running on a VirtualBox)
Timings, outer loop=1_000 times: 7.69 s
Just the inner loop:
.L7:
    fldl    8(%edx)
    fadd    %st, %st(1)
    fxch    %st(1)
    fstpl   (%edx)
    addl    $8, %edx
    cmpl    %ecx, %edx
    jne .L7
*/

--------------------

Your code in D1:

version (Tango)
    import tango.stdc.stdio: printf;
else
    import std.c.stdio: printf;

void main() {
    const int N = 1_000_000;
    double[] foo = new double[N];
    foo[] = 1.0;

    for (int i = 0; i < 1_000; i++)
        for (int j = 0; j < N-1; j++)
            foo[j] = foo[j] + foo[j + 1];

    printf("%f", foo[N-1]);
}

/*
dmd -O -release -inline test.d
(Not running on a VirtualBox)
Timings, outer loop=1_000 times: 9.35 s
Just the inner loop:
L34:    fld qword ptr 8[EDX*8][ECX]
        fadd    qword ptr [EDX*8][ECX]
        fstp    qword ptr [EDX*8][ECX]
        inc EDX
        cmp EDX,0F423Fh
        jb  L34

-----------------------

ldc -O3 -release -inline test.d
(Running on a VirtualBox)
Timings, outer loop=1_000 times: 7.87 s
Just the inner loop:
.LBB1_2:
    movsd   (%eax,%ecx,8), %xmm0
    addsd   8(%eax,%ecx,8), %xmm0
    movsd   %xmm0, (%eax,%ecx,8)
    incl    %ecx
    cmpl    $999999, %ecx
    jne .LBB1_2

-----------------------

ldc -unroll-allow-partial -O3 -release -inline test.d
(Running on a VirtualBox)
Timings, outer loop=1_000 times: 7.75 s
Just the inner loop:
.LBB1_2:
    movsd   (%eax,%ecx,8), %xmm0
    addsd   8(%eax,%ecx,8), %xmm0
    movsd   %xmm0, (%eax,%ecx,8)
    movsd   8(%eax,%ecx,8), %xmm0
    addsd   16(%eax,%ecx,8), %xmm0
    movsd   %xmm0, 8(%eax,%ecx,8)
    movsd   16(%eax,%ecx,8), %xmm0
    addsd   24(%eax,%ecx,8), %xmm0
    movsd   %xmm0, 16(%eax,%ecx,8)
    movsd   24(%eax,%ecx,8), %xmm0
    addsd   32(%eax,%ecx,8), %xmm0
    movsd   %xmm0, 24(%eax,%ecx,8)
    movsd   32(%eax,%ecx,8), %xmm0
    addsd   40(%eax,%ecx,8), %xmm0
    movsd   %xmm0, 32(%eax,%ecx,8)
    movsd   40(%eax,%ecx,8), %xmm0
    addsd   48(%eax,%ecx,8), %xmm0
    movsd   %xmm0, 40(%eax,%ecx,8)
    movsd   48(%eax,%ecx,8), %xmm0
    addsd   56(%eax,%ecx,8), %xmm0
    movsd   %xmm0, 48(%eax,%ecx,8)
    movsd   56(%eax,%ecx,8), %xmm0
    addsd   64(%eax,%ecx,8), %xmm0
    movsd   %xmm0, 56(%eax,%ecx,8)
    movsd   64(%eax,%ecx,8), %xmm0
    addsd   72(%eax,%ecx,8), %xmm0
    movsd   %xmm0, 64(%eax,%ecx,8)
    addl    $9, %ecx
    cmpl    $999999, %ecx
    jne .LBB1_2
*/

As you see the code generated by ldc is about as good as the one generated by gcc. There are of course other ways to optimize this code...

Bye,
bearophile