Loop optimization
bearophile
bearophileHUGS at lycos.com
Fri May 14 03:22:54 PDT 2010
kai:
> I was evaluating using D for some numerical stuff.
For that evaluation you probably have to use the LDC compiler, that is able to optimize better.
> void main (string[] args)
> {
> double [] foo = new double [cast(int)1e6];
> for (int i=0;i<1e3;i++)
> {
> for (int j=0;j<1e6-1;j++)
> {
> foo[j]=foo[j]+foo[j+1];
> }
> }
> }
Using floating point for indexes and lengths is not a good practice. In D large numbers are written like 1_000_000.
Use -release too.
> Any ideas? Am I somehow not hitting a vital compiler optimization?
DMD compiler doesn't perform many optimizations, especially on floating point computations.
But the bigger problem in your code is that you are performing operations on NaNs (that's the default initalization of FP values in D), and operations on NaNs are usually quite slower.
Your code in C:
#include "stdio.h"
#include "stdlib.h"
#define N 1000000
int main() {
double *foo = calloc(N, sizeof(double)); // malloc suffices here
int i, j;
for (j = 0; j < N; j++)
foo[j] = 1.0;
for (i = 0; i < 1000; i++)
for (j = 0; j < N-1; j++)
foo[j] = foo[j] + foo[j + 1];
printf("%f", foo[N-1]);
return 0;
}
/*
gcc -O3 -s -Wall test.c -o test
Timings, outer loop=1_000 times: 7.72
------------------
gcc -Wall -O3 -fomit-frame-pointer -msse3 -march=native test.c -o test
(Running on a VirtualBox)
Timings, outer loop=1_000 times: 7.69 s
Just the inner loop:
.L7:
fldl 8(%edx)
fadd %st, %st(1)
fxch %st(1)
fstpl (%edx)
addl $8, %edx
cmpl %ecx, %edx
jne .L7
*/
--------------------
Your code in D1:
version (Tango)
import tango.stdc.stdio: printf;
else
import std.c.stdio: printf;
void main() {
const int N = 1_000_000;
double[] foo = new double[N];
foo[] = 1.0;
for (int i = 0; i < 1_000; i++)
for (int j = 0; j < N-1; j++)
foo[j] = foo[j] + foo[j + 1];
printf("%f", foo[N-1]);
}
/*
dmd -O -release -inline test.d
(Not running on a VirtualBox)
Timings, outer loop=1_000 times: 9.35 s
Just the inner loop:
L34: fld qword ptr 8[EDX*8][ECX]
fadd qword ptr [EDX*8][ECX]
fstp qword ptr [EDX*8][ECX]
inc EDX
cmp EDX,0F423Fh
jb L34
-----------------------
ldc -O3 -release -inline test.d
(Running on a VirtualBox)
Timings, outer loop=1_000 times: 7.87 s
Just the inner loop:
.LBB1_2:
movsd (%eax,%ecx,8), %xmm0
addsd 8(%eax,%ecx,8), %xmm0
movsd %xmm0, (%eax,%ecx,8)
incl %ecx
cmpl $999999, %ecx
jne .LBB1_2
-----------------------
ldc -unroll-allow-partial -O3 -release -inline test.d
(Running on a VirtualBox)
Timings, outer loop=1_000 times: 7.75 s
Just the inner loop:
.LBB1_2:
movsd (%eax,%ecx,8), %xmm0
addsd 8(%eax,%ecx,8), %xmm0
movsd %xmm0, (%eax,%ecx,8)
movsd 8(%eax,%ecx,8), %xmm0
addsd 16(%eax,%ecx,8), %xmm0
movsd %xmm0, 8(%eax,%ecx,8)
movsd 16(%eax,%ecx,8), %xmm0
addsd 24(%eax,%ecx,8), %xmm0
movsd %xmm0, 16(%eax,%ecx,8)
movsd 24(%eax,%ecx,8), %xmm0
addsd 32(%eax,%ecx,8), %xmm0
movsd %xmm0, 24(%eax,%ecx,8)
movsd 32(%eax,%ecx,8), %xmm0
addsd 40(%eax,%ecx,8), %xmm0
movsd %xmm0, 32(%eax,%ecx,8)
movsd 40(%eax,%ecx,8), %xmm0
addsd 48(%eax,%ecx,8), %xmm0
movsd %xmm0, 40(%eax,%ecx,8)
movsd 48(%eax,%ecx,8), %xmm0
addsd 56(%eax,%ecx,8), %xmm0
movsd %xmm0, 48(%eax,%ecx,8)
movsd 56(%eax,%ecx,8), %xmm0
addsd 64(%eax,%ecx,8), %xmm0
movsd %xmm0, 56(%eax,%ecx,8)
movsd 64(%eax,%ecx,8), %xmm0
addsd 72(%eax,%ecx,8), %xmm0
movsd %xmm0, 64(%eax,%ecx,8)
addl $9, %ecx
cmpl $999999, %ecx
jne .LBB1_2
*/
As you see the code generated by ldc is about as good as the one generated by gcc. There are of course other ways to optimize this code...
Bye,
bearophile
More information about the Digitalmars-d-learn
mailing list