Low hanging fruit for optimizing loops

Fri Jun 7 21:15:22 PDT 2013

Given the recent surge in interest for performance, I dusted
off a small test that I made long ago and determined myself
to find the cause of the performance difference.

I tested the linear version of fibonacci both in DMD and
in C with GCC. Without optimization switches, I'm happy
to see that the D version is faster. But when using the
switches, the C version takes 30% less time.

I'm including the dissasembly here. The asm functions
are very very close to each other. Both loops are only
6 instructions long.

So I think it is a low hanging fruit because IMO the
speed difference is either because the GCC loop jump is
64bit aligned, or because of the order of the instructions
(there is an extra instruction between the loop increment
and the loop comparison, giving an opportunity for
parallelization).

OS:
    Kubuntu Linux 12.04  64bit

CPU:
    2.1GHz - AMD Athlon(tm) 64 X2 Dual Core Processor 4000+

Switches:
    dmd -O -inline -noboundscheck -release dtest.d
    gcc -O3 -o ctest ctest.c

Times:
    D:   1 sec, 430 ms, 207 μs, and 4 hnsecs
    C:   940 ms

D version:
---------

     import std.stdio;
     import std.datetime;

     int fibw(int n) { //Linear Fibonacci
         int a = 1;
         int b = 1;
         for (int i=2; i <= n; ++i) {
             int sum = a + b;
             a = b;
             b = sum;
         }
         return b;
     }

     void main() {
         auto start = Clock.currTime();
         int r = fibw(1000_000_000);
         auto elapsed = Clock.currTime() - start;
         writeln(r);
         writeln(elapsed);
     }

C Version:
---------

#include <stdio.h>
#include <time.h>

int fibw(int n) { //Linear Fibonacci
     int a = 1;
     int b = 1;
     int i;
     for (i=2; i <= n; ++i) {
         int sum = a + b;
         a = b;
         b = sum;
     }
     return b;
}

int main() {
     clock_t start = clock();
     int r = fibw(1000*1000*1000);
     clock_t elapsed = clock() - start;
     printf("%d\n", r);
     printf("%d ms\n", (int)(elapsed * 1000 / CLOCKS_PER_SEC));
     return 0;
}

D Version DISASM:
----------------
00000000004681d0 <_D5dtest4fibwFiZi>:
   4681d0:   55                 push   rbp
   4681d1:   48 8b ec           mov    rbp,rsp
   4681d4:   48 89 f9           mov    rcx,rdi
   4681d7:   be 01 00 00 00     mov    esi,0x1
   4681dc:   b8 01 00 00 00     mov    eax,0x1
   4681e1:   ba 02 00 00 00     mov    edx,0x2
   4681e6:   83 f9 02           cmp    ecx,0x2
   4681e9:   7c 0f              jl     4681fa 
<_D5dtest4fibwFiZi+0x2a>
                                ; LOOP JUMP --->
   4681eb:   8d 3c 06           lea    edi,[rsi+rax*1]
   4681ee:   48 89 c6           mov    rsi,rax
   4681f1:   48 89 f8           mov    rax,rdi
   4681f4:   ff c2              inc    edx
   4681f6:   39 ca              cmp    edx,ecx
   4681f8:   7e f1              jle    4681eb 
<_D5dtest4fibwFiZi+0x1b>
                                ; LOOP END
   4681fa:   5d                 pop    rbp
   4681fb:   c3                 ret

C Version DISASM:
----------------
0000000000400860 <fibw>:
   400860:   83 ff 01          cmp    edi,0x1
   400863:   b8 01 00 00 00    mov    eax,0x1
   400868:   7e 1c             jle    400886 <fibw+0x26>
   40086a:   ba 02 00 00 00    mov    edx,0x2
   40086f:   b9 01 00 00 00    mov    ecx,0x1
                               ; NOTICE THE nop (64bit 
alignment??):
   400874:   0f 1f 40 00       nop    DWORD PTR [rax+0x0]
                               ; LOOP JUMP -->
   400878:   8d 34 01          lea    esi,[rcx+rax*1]
   40087b:   83 c2 01          add    edx,0x1
   40087e:   89 c1             mov    ecx,eax    ; REORDERED cmp
   400880:   39 d7             cmp    edi,edx
   400882:   89 f0             mov    eax,esi
   400884:   7d f2             jge    400878 <fibw+0x18>
                               ; LOOP END
   400886:   f3 c3             repz ret
   400888:   90                nop
   400889:   90                nop

both files were dissasembled with:

        objdump -M intel -d

--jm