From a C++/JS benchmark

Wed Aug 3 18:25:18 PDT 2011

Trass3r:

> I'm afraid not. dmd's backend isn't good at floating point calculations.

After studying the asm a bit, it's not hard to find the cause, because this benchmark is quite pure (it's synthetic, although I think it comes from real-world code).

This is what G++ generates from the C++ code without intrinsics (the version that uses SIMD intrinsics looks similar, but is shorter):

movl  (%eax), %edx
movss  4(%eax), %xmm0
movl  8(%eax), %ecx
leal  (%edx,%edx,2), %edx
sall  $4, %edx
addl  %ebx, %edx
testl  %ecx, %ecx
movss  12(%edx), %xmm1
movss  20(%edx), %xmm7
movss  (%edx), %xmm5
mulss  %xmm0, %xmm1
mulss  %xmm0, %xmm7
movss  4(%edx), %xmm6
movss  8(%edx), %xmm4
movss  %xmm1, (%esp)
mulss  %xmm0, %xmm5
movss  28(%edx), %xmm1
movss  %xmm7, 4(%esp)
mulss  %xmm0, %xmm6
movss  32(%edx), %xmm7
mulss  %xmm0, %xmm1
movss  16(%edx), %xmm3
mulss  %xmm0, %xmm7
movss  24(%edx), %xmm2
movss  %xmm1, 16(%esp)
mulss  %xmm0, %xmm4
movss  36(%edx), %xmm1
movss  %xmm7, 8(%esp)
mulss  %xmm0, %xmm3
movss  40(%edx), %xmm7
mulss  %xmm0, %xmm2
mulss  %xmm0, %xmm1
mulss  %xmm0, %xmm7
mulss  44(%edx), %xmm0
leal  12(%eax), %edx
movss  %xmm7, 12(%esp)
movss  %xmm0, 20(%esp)

This is what DMD generates for the same (or quite similar) piece of code:

movsd
mov  EAX,068h[ESP]
imul  EDX,EAX,030h
add  EDX,018h[ESP]
fld  float ptr [EDX]
fmul  float ptr 06Ch[ESP]
fstp  float ptr 038h[ESP]
fld  float ptr 4[EDX]
fmul  float ptr 06Ch[ESP]
fstp  float ptr 03Ch[ESP]
fld  float ptr 8[EDX]
fmul  float ptr 06Ch[ESP]
fstp  float ptr 040h[ESP]
fld  float ptr 0Ch[EDX]
fmul  float ptr 06Ch[ESP]
fstp  float ptr 044h[ESP]
fld  float ptr 010h[EDX]
fmul  float ptr 06Ch[ESP]
fstp  float ptr 048h[ESP]
fld  float ptr 014h[EDX]
fmul  float ptr 06Ch[ESP]
fstp  float ptr 04Ch[ESP]
fld  float ptr 018h[EDX]
fmul  float ptr 06Ch[ESP]
fstp  float ptr 050h[ESP]
fld  float ptr 01Ch[EDX]
mov  CL,070h[ESP]
xor  CL,1
fmul  float ptr 06Ch[ESP]
fstp  float ptr 054h[ESP]
fld  float ptr 020h[EDX]
fmul  float ptr 06Ch[ESP]
fstp  float ptr 058h[ESP]
fld  float ptr 024h[EDX]
fmul  float ptr 06Ch[ESP]
fstp  float ptr 05Ch[ESP]
fld  float ptr 028h[EDX]
fmul  float ptr 06Ch[ESP]
fstp  float ptr 060h[ESP]
fld  float ptr 02Ch[EDX]
fmul  float ptr 06Ch[ESP]
fstp  float ptr 064h[ESP]

I think the DMD back-end already contains logic to use xmm registers as true registers (not as a floating point stack, or as temporary slots that FP values are pushed into and pulled out of), so I suspect it wouldn't take too much work to modify it to emit FP asm with a single optimization: just keep the values inside registers. In my uninformed opinion, all other FP optimizations are almost insignificant compared to this one :-)

Bye,
bearophile