iasm, unexpectedly slower than DMD production

Sun Sep 11 17:46:16 PDT 2016

I have this function, written in iasm:

°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°
/**
 * Evaluates the polynomial `x*x*x - x*x*c + x*c`.
 *
 * Two implementations are kept side by side: a plain D expression
 * (disabled with `version(none)`; switch it to `version(all)` to use it)
 * and a hand-written `naked` inline-assembly routine.
 *
 * The assembly relies on the register assignment DMD uses for this
 * function on x86-64: `c` arrives in XMM0, `x` arrives in XMM1 and the
 * result is returned in XMM0. It evaluates `(x*x*x + c*x) - c*x*x`,
 * i.e. the same polynomial with a different association order.
 *
 * The scalar instructions are selected from `T`: the `*sd` forms for
 * `double` and the `*ss` forms for `float`. Using the `*sd` forms on
 * `float` arguments would interpret single-precision bit patterns as
 * doubles and produce garbage.
 *
 * Params:
 *     x = the polynomial variable
 *     c = the coefficient
 * Returns: the value of the polynomial, in the same type as the inputs.
 */
T foo(T)(T x, T c)
if (is(T == float) || is(T == double))
{

     version(none)
     {
         return x*x*x - x*x*c + x*c;
     }
     else
     {
         static if (is(T == double))
         {
             asm
             {
                 naked;
                 movsd   XMM3, XMM1;     // XMM3 = x
                 mulsd   XMM0, XMM1;     // XMM0 = c*x
                 mulsd   XMM1, XMM1;     // XMM1 = x*x
                 mulsd   XMM1, XMM3;     // XMM1 = x*x*x
                 addsd   XMM1, XMM0;     // XMM1 = x*x*x + c*x
                 mulsd   XMM0, XMM3;     // XMM0 = c*x*x
                 subsd   XMM1, XMM0;     // XMM1 = x*x*x + c*x - c*x*x
                 movsd   XMM0, XMM1;     // result goes back in XMM0
                 ret;
             }
         }
         else
         {
             // Same sequence with the single-precision scalar instructions.
             asm
             {
                 naked;
                 movss   XMM3, XMM1;     // XMM3 = x
                 mulss   XMM0, XMM1;     // XMM0 = c*x
                 mulss   XMM1, XMM1;     // XMM1 = x*x
                 mulss   XMM1, XMM3;     // XMM1 = x*x*x
                 addss   XMM1, XMM0;     // XMM1 = x*x*x + c*x
                 mulss   XMM0, XMM3;     // XMM0 = c*x*x
                 subss   XMM1, XMM0;     // XMM1 = x*x*x + c*x - c*x*x
                 movss   XMM0, XMM1;     // result goes back in XMM0
                 ret;
             }
         }
     }
}

/// Times 1_000 calls of `foo(0.2, 0.2)` and prints the measured duration.
void main()
{
     // compile with -O -release
     import std.datetime : benchmark;
     import std.stdio : writeln;

     // benchmark returns one timing per benchmarked function; only one is given here.
     auto timings = benchmark!({auto result = foo(0.2,0.2);})(1_000);
     writeln(timings[0]);
}
°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°

The code DMD generates for the non-iasm version is:

;------- SUB 00000000004A7BD8h -------
00000000004A7BD8h  sub rsp, 18h
00000000004A7BDCh  movsd xmm5, xmm0
00000000004A7BE0h  movsd xmm4, xmm1
00000000004A7BE4h  movsd qword ptr [rsp], xmm1
00000000004A7BE9h  movsd xmm0, qword ptr [rsp]
00000000004A7BEEh  mulsd xmm0, xmm5
00000000004A7BF2h  mulsd xmm1, xmm4
00000000004A7BF6h  mulsd xmm1, xmm4
00000000004A7BFAh  movsd xmm2, xmm4
00000000004A7BFFh  mulsd xmm2, xmm4
00000000004A7C03h  mulsd xmm2, xmm5
00000000004A7C07h  subsd xmm1, xmm2
00000000004A7C0Bh  addsd xmm0, xmm1
00000000004A7C0Fh  add rsp, 18h
00000000004A7C13h  ret
;-------------------------------------

When I change the version(none) to version(all), the benchmark is 
**7X** faster (e.g. 410 versus 3000 for the iasm version).

This difference doesn't look normal at all.
Does anyone know why? The use of the stack to move xmm1 into 
xmm0 is particularly strange...