DMD floating point performance.

Sun Nov 12 14:57:12 PST 2006

Walter Bright wrote:
> Dave wrote:
>> If you look at the DMD asm, the problem is that each operation is 
>> wrapped by a load/store.. Why wouldn't val and sum be kept in fp 
>> registers inside the loop?
> 
> The issue isn't with D, it's with the back end code generator. You'll 
> see the same thing with the C and C++ compiler.
> 
> Because the FPU is a 'stack' machine rather than a register machine, 
> it's hard to write a code generator that enregisters floating point 

I know this is simplistic, but could something like fxch be used to 'mimick' a register machine as 
vars are stored into registers? fxch is supposed to be very efficient.

> variables. It's also problematical because every function call must 
> empty the FPU stack anyway, and so a lot of register spill logic must be 
> in place for it to be very useful.

Could ffree greatly simply things here?

> Not impossible, just a lot of tricky work. How does GDC do with this?

I don't know..

Here's some code showing relative asm. output:

dmd = 1: 0.671 secs
dmd asm = 1: 0.681 secs
opt asm = 1: 0.274 secs
gdc asm = 1: 0.279 secs

;---

import std.stdio, std.date;

/**
 * Runs each of the four kernels once, timing it with getUTCtime, and prints
 * the kernel's result together with the elapsed time in seconds.
 */
void main()
{
     // Tick-to-seconds divisor, widened to real once so every division
     // below is carried out in floating point.
     real ticksPerSec = cast(real)TicksPerSecond;
     d_time began, ended;
     double result;

     // Compiler-generated loop.
     began = getUTCtime();
     result = fp();
     ended = getUTCtime();
     writefln("dmd = ", result, ": ", (ended - began) / ticksPerSec, " secs");

     // Hand-written approximation of the code DMD emits.
     began = getUTCtime();
     result = fp_dmd_asm();
     ended = getUTCtime();
     writefln("dmd asm = ", result, ": ", (ended - began) / ticksPerSec, " secs");

     // Hand-optimised version keeping both operands on the FPU stack.
     began = getUTCtime();
     result = fp_dmd_opt();
     ended = getUTCtime();
     writefln("opt asm = ", result, ": ", (ended - began) / ticksPerSec, " secs");

     // Hand-written approximation of the code GDC emits.
     began = getUTCtime();
     result = fp_gdc_asm();
     ended = getUTCtime();
     writefln("gdc asm = ", result, ": ", (ended - began) / ticksPerSec, " secs");
}

/**
 * Reference kernel: starting from 1.0, repeatedly adds, subtracts,
 * multiplies by and divides by 0.000001, ten million times over.
 *
 * Returns: the accumulator after the final iteration.
 */
double fp()
{
     double step = 0.000001;
     double acc = 1.0;
     size_t remaining = 10_000_000;

     // Count down instead of up; the body does not use the counter,
     // so only the number of passes matters.
     while (remaining > 0)
     {
         acc = acc + step;
         acc = acc - step;
         acc = acc * step;
         acc = acc / step;
         --remaining;
     }
     return acc;
}

// Module-level operands read by fp_dmd_asm; _sum is also overwritten with
// the result at the end of that function.
double _sum = 1.0, _val = 0.000001;

/**
 * Hand-written approximation of the loop DMD generates for fp(): every
 * operation reloads _val, combines it with the running sum held in a stack
 * slot, and stores the sum straight back to memory.
 *
 * Returns: the final sum, which is also written back to the global _sum.
 */
double fp_dmd_asm() // more or less
{
     // Named scratch slot for the running sum. Declaring it makes the
     // compiler reserve frame space; a raw -8[EBP] in a function with no
     // locals would address memory below the stack pointer.
     double acc;
     asm
     {
                 fld     qword ptr _sum[0];
                 fstp    qword ptr acc[0];
                 xor     EAX,EAX;                // EAX = loop counter
L11:            fld     qword ptr _val[0];
                 fadd    qword ptr acc[0];       // ST = val + sum
                 fstp    qword ptr acc[0];
                 fld     qword ptr _val[0];
                 fsubr   qword ptr acc[0];       // ST = sum - val (reversed form: mem - ST)
                 fstp    qword ptr acc[0];
                 fld     qword ptr _val[0];
                 fmul    qword ptr acc[0];       // ST = val * sum
                 fstp    qword ptr acc[0];
                 fld     qword ptr _val[0];
                 fdivr   qword ptr acc[0];       // ST = sum / val (reversed form: mem / ST)
                 fstp    qword ptr acc[0];
                 inc     EAX;
                 cmp     EAX,10_000_000;
                 jb      L11;
                 fld     qword ptr acc[0];
                 fstp    qword ptr _sum[0];      // publish result; leaves the FPU stack empty
     }
     return _sum;
}

/**
 * Hand-optimised variant of fp(): sum and val are loaded onto the FPU stack
 * once (using fxch to put sum on top) and stay there for the whole loop, so
 * the loop body performs no loads or stores.
 *
 * Returns: the final sum after ten million iterations.
 */
double fp_dmd_opt()
{
     double sum = 1.0, val = 0.000001;
     asm
     {
                 fld     qword ptr sum[0];
                 fld     qword ptr val[0];
                 fxch    ST(1);                  // ST = sum, ST(1) = val
                 xor     EAX,EAX;                // EAX = loop counter
L1:             fadd    ST, ST(1);              // sum += val
                 // Non-reversed forms are required here: with sum in ST,
                 // fsubr/fdivr would compute val - sum and val / sum. That
                 // gave the expected result only because the sign flip
                 // cancels over an even iteration count.
                 fsub    ST, ST(1);              // sum -= val
                 fmul    ST, ST(1);              // sum *= val
                 fdiv    ST, ST(1);              // sum /= val
                 inc     EAX;
                 cmp     EAX,10_000_000;
                 jb      L1;
                 fstp    qword ptr sum[0];       // pop sum
                 fstp    qword ptr val[0];       // pop val, leaving the FPU stack empty
     }
     return sum;
}

/**
 * Hand-written approximation of the loop GDC generates for fp(): val and sum
 * are pushed onto the FPU stack once (sum last, so it ends up on top) and the
 * loop body works purely on stack registers.
 *
 * Returns: the final sum after ten million iterations.
 */
double fp_gdc_asm() // more or less
{
     double sum = 1.0, val = 0.000001;
     asm
     {
                 xor     EAX,EAX;                // EAX = loop counter
                 fld     qword ptr val[0];
                 fld     qword ptr sum[0];       // ST = sum, ST(1) = val
L1:             fadd    ST, ST(1);              // sum += val
                 // Non-reversed forms are required here: with sum in ST,
                 // fsubr/fdivr would compute val - sum and val / sum. That
                 // gave the expected result only because the sign flip
                 // cancels over an even iteration count.
                 fsub    ST, ST(1);              // sum -= val
                 fmul    ST, ST(1);              // sum *= val
                 fdiv    ST, ST(1);              // sum /= val
                 inc     EAX;
                 cmp     EAX,10_000_000;
                 jb      L1;
                 fstp    qword ptr sum[0];       // pop sum
                 fstp    qword ptr val[0];       // pop val, leaving the FPU stack empty
     }
     return sum;
}