Performance

Fri May 30 08:50:40 PDT 2014

On 5/30/2014 9:30 AM, bearophile wrote:
>> double plus(in uint nSteps) pure nothrow @safe /*@nogc*/ {
>>     enum double p0 = 0.0045;
>>     enum double p1 = 1.00045452-p0;
>>
>>     double tot = 1.346346;
>>     auto b = true;
>>
>>     foreach (immutable i; 0 .. nSteps) {
>>         final switch (b) {
>>             case true:
>>                 tot += p0;
>>                 break;
>>             case false:
>>                 tot += p1;
>>                 break;
>>         }
>>
>>         b = !b;
>>     }
>>
>>     return tot;
>> }
>
> And this is the 32 bit X86 asm generated by ldc2 for the plus function:
>
> __D4test4plusFNaNbNfxkZd:
>      pushl    %ebp
>      movl    %esp, %ebp
>      pushl    %esi
>      andl    $-8, %esp
>      subl    $24, %esp
>      movsd    LCPI0_0, %xmm0
>      testl    %eax, %eax
>      je    LBB0_8
>      xorl    %ecx, %ecx
>      movb    $1, %dl
>      movsd    LCPI0_1, %xmm1
>      movsd    LCPI0_2, %xmm2
>      .align    16, 0x90
> LBB0_2:
>      testb    $1, %dl
>      jne    LBB0_3
>      addsd    %xmm1, %xmm0
>      jmp    LBB0_7
>      .align    16, 0x90
> LBB0_3:
>      movzbl    %dl, %esi
>      andl    $1, %esi
>      je    LBB0_5
>      addsd    %xmm2, %xmm0
> LBB0_7:
>      xorb    $1, %dl
>      incl    %ecx
>      cmpl    %eax, %ecx
>      jb    LBB0_2
> LBB0_8:
>      movsd    %xmm0, 8(%esp)
>      fldl    8(%esp)
>      leal    -4(%ebp), %esp
>      popl    %esi
>      popl    %ebp
>      ret
> LBB0_5:
>      movl    $11, 4(%esp)
>      movl    $__D4test12__ModuleInfoZ, (%esp)
>      calll    __d_switch_error
>
> Bye,
> bearophile

Well, I'd argue that in fact neither the C++ nor the D version compiled
to the fastest possible code here, as this code will result in at least
3 branches, likely more, and potentially every branch, being
mispredicted. After checking the throughput numbers for fadd (I only
checked Haswell), I would say that the fastest code here would actually
compute both sides of the branch and use a set of 4 cmovs (because it's
x86 and we're working with doubles) to select the result we need to
carry forward.