[Issue 18627] std.complex is a lot slower than builtin complex types at number crunching

Fri Apr 16 14:45:17 UTC 2021

https://issues.dlang.org/show_bug.cgi?id=18627

Iain Buclaw <ibuclaw at gdcproject.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
             Status|RESOLVED                    |REOPENED
         Resolution|FIXED                       |---

--- Comment #15 from Iain Buclaw <ibuclaw at gdcproject.org> ---
Not sure if this should really be marked as resolved/fixed, but anyhow...

With the following (lazy) function generator:
---
import std.complex : C = Complex;
import std.meta : AliasSeq;
import std.format : format;

static foreach (T; AliasSeq!(cfloat, cdouble, creal))
{
    // Unary operators
    mixin(format!"%s %s_unary_add(%s a) { return +a; }"
          (T.stringof, T.stringof, T.stringof));
    mixin(format!"%s %s_unary_sub(%s a) { return -a; }"
          (T.stringof, T.stringof, T.stringof));

    // Binary operators
    mixin(format!"%s %s_binary_add(%s a, %s b) { return a + b; }"
          (T.stringof, T.stringof, T.stringof, T.stringof));
    mixin(format!"%s %s_binary_sub(%s a, %s b) { return a - b; }"
          (T.stringof, T.stringof, T.stringof, T.stringof));
    mixin(format!"%s %s_binary_mul(%s a, %s b) { return a * b; }"
          (T.stringof, T.stringof, T.stringof, T.stringof));
    mixin(format!"%s %s_binary_div(%s a, %s b) { return a / b; }"
          (T.stringof, T.stringof, T.stringof, T.stringof));
}

static foreach (T; AliasSeq!(float, double, real))
{
    // Unary operators
    mixin(format!"C!%s std_c%s_unary_add(C!%s a) { return +a; }"
          (T.stringof, T.stringof, T.stringof));
    mixin(format!"C!%s std_c%s_unary_sub(C!%s a) { return -a; }"
          (T.stringof, T.stringof, T.stringof));

    // Binary operators
    mixin(format!"C!%s std_c%s_binary_add(C!%s a, C!%s b) { return a + b; }"
          (T.stringof, T.stringof, T.stringof, T.stringof));
    mixin(format!"C!%s std_c%s_binary_sub(C!%s a, C!%s b) { return a - b; }"
          (T.stringof, T.stringof, T.stringof, T.stringof));
    mixin(format!"C!%s std_c%s_binary_mul(C!%s a, C!%s b) { return a * b; }"
          (T.stringof, T.stringof, T.stringof, T.stringof));
    mixin(format!"C!%s std_c%s_binary_div(C!%s a, C!%s b) { return a / b; }"
          (T.stringof, T.stringof, T.stringof, T.stringof));
}
---

On x86_64/GDC, the results are:
========================================

cfloat_unary_add:
        movq    %xmm0, -8(%rsp)
        movss   -8(%rsp), %xmm0
        movss   %xmm0, -16(%rsp)
        movss   -4(%rsp), %xmm0
        movss   %xmm0, -12(%rsp)
        movq    -16(%rsp), %xmm0
        ret
---
std_cfloat_unary_add:
        ret

========================================

cdouble_unary_add:
        ret
---
std_cdouble_unary_add:
        ret

========================================

creal_unary_add:
        fldt    8(%rsp)
        fldt    24(%rsp)
        fxch    %st(1)
        ret
---
std_creal_unary_add:
        movdqa  8(%rsp), %xmm0
        movdqa  24(%rsp), %xmm1
        movq    %rdi, %rax
        movaps  %xmm0, (%rdi)
        movaps  %xmm1, 16(%rdi)
        ret

========================================

cfloat_unary_sub:
        movq    %xmm0, -8(%rsp)
        movss   -8(%rsp), %xmm0
        movss   .LC4(%rip), %xmm2
        movaps  %xmm0, %xmm1
        movss   -4(%rsp), %xmm0
        xorps   %xmm2, %xmm1
        xorps   %xmm2, %xmm0
        movss   %xmm1, -16(%rsp)
        movss   %xmm0, -12(%rsp)
        movq    -16(%rsp), %xmm0
        ret
.LC4:
        .long   -2147483648
        .long   0
        .long   0
        .long   0
---
std_cfloat_unary_sub:
        movq    .LC7(%rip), %xmm1
        xorps   %xmm1, %xmm0
        ret
.LC7:
        .long   -2147483648
        .long   -2147483648

========================================

cdouble_unary_sub:
        movq    .LC5(%rip), %xmm2
        xorpd   %xmm2, %xmm1
        xorpd   %xmm2, %xmm0
        ret
.LC5:
        .long   0
        .long   -2147483648
        .long   0
        .long   0
---
std_cdouble_unary_sub:
        movq    %xmm0, -24(%rsp)
        movq    %xmm1, -16(%rsp)
        movapd  -24(%rsp), %xmm2
        xorpd   .LC8(%rip), %xmm2
        movaps  %xmm2, -24(%rsp)
        movsd   -16(%rsp), %xmm1
        movsd   -24(%rsp), %xmm0
        ret
.LC8:
        .long   0
        .long   -2147483648
        .long   0
        .long   -2147483648

========================================

creal_unary_sub:
        fldt    8(%rsp)
        fchs
        fldt    24(%rsp)
        fchs
        fxch    %st(1)
        ret
---
std_creal_unary_sub:
        fldt    24(%rsp)
        movq    %rdi, %rax
        fchs
        fldt    8(%rsp)
        fchs
        fstpt   (%rdi)
        fstpt   16(%rdi)
        ret

========================================

cfloat_binary_add:
        movq    %xmm0, -8(%rsp)
        movq    %xmm1, -16(%rsp)
        movss   -8(%rsp), %xmm1
        movss   -16(%rsp), %xmm0
        addss   %xmm0, %xmm1
        movss   -12(%rsp), %xmm0
        addss   -4(%rsp), %xmm0
        movss   %xmm1, -24(%rsp)
        movss   %xmm0, -20(%rsp)
        movq    -24(%rsp), %xmm0
        ret
---
std_cfloat_binary_add:
        addps   %xmm1, %xmm0
        ret

========================================

cdouble_binary_add:
        addsd   %xmm3, %xmm1
        addsd   %xmm2, %xmm0
        ret
---
std_cdouble_binary_add:
        movq    %xmm0, -40(%rsp)
        movq    %xmm1, -32(%rsp)
        movq    %xmm2, -24(%rsp)
        movq    %xmm3, -16(%rsp)
        movapd  -24(%rsp), %xmm4
        addpd   -40(%rsp), %xmm4
        movaps  %xmm4, -40(%rsp)
        movsd   -32(%rsp), %xmm1
        movsd   -40(%rsp), %xmm0
        ret

========================================

creal_binary_add:
        fldt    8(%rsp)
        fldt    40(%rsp)
        faddp   %st, %st(1)
        fldt    24(%rsp)
        fldt    56(%rsp)
        faddp   %st, %st(1)
        fxch    %st(1)
        ret
---
std_creal_binary_add:
        fldt    24(%rsp)
        movq    %rdi, %rax
        fldt    56(%rsp)
        faddp   %st, %st(1)
        fldt    40(%rsp)
        fldt    8(%rsp)
        faddp   %st, %st(1)
        fstpt   (%rdi)
        fstpt   16(%rdi)
        ret

========================================

cfloat_binary_sub:
        movq    %xmm0, -8(%rsp)
        movss   -8(%rsp), %xmm0
        movq    %xmm1, -16(%rsp)
        movaps  %xmm0, %xmm1
        movss   -4(%rsp), %xmm0
        subss   -16(%rsp), %xmm1
        subss   -12(%rsp), %xmm0
        movss   %xmm1, -24(%rsp)
        movss   %xmm0, -20(%rsp)
        movq    -24(%rsp), %xmm0
        ret
---
std_cfloat_binary_sub:
        subps   %xmm1, %xmm0
        ret

========================================

cdouble_binary_sub:
        subsd   %xmm3, %xmm1
        subsd   %xmm2, %xmm0
        ret
---
std_cdouble_binary_sub:
        movq    %xmm0, -40(%rsp)
        movq    %xmm1, -32(%rsp)
        movapd  -40(%rsp), %xmm4
        movq    %xmm2, -24(%rsp)
        movq    %xmm3, -16(%rsp)
        subpd   -24(%rsp), %xmm4
        movaps  %xmm4, -40(%rsp)
        movsd   -32(%rsp), %xmm1
        movsd   -40(%rsp), %xmm0
        ret

========================================

creal_binary_sub:
        fldt    8(%rsp)
        fldt    40(%rsp)
        fsubrp  %st, %st(1)
        fldt    24(%rsp)
        fldt    56(%rsp)
        fsubrp  %st, %st(1)
        fxch    %st(1)
        ret
---
std_creal_binary_sub:
        fldt    24(%rsp)
        movq    %rdi, %rax
        fldt    56(%rsp)
        fsubrp  %st, %st(1)
        fldt    8(%rsp)
        fldt    40(%rsp)
        fsubrp  %st, %st(1)
        fstpt   (%rdi)
        fstpt   16(%rdi)
        ret

========================================

cfloat_binary_mul:
        movq    %xmm0, -8(%rsp)
        movss   -8(%rsp), %xmm0
        movss   -4(%rsp), %xmm2
        movq    %xmm1, -16(%rsp)
        movss   -16(%rsp), %xmm3
        movss   -12(%rsp), %xmm4
        movaps  %xmm0, %xmm1
        movaps  %xmm2, %xmm5
        mulss   %xmm3, %xmm1
        mulss   %xmm4, %xmm5
        mulss   %xmm4, %xmm0
        mulss   %xmm3, %xmm2
        subss   %xmm5, %xmm1
        addss   %xmm2, %xmm0
        movss   %xmm1, -24(%rsp)
        movss   %xmm0, -20(%rsp)
        movq    -24(%rsp), %xmm0
        ret
---
std_cfloat_binary_mul:
        movdqa  %xmm0, %xmm2
        movaps  %xmm1, %xmm0
        shufps  $0xe5, %xmm1, %xmm1
        shufps  $0xe0, %xmm0, %xmm0
        mulps   %xmm2, %xmm0
        shufps  $0xe1, %xmm2, %xmm2
        mulps   %xmm1, %xmm2
        movaps  %xmm0, %xmm1
        subps   %xmm2, %xmm1
        addps   %xmm2, %xmm0
        movss   %xmm1, %xmm0
        ret

========================================

cdouble_binary_mul:
        movapd  %xmm0, %xmm4
        movapd  %xmm1, %xmm5
        mulsd   %xmm2, %xmm0
        mulsd   %xmm3, %xmm5
        mulsd   %xmm3, %xmm4
        mulsd   %xmm2, %xmm1
        subsd   %xmm5, %xmm0
        addsd   %xmm4, %xmm1
        ret
---
std_cdouble_binary_mul:
        movq    %xmm2, -40(%rsp)
        movq    %xmm3, -32(%rsp)
        movapd  -40(%rsp), %xmm2
        movq    %xmm1, -16(%rsp)
        movapd  -40(%rsp), %xmm1
        movq    %xmm0, -24(%rsp)
        movapd  -24(%rsp), %xmm0
        unpcklpd        %xmm2, %xmm2
        mulpd   -24(%rsp), %xmm2
        unpckhpd        %xmm1, %xmm1
        shufpd  $1, %xmm0, %xmm0
        mulpd   %xmm1, %xmm0
        movapd  %xmm2, %xmm1
        subpd   %xmm0, %xmm1
        addpd   %xmm0, %xmm2
        movsd   %xmm1, %xmm2
        movaps  %xmm2, -40(%rsp)
        movsd   -32(%rsp), %xmm1
        movsd   -40(%rsp), %xmm0
        ret

========================================

creal_binary_mul:
        fldt    8(%rsp)
        fldt    24(%rsp)
        fldt    40(%rsp)
        fldt    56(%rsp)
        fld     %st(3)
        fmul    %st(2), %st
        fld     %st(3)
        fmul    %st(2), %st
        fsubrp  %st, %st(1)
        fxch    %st(4)
        fmulp   %st, %st(1)
        fxch    %st(2)
        fmulp   %st, %st(1)
        faddp   %st, %st(1)
        fxch    %st(1)
        ret
---
std_creal_binary_mul:
        fldt    40(%rsp)
        movq    %rdi, %rax
        fldt    56(%rsp)
        fldt    24(%rsp)
        fldt    8(%rsp)
        fld     %st(3)
        fmul    %st(1), %st
        fld     %st(2)
        fmul    %st(4), %st
        fsubrp  %st, %st(1)
        fstpt   (%rdi)
        fxch    %st(3)
        fmulp   %st, %st(1)
        fxch    %st(2)
        fmulp   %st, %st(1)
        faddp   %st, %st(1)
        fstpt   16(%rdi)
        ret

========================================

cfloat_binary_div:
        movq    %xmm1, -16(%rsp)
        movss   -16(%rsp), %xmm5
        movss   -12(%rsp), %xmm4
        movq    %xmm0, -8(%rsp)
        movss   -8(%rsp), %xmm3
        movss   -4(%rsp), %xmm0
        movaps  %xmm5, %xmm2
        movaps  %xmm4, %xmm1
        mulss   %xmm4, %xmm1
        movaps  %xmm0, %xmm6
        mulss   %xmm5, %xmm2
        mulss   %xmm4, %xmm6
        mulss   %xmm5, %xmm0
        addss   %xmm1, %xmm2
        movaps  %xmm3, %xmm1
        mulss   %xmm5, %xmm1
        mulss   %xmm4, %xmm3
        addss   %xmm6, %xmm1
        subss   %xmm3, %xmm0
        divss   %xmm2, %xmm1
        divss   %xmm2, %xmm0
        movss   %xmm1, -24(%rsp)
        movss   %xmm0, -20(%rsp)
        movq    -24(%rsp), %xmm0
        ret
---
std_cfloat_binary_div:
        movq    %xmm1, %rax
        movdqa  %xmm1, %xmm2
        movdqa  %xmm0, %xmm3
        shrq    $32, %rax
        movaps  %xmm2, %xmm4
        mulss   %xmm2, %xmm4
        movd    %eax, %xmm1
        movq    %xmm0, %rax
        movaps  %xmm1, %xmm0
        shrq    $32, %rax
        mulss   %xmm1, %xmm0
        movq    %rax, %xmm5
        movd    %eax, %xmm6
        mulss   %xmm1, %xmm6
        addss   %xmm0, %xmm4
        movaps  %xmm2, %xmm0
        mulss   %xmm3, %xmm0
        mulss   %xmm5, %xmm2
        mulss   %xmm1, %xmm3
        addss   %xmm6, %xmm0
        subss   %xmm3, %xmm2
        divss   %xmm4, %xmm0
        divss   %xmm4, %xmm2
        unpcklps        %xmm2, %xmm0
        ret

========================================

cdouble_binary_div:
        movapd  %xmm0, %xmm4
        movapd  %xmm2, %xmm5
        movapd  %xmm3, %xmm0
        mulsd   %xmm3, %xmm0
        movapd  %xmm1, %xmm6
        mulsd   %xmm2, %xmm5
        mulsd   %xmm3, %xmm6
        mulsd   %xmm2, %xmm1
        addsd   %xmm0, %xmm5
        movapd  %xmm4, %xmm0
        mulsd   %xmm2, %xmm0
        mulsd   %xmm3, %xmm4
        addsd   %xmm6, %xmm0
        subsd   %xmm4, %xmm1
        divsd   %xmm5, %xmm0
        divsd   %xmm5, %xmm1
        ret
---
std_cdouble_binary_div:
        movq    %xmm2, -40(%rsp)
        movsd   -40(%rsp), %xmm2
        movq    %xmm3, -32(%rsp)
        movapd  -40(%rsp), %xmm3
        movsd   -32(%rsp), %xmm4
        movq    %xmm1, -16(%rsp)
        movapd  -40(%rsp), %xmm1
        mulsd   %xmm2, %xmm2
        movq    %xmm0, -24(%rsp)
        mulsd   %xmm4, %xmm4
        unpcklpd        %xmm3, %xmm3
        movapd  -24(%rsp), %xmm0
        mulpd   -24(%rsp), %xmm3
        unpckhpd        %xmm1, %xmm1
        shufpd  $1, %xmm0, %xmm0
        mulpd   %xmm1, %xmm0
        addsd   %xmm4, %xmm2
        movapd  %xmm3, %xmm1
        addpd   %xmm0, %xmm1
        subpd   %xmm0, %xmm3
        unpcklpd        %xmm2, %xmm2
        movsd   %xmm1, %xmm3
        divpd   %xmm2, %xmm3
        movaps  %xmm3, -40(%rsp)
        movsd   -32(%rsp), %xmm1
        movsd   -40(%rsp), %xmm0
        ret

========================================

creal_binary_div:
        fldt    8(%rsp)
        fldt    24(%rsp)
        fldt    40(%rsp)
        fldt    56(%rsp)
        fld     %st(1)
        fmul    %st(2), %st
        fld     %st(1)
        fmul    %st(2), %st
        faddp   %st, %st(1)
        fld     %st(4)
        fmul    %st(3), %st
        fld     %st(4)
        fmul    %st(3), %st
        faddp   %st, %st(1)
        fdiv    %st(1), %st
        fxch    %st(4)
        fmulp   %st, %st(3)
        fxch    %st(4)
        fmulp   %st, %st(1)
        fsubrp  %st, %st(1)
        fdivp   %st, %st(2)
        ret
---
std_creal_binary_div:
        fldt    40(%rsp)
        movq    %rdi, %rax
        fldt    56(%rsp)
        fldt    24(%rsp)
        fldt    8(%rsp)
        fld     %st(3)
        fmul    %st(4), %st
        fld     %st(3)
        fmul    %st(4), %st
        faddp   %st, %st(1)
        fld     %st(4)
        fmul    %st(2), %st
        fld     %st(3)
        fmul    %st(5), %st
        faddp   %st, %st(1)
        fdiv    %st(1), %st
        fstpt   (%rdi)
        fxch    %st(4)
        fmulp   %st, %st(2)
        fmulp   %st, %st(2)
        fsubp   %st, %st(1)
        fdivp   %st, %st(1)
        fstpt   16(%rdi)
        ret

========================================

Just visually comparing:

- cfloat -> Complex!float: the difference looks to be negligible.
- creal -> Complex!real just adds a small overhead of moving data on/off ST
registers (this is expected, and not a performance bug).
- cdouble -> Complex!double: it may look like cdouble still has a small edge;
however, the use of *pd instructions in the std.complex code would in fact make
it quicker (i.e., one divpd is 2x faster than two divsd instructions in the
cdouble_binary_div functions).

I actually found that LLVM seemed to be able to pick up the intent of the
FastMath complex divide functions, so LDC might give a more pleasing output.

Benchmarks to follow soon...

--