[Issue 18627] std.complex is a lot slower than builtin complex types at number crunching
d-bugmail at puremagic.com
d-bugmail at puremagic.com
Fri Apr 16 14:45:17 UTC 2021
https://issues.dlang.org/show_bug.cgi?id=18627
Iain Buclaw <ibuclaw at gdcproject.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Status|RESOLVED |REOPENED
Resolution|FIXED |---
--- Comment #15 from Iain Buclaw <ibuclaw at gdcproject.org> ---
Not sure if this should really be marked as resolved/fixed, but anyhow...
With the following (lazy) function generator:
---
import std.complex : C = Complex;
import std.meta : AliasSeq;
import std.format : format;
static foreach (T; AliasSeq!(cfloat, cdouble, creal))
{
// Unary operators
mixin(format!"%s %s_unary_add(%s a) { return +a; }"
(T.stringof, T.stringof, T.stringof));
mixin(format!"%s %s_unary_sub(%s a) { return -a; }"
(T.stringof, T.stringof, T.stringof));
// Binary operators
mixin(format!"%s %s_binary_add(%s a, %s b) { return a + b; }"
(T.stringof, T.stringof, T.stringof, T.stringof));
mixin(format!"%s %s_binary_sub(%s a, %s b) { return a - b; }"
(T.stringof, T.stringof, T.stringof, T.stringof));
mixin(format!"%s %s_binary_mul(%s a, %s b) { return a * b; }"
(T.stringof, T.stringof, T.stringof, T.stringof));
mixin(format!"%s %s_binary_div(%s a, %s b) { return a / b; }"
(T.stringof, T.stringof, T.stringof, T.stringof));
}
static foreach (T; AliasSeq!(float, double, real))
{
// Unary operators
mixin(format!"C!%s std_c%s_unary_add(C!%s a) { return +a; }"
(T.stringof, T.stringof, T.stringof));
mixin(format!"C!%s std_c%s_unary_sub(C!%s a) { return -a; }"
(T.stringof, T.stringof, T.stringof));
// Binary operators
mixin(format!"C!%s std_c%s_binary_add(C!%s a, C!%s b) { return a + b; }"
(T.stringof, T.stringof, T.stringof, T.stringof));
mixin(format!"C!%s std_c%s_binary_sub(C!%s a, C!%s b) { return a - b; }"
(T.stringof, T.stringof, T.stringof, T.stringof));
mixin(format!"C!%s std_c%s_binary_mul(C!%s a, C!%s b) { return a * b; }"
(T.stringof, T.stringof, T.stringof, T.stringof));
mixin(format!"C!%s std_c%s_binary_div(C!%s a, C!%s b) { return a / b; }"
(T.stringof, T.stringof, T.stringof, T.stringof));
}
---
On x86_64/GDC, the results are:
========================================
cfloat_unary_add:
movq %xmm0, -8(%rsp)
movss -8(%rsp), %xmm0
movss %xmm0, -16(%rsp)
movss -4(%rsp), %xmm0
movss %xmm0, -12(%rsp)
movq -16(%rsp), %xmm0
ret
---
std_cfloat_unary_add:
ret
========================================
cdouble_unary_add:
ret
---
std_cdouble_unary_add:
ret
========================================
creal_unary_add:
fldt 8(%rsp)
fldt 24(%rsp)
fxch %st(1)
ret
---
std_creal_unary_add:
movdqa 8(%rsp), %xmm0
movdqa 24(%rsp), %xmm1
movq %rdi, %rax
movaps %xmm0, (%rdi)
movaps %xmm1, 16(%rdi)
ret
========================================
cfloat_unary_sub:
movq %xmm0, -8(%rsp)
movss -8(%rsp), %xmm0
movss .LC4(%rip), %xmm2
movaps %xmm0, %xmm1
movss -4(%rsp), %xmm0
xorps %xmm2, %xmm1
xorps %xmm2, %xmm0
movss %xmm1, -16(%rsp)
movss %xmm0, -12(%rsp)
movq -16(%rsp), %xmm0
ret
.LC4:
.long -2147483648
.long 0
.long 0
.long 0
---
std_cfloat_unary_sub:
movq .LC7(%rip), %xmm1
xorps %xmm1, %xmm0
ret
.LC7:
.long -2147483648
.long -2147483648
========================================
cdouble_unary_sub:
movq .LC5(%rip), %xmm2
xorpd %xmm2, %xmm1
xorpd %xmm2, %xmm0
ret
.LC5:
.long 0
.long -2147483648
.long 0
.long 0
---
std_cdouble_unary_sub:
movq %xmm0, -24(%rsp)
movq %xmm1, -16(%rsp)
movapd -24(%rsp), %xmm2
xorpd .LC8(%rip), %xmm2
movaps %xmm2, -24(%rsp)
movsd -16(%rsp), %xmm1
movsd -24(%rsp), %xmm0
ret
.LC8:
.long 0
.long -2147483648
.long 0
.long -2147483648
========================================
creal_unary_sub:
fldt 8(%rsp)
fchs
fldt 24(%rsp)
fchs
fxch %st(1)
ret
---
std_creal_unary_sub:
fldt 24(%rsp)
movq %rdi, %rax
fchs
fldt 8(%rsp)
fchs
fstpt (%rdi)
fstpt 16(%rdi)
ret
========================================
cfloat_binary_add:
movq %xmm0, -8(%rsp)
movq %xmm1, -16(%rsp)
movss -8(%rsp), %xmm1
movss -16(%rsp), %xmm0
addss %xmm0, %xmm1
movss -12(%rsp), %xmm0
addss -4(%rsp), %xmm0
movss %xmm1, -24(%rsp)
movss %xmm0, -20(%rsp)
movq -24(%rsp), %xmm0
ret
---
std_cfloat_binary_add:
addps %xmm1, %xmm0
ret
========================================
cdouble_binary_add:
addsd %xmm3, %xmm1
addsd %xmm2, %xmm0
ret
---
std_cdouble_binary_add:
movq %xmm0, -40(%rsp)
movq %xmm1, -32(%rsp)
movq %xmm2, -24(%rsp)
movq %xmm3, -16(%rsp)
movapd -24(%rsp), %xmm4
addpd -40(%rsp), %xmm4
movaps %xmm4, -40(%rsp)
movsd -32(%rsp), %xmm1
movsd -40(%rsp), %xmm0
ret
========================================
creal_binary_add:
fldt 8(%rsp)
fldt 40(%rsp)
faddp %st, %st(1)
fldt 24(%rsp)
fldt 56(%rsp)
faddp %st, %st(1)
fxch %st(1)
ret
---
std_creal_binary_add:
fldt 24(%rsp)
movq %rdi, %rax
fldt 56(%rsp)
faddp %st, %st(1)
fldt 40(%rsp)
fldt 8(%rsp)
faddp %st, %st(1)
fstpt (%rdi)
fstpt 16(%rdi)
ret
========================================
cfloat_binary_sub:
movq %xmm0, -8(%rsp)
movss -8(%rsp), %xmm0
movq %xmm1, -16(%rsp)
movaps %xmm0, %xmm1
movss -4(%rsp), %xmm0
subss -16(%rsp), %xmm1
subss -12(%rsp), %xmm0
movss %xmm1, -24(%rsp)
movss %xmm0, -20(%rsp)
movq -24(%rsp), %xmm0
ret
---
std_cfloat_binary_sub:
subps %xmm1, %xmm0
ret
========================================
cdouble_binary_sub:
subsd %xmm3, %xmm1
subsd %xmm2, %xmm0
ret
---
std_cdouble_binary_sub:
movq %xmm0, -40(%rsp)
movq %xmm1, -32(%rsp)
movapd -40(%rsp), %xmm4
movq %xmm2, -24(%rsp)
movq %xmm3, -16(%rsp)
subpd -24(%rsp), %xmm4
movaps %xmm4, -40(%rsp)
movsd -32(%rsp), %xmm1
movsd -40(%rsp), %xmm0
ret
========================================
creal_binary_sub:
fldt 8(%rsp)
fldt 40(%rsp)
fsubrp %st, %st(1)
fldt 24(%rsp)
fldt 56(%rsp)
fsubrp %st, %st(1)
fxch %st(1)
ret
---
std_creal_binary_sub:
fldt 24(%rsp)
movq %rdi, %rax
fldt 56(%rsp)
fsubrp %st, %st(1)
fldt 8(%rsp)
fldt 40(%rsp)
fsubrp %st, %st(1)
fstpt (%rdi)
fstpt 16(%rdi)
ret
========================================
cfloat_binary_mul:
movq %xmm0, -8(%rsp)
movss -8(%rsp), %xmm0
movss -4(%rsp), %xmm2
movq %xmm1, -16(%rsp)
movss -16(%rsp), %xmm3
movss -12(%rsp), %xmm4
movaps %xmm0, %xmm1
movaps %xmm2, %xmm5
mulss %xmm3, %xmm1
mulss %xmm4, %xmm5
mulss %xmm4, %xmm0
mulss %xmm3, %xmm2
subss %xmm5, %xmm1
addss %xmm2, %xmm0
movss %xmm1, -24(%rsp)
movss %xmm0, -20(%rsp)
movq -24(%rsp), %xmm0
ret
---
std_cfloat_binary_mul:
movdqa %xmm0, %xmm2
movaps %xmm1, %xmm0
shufps $0xe5, %xmm1, %xmm1
shufps $0xe0, %xmm0, %xmm0
mulps %xmm2, %xmm0
shufps $0xe1, %xmm2, %xmm2
mulps %xmm1, %xmm2
movaps %xmm0, %xmm1
subps %xmm2, %xmm1
addps %xmm2, %xmm0
movss %xmm1, %xmm0
ret
========================================
cdouble_binary_mul:
movapd %xmm0, %xmm4
movapd %xmm1, %xmm5
mulsd %xmm2, %xmm0
mulsd %xmm3, %xmm5
mulsd %xmm3, %xmm4
mulsd %xmm2, %xmm1
subsd %xmm5, %xmm0
addsd %xmm4, %xmm1
ret
---
std_cdouble_binary_mul:
movq %xmm2, -40(%rsp)
movq %xmm3, -32(%rsp)
movapd -40(%rsp), %xmm2
movq %xmm1, -16(%rsp)
movapd -40(%rsp), %xmm1
movq %xmm0, -24(%rsp)
movapd -24(%rsp), %xmm0
unpcklpd %xmm2, %xmm2
mulpd -24(%rsp), %xmm2
unpckhpd %xmm1, %xmm1
shufpd $1, %xmm0, %xmm0
mulpd %xmm1, %xmm0
movapd %xmm2, %xmm1
subpd %xmm0, %xmm1
addpd %xmm0, %xmm2
movsd %xmm1, %xmm2
movaps %xmm2, -40(%rsp)
movsd -32(%rsp), %xmm1
movsd -40(%rsp), %xmm0
ret
========================================
creal_binary_mul:
fldt 8(%rsp)
fldt 24(%rsp)
fldt 40(%rsp)
fldt 56(%rsp)
fld %st(3)
fmul %st(2), %st
fld %st(3)
fmul %st(2), %st
fsubrp %st, %st(1)
fxch %st(4)
fmulp %st, %st(1)
fxch %st(2)
fmulp %st, %st(1)
faddp %st, %st(1)
fxch %st(1)
ret
---
std_creal_binary_mul:
fldt 40(%rsp)
movq %rdi, %rax
fldt 56(%rsp)
fldt 24(%rsp)
fldt 8(%rsp)
fld %st(3)
fmul %st(1), %st
fld %st(2)
fmul %st(4), %st
fsubrp %st, %st(1)
fstpt (%rdi)
fxch %st(3)
fmulp %st, %st(1)
fxch %st(2)
fmulp %st, %st(1)
faddp %st, %st(1)
fstpt 16(%rdi)
ret
========================================
cfloat_binary_div:
movq %xmm1, -16(%rsp)
movss -16(%rsp), %xmm5
movss -12(%rsp), %xmm4
movq %xmm0, -8(%rsp)
movss -8(%rsp), %xmm3
movss -4(%rsp), %xmm0
movaps %xmm5, %xmm2
movaps %xmm4, %xmm1
mulss %xmm4, %xmm1
movaps %xmm0, %xmm6
mulss %xmm5, %xmm2
mulss %xmm4, %xmm6
mulss %xmm5, %xmm0
addss %xmm1, %xmm2
movaps %xmm3, %xmm1
mulss %xmm5, %xmm1
mulss %xmm4, %xmm3
addss %xmm6, %xmm1
subss %xmm3, %xmm0
divss %xmm2, %xmm1
divss %xmm2, %xmm0
movss %xmm1, -24(%rsp)
movss %xmm0, -20(%rsp)
movq -24(%rsp), %xmm0
ret
---
std_cfloat_binary_div:
movq %xmm1, %rax
movdqa %xmm1, %xmm2
movdqa %xmm0, %xmm3
shrq $32, %rax
movaps %xmm2, %xmm4
mulss %xmm2, %xmm4
movd %eax, %xmm1
movq %xmm0, %rax
movaps %xmm1, %xmm0
shrq $32, %rax
mulss %xmm1, %xmm0
movq %rax, %xmm5
movd %eax, %xmm6
mulss %xmm1, %xmm6
addss %xmm0, %xmm4
movaps %xmm2, %xmm0
mulss %xmm3, %xmm0
mulss %xmm5, %xmm2
mulss %xmm1, %xmm3
addss %xmm6, %xmm0
subss %xmm3, %xmm2
divss %xmm4, %xmm0
divss %xmm4, %xmm2
unpcklps %xmm2, %xmm0
ret
========================================
cdouble_binary_div:
movapd %xmm0, %xmm4
movapd %xmm2, %xmm5
movapd %xmm3, %xmm0
mulsd %xmm3, %xmm0
movapd %xmm1, %xmm6
mulsd %xmm2, %xmm5
mulsd %xmm3, %xmm6
mulsd %xmm2, %xmm1
addsd %xmm0, %xmm5
movapd %xmm4, %xmm0
mulsd %xmm2, %xmm0
mulsd %xmm3, %xmm4
addsd %xmm6, %xmm0
subsd %xmm4, %xmm1
divsd %xmm5, %xmm0
divsd %xmm5, %xmm1
ret
---
std_cdouble_binary_div:
movq %xmm2, -40(%rsp)
movsd -40(%rsp), %xmm2
movq %xmm3, -32(%rsp)
movapd -40(%rsp), %xmm3
movsd -32(%rsp), %xmm4
movq %xmm1, -16(%rsp)
movapd -40(%rsp), %xmm1
mulsd %xmm2, %xmm2
movq %xmm0, -24(%rsp)
mulsd %xmm4, %xmm4
unpcklpd %xmm3, %xmm3
movapd -24(%rsp), %xmm0
mulpd -24(%rsp), %xmm3
unpckhpd %xmm1, %xmm1
shufpd $1, %xmm0, %xmm0
mulpd %xmm1, %xmm0
addsd %xmm4, %xmm2
movapd %xmm3, %xmm1
addpd %xmm0, %xmm1
subpd %xmm0, %xmm3
unpcklpd %xmm2, %xmm2
movsd %xmm1, %xmm3
divpd %xmm2, %xmm3
movaps %xmm3, -40(%rsp)
movsd -32(%rsp), %xmm1
movsd -40(%rsp), %xmm0
ret
========================================
creal_binary_div:
fldt 8(%rsp)
fldt 24(%rsp)
fldt 40(%rsp)
fldt 56(%rsp)
fld %st(1)
fmul %st(2), %st
fld %st(1)
fmul %st(2), %st
faddp %st, %st(1)
fld %st(4)
fmul %st(3), %st
fld %st(4)
fmul %st(3), %st
faddp %st, %st(1)
fdiv %st(1), %st
fxch %st(4)
fmulp %st, %st(3)
fxch %st(4)
fmulp %st, %st(1)
fsubrp %st, %st(1)
fdivp %st, %st(2)
ret
---
std_creal_binary_div:
fldt 40(%rsp)
movq %rdi, %rax
fldt 56(%rsp)
fldt 24(%rsp)
fldt 8(%rsp)
fld %st(3)
fmul %st(4), %st
fld %st(3)
fmul %st(4), %st
faddp %st, %st(1)
fld %st(4)
fmul %st(2), %st
fld %st(3)
fmul %st(5), %st
faddp %st, %st(1)
fdiv %st(1), %st
fstpt (%rdi)
fxch %st(4)
fmulp %st, %st(2)
fmulp %st, %st(2)
fsubp %st, %st(1)
fdivp %st, %st(1)
fstpt 16(%rdi)
ret
========================================
Just visually comparing:
- cfloat -> Complex!float: the difference looks to be negligible.
- creal -> Complex!real just adds a small overhead of moving data on/off ST
registers (this is expected, and not a performance bug).
- cdouble -> Complex!double: it may look like cdouble still has a small edge,
however the use of *pd instructions in the std.complex version would in fact
make it quicker (i.e. one divpd is 2x faster than the two divsd instructions
in the cdouble_binary_div function).
I actually found that LLVM seemed more able to pick up the intent of the
FastMath complex divide functions, so LDC might give a more pleasing output.
Benchmarks to follow soon...
--
More information about the Digitalmars-d-bugs
mailing list