[Issue 10707] New: Add to std.complex some optional high level SIMD code
d-bugmail at puremagic.com
d-bugmail at puremagic.com
Tue Jul 23 13:41:35 PDT 2013
http://d.puremagic.com/issues/show_bug.cgi?id=10707
Summary: Add to std.complex some optional high level SIMD code
Product: D
Version: D2
Platform: All
OS/Version: All
Status: NEW
Severity: enhancement
Priority: P2
Component: Phobos
AssignedTo: nobody at puremagic.com
ReportedBy: bearophile_hugs at eml.cc
--- Comment #0 from bearophile_hugs at eml.cc 2013-07-23 13:41:33 PDT ---
This simple D benchmark shows a complex number multiplication done in three
ways: with std.complex, with double2 and SIMD instructions, and with the same
SIMD code wrapped in a struct:
import std.stdio, core.simd, ldc.gccbuiltins_x86, std.complex;
// Shorthand for std.complex's Complex instantiated with double.
alias ComplexD = Complex!double;
/// Baseline version: multiplies two complex numbers with the `*` operator
/// that Complex!double provides.
ComplexD mult1(in ComplexD x, in ComplexD y) {
return x * y;
}
/// Complex multiplication on raw SIMD vectors.
/// Each double2 holds one complex number: element 0 is the real part and
/// element 1 is the imaginary part. Returns the product a * b in the same
/// layout.
double2 mult2(double2 a, double2 b) {
double2 b_flip = [b.array[1], b.array[0]]; // Swap b.re and b.im
double2 a_im = [a.array[1], a.array[1]]; // Imag. part of a in both
double2 a_re = [a.array[0], a.array[0]]; // Real part of a in both
double2 aib = a_im * b_flip; // (a.im*b.im, a.im*b.re)
double2 arb = a_re * b; // (a.re*b.re, a.re*b.im)
// ADDSUBPD subtracts in element 0 and adds in element 1, which yields
// (a.re*b.re - a.im*b.im, a.re*b.im + a.im*b.re).
return __builtin_ia32_addsubpd(arb, aib); // subtract/add
}
/// Complex number whose storage is a double2 SIMD vector.
struct ComplexS {
// The union overlays the vector x with the named fields, so re and im
// share storage with x (re first, then im).
union {
double2 x;
struct {
double re, im;
}
}
// Lets a ComplexS be used where a double2 is expected (mult3 relies on
// this in `a_re * b`, and main in `.array`).
alias x this;
}
/// Same algorithm as mult2, but operating on the ComplexS wrapper and
/// reading the operands through its named re/im fields.
ComplexS mult3(ComplexS a, ComplexS b) {
double2 b_flip = [b.im, b.re]; // b with real and imaginary parts swapped
double2 a_im = [a.im, a.im]; // a.im in both elements
double2 a_re = [a.re, a.re]; // a.re in both elements
double2 aib = a_im * b_flip; // (a.im*b.im, a.im*b.re)
double2 arb = a_re * b; // (a.re*b.re, a.re*b.im); b is used as a double2 via alias this
// ADDSUBPD subtracts in element 0 and adds in element 1, giving the
// real and imaginary parts of the product.
return ComplexS(__builtin_ia32_addsubpd(arb, aib));
}
/// Runs the three multiplications on the same operands, (1 + 30i) and
/// (500 + 7000i), and prints each result. Mathematically the product is
/// -209500 + 22000i.
void main() {
// Variant 1: std.complex.
const c1 = ComplexD(1.0, 30.0);
const c2 = ComplexD(500.0, 7000.0);
mult1(c1, c2).writeln;
// Variant 2: raw double2 vectors, laid out as [re, im].
double2 x1 = [1.0, 30.0];
double2 x2 = [500.0, 7000.0];
mult2(x1, x2).array.writeln;
// Variant 3: the ComplexS wrapper around double2.
auto x1s = ComplexS([1.0, 30.0]);
auto x2s = ComplexS([500.0, 7000.0]);
mult3(x1s, x2s).array.writeln;
}
If I compile the code with a compiler designed for floating point performance
(ldc2 v. 0.11.0 based on DMD v2.062 and LLVM 3.3svn) with:
ldmd2 -O -release -inline -noboundscheck -mattr=sse3 -output-s test_complex.d
I get this 32-bit x86 asm with SSE3:
# mult1 (std.complex) compiled for SSE3: scalar instructions only.
# The four doubles of the two operands are read from the stack and the
# result is stored through the pointer in %eax.
__D12test_complex5mult1FxS3std7complex14__T7ComplexTdZ7ComplexxS3std7complex14__T7ComplexTdZ7ComplexZS3std7complex14__T7ComplexTdZ7Complex:
# Load both fields of each operand: one operand at 4/12(%esp), the other
# at 20/28(%esp).
movsd 20(%esp), %xmm0
movsd 28(%esp), %xmm3
movsd 4(%esp), %xmm1
movsd 12(%esp), %xmm2
# First result field (real part): [4]*[20] - [12]*[28], stored at offset 0.
movaps %xmm2, %xmm4
mulsd %xmm3, %xmm4
movaps %xmm1, %xmm5
mulsd %xmm0, %xmm5
subsd %xmm4, %xmm5
movsd %xmm5, (%eax)
# Second result field (imaginary part): [4]*[28] + [12]*[20], stored at
# offset 8.
mulsd %xmm3, %xmm1
mulsd %xmm0, %xmm2
addsd %xmm1, %xmm2
movsd %xmm2, 8(%eax)
# Return and pop the 32 bytes of stack arguments.
ret $32
# mult2 (raw double2) compiled for SSE3: operands arrive in %xmm0 and %xmm1,
# the result is returned in %xmm0. Comparing with the D source, %xmm1 plays
# the role of a and %xmm0 the role of b.
__D12test_complex5mult2FNhG2dNhG2dZNhG2d:
# $238 (0xEE) copies the high double of %xmm1 into both halves: a_im.
pshufd $238, %xmm1, %xmm3
# $78 (0x4E) swaps the two doubles of %xmm0: b_flip.
pshufd $78, %xmm0, %xmm2
# %xmm2 = a_im * b_flip (aib).
mulpd %xmm3, %xmm2
# $68 (0x44) copies the low double of %xmm1 into both halves: a_re.
pshufd $68, %xmm1, %xmm1
# %xmm1 = a_re * b (arb).
mulpd %xmm0, %xmm1
# Low half: arb - aib; high half: arb + aib.
addsubpd %xmm2, %xmm1
# Move the result into the return register.
movapd %xmm1, %xmm0
ret
# mult3 (ComplexS wrapper) compiled for SSE3: the operands are read from the
# stack and the result is stored through the pointer in %eax. Comparing with
# the D source, the struct at 8(%ebp) plays the role of b and the one at
# 24(%ebp) the role of a.
__D12test_complex5mult3FS12test_complex8ComplexSS12test_complex8ComplexSZS12test_complex8ComplexS:
# Prologue: set up a frame, align %esp to 16 bytes and reserve 16 bytes
# (the reserved space is not referenced by the instructions below).
pushl %ebp
movl %esp, %ebp
andl $-16, %esp
subl $16, %esp
# Build b_flip: low half from 16(%ebp), high half from 8(%ebp).
movsd 16(%ebp), %xmm1
movhpd 8(%ebp), %xmm1
# Broadcast the double at 32(%ebp) into both halves: a_im.
movddup 32(%ebp), %xmm0
# %xmm0 = a_im * b_flip (aib).
mulpd %xmm1, %xmm0
# Broadcast the double at 24(%ebp) into both halves: a_re.
movddup 24(%ebp), %xmm1
# %xmm1 = a_re * b, with b read directly from memory (arb).
mulpd 8(%ebp), %xmm1
# Low half: arb - aib; high half: arb + aib.
addsubpd %xmm0, %xmm1
# Store the 16-byte result with an unaligned store.
movupd %xmm1, (%eax)
# Epilogue: restore the frame, return and pop the 32 bytes of arguments.
movl %ebp, %esp
popl %ebp
ret $32
Notes:
- mult2 is considerably more efficient than mult1.
- Maybe there is a way to remove one asm instruction from mult2, the last
movapd, if the preceding addsubpd instruction is done on %xmm2 and %xmm0 and
the registers of the other instructions are swapped.
- Maybe the struct ComplexS is badly designed.
For that code I have used the SSE3 SIMD extension, which today is very common
on all kinds of Intel-compatible CPUs. Using AVX2 instead, ldc2 produces:
# mult1 (std.complex) compiled with AVX2 enabled: still scalar code, but the
# three-operand VEX forms remove the register copies.
__D12test_complex5mult1FxS3std7complex14__T7ComplexTdZ7ComplexxS3std7complex14__T7ComplexTdZ7ComplexZS3std7complex14__T7ComplexTdZ7Complex:
# Load both fields of each operand: one operand at 4/12(%esp), the other
# at 20/28(%esp).
vmovsd 20(%esp), %xmm0
vmovsd 28(%esp), %xmm1
vmovsd 4(%esp), %xmm2
vmovsd 12(%esp), %xmm3
# First result field (real part): [4]*[20] - [12]*[28], stored at offset 0.
vmulsd %xmm1, %xmm3, %xmm4
vmulsd %xmm0, %xmm2, %xmm5
vsubsd %xmm4, %xmm5, %xmm4
vmovsd %xmm4, (%eax)
# Second result field (imaginary part): [4]*[28] + [12]*[20], stored at
# offset 8.
vmulsd %xmm1, %xmm2, %xmm1
vmulsd %xmm0, %xmm3, %xmm0
vaddsd %xmm1, %xmm0, %xmm0
vmovsd %xmm0, 8(%eax)
# Return and pop the 32 bytes of stack arguments.
ret $32
# mult2 (raw double2) compiled with AVX2 enabled: operands arrive in %xmm0
# and %xmm1, the result is left in %xmm0, and no final register move is
# needed. Comparing with the D source, %xmm1 plays the role of a and %xmm0
# the role of b.
__D12test_complex5mult2FNhG2dNhG2dZNhG2d:
# $3 selects the high double of %xmm1 for both halves: a_im.
vpermilpd $3, %xmm1, %xmm2
# $1 swaps the two doubles of %xmm0: b_flip.
vpermilpd $1, %xmm0, %xmm3
# %xmm2 = a_im * b_flip (aib).
vmulpd %xmm2, %xmm3, %xmm2
# Broadcast the low quadword of %xmm1 into both halves: a_re.
vpbroadcastq %xmm1, %xmm1
# %xmm0 = a_re * b (arb).
vmulpd %xmm0, %xmm1, %xmm0
# Low half: arb - aib; high half: arb + aib.
vaddsubpd %xmm2, %xmm0, %xmm0
ret
If I add -vectorize-slp -vectorize-slp-aggressive to the compilation switches
the asm of mult1 gets a bit better:
# mult1 (std.complex) compiled with AVX2 and the SLP vectorizer switches:
# the arithmetic is still scalar, but the two result fields are packed into
# one register and written with a single 16-byte store.
__D12test_complex5mult1FxS3std7complex14__T7ComplexTdZ7ComplexxS3std7complex14__T7ComplexTdZ7ComplexZS3std7complex14__T7ComplexTdZ7Complex:
# Load both fields of each operand: one operand at 4/12(%esp), the other
# at 20/28(%esp).
vmovsd 20(%esp), %xmm0
vmovsd 28(%esp), %xmm1
vmovsd 4(%esp), %xmm2
vmovsd 12(%esp), %xmm3
# Imaginary part: [4]*[28] + [12]*[20], kept in %xmm4.
vmulsd %xmm1, %xmm2, %xmm4
vmulsd %xmm0, %xmm3, %xmm5
vaddsd %xmm4, %xmm5, %xmm4
# Real part: [4]*[20] - [12]*[28], kept in %xmm0.
vmulsd %xmm1, %xmm3, %xmm1
vmulsd %xmm0, %xmm2, %xmm0
vsubsd %xmm1, %xmm0, %xmm0
# Pack (real, imaginary) into one register and store both at once.
vunpcklpd %xmm4, %xmm0, %xmm0
vmovupd %xmm0, (%eax)
# Return and pop the 32 bytes of stack arguments.
ret $32
So I suggest adding to std.complex some high level SIMD code like mult2, that
gets compiled if the target CPU supports SSE3 instructions.
--
Configure issuemail: http://d.puremagic.com/issues/userprefs.cgi?tab=email
------- You are receiving this mail because: -------
More information about the Digitalmars-d-bugs
mailing list