[Issue 10707] New: Add to std.complex some optional high level SIMD code

Tue Jul 23 13:41:35 PDT 2013

http://d.puremagic.com/issues/show_bug.cgi?id=10707

           Summary: Add to std.complex some optional high level SIMD code
           Product: D
           Version: D2
          Platform: All
        OS/Version: All
            Status: NEW
          Severity: enhancement
          Priority: P2
         Component: Phobos
        AssignedTo: nobody at puremagic.com
        ReportedBy: bearophile_hugs at eml.cc

--- Comment #0 from bearophile_hugs at eml.cc 2013-07-23 13:41:33 PDT ---
This simple D benchmark shows a complex number multiplication done with
std.complex, using double2 and SIMD instructions, and the same wrapped in a
struct:

import std.stdio, core.simd, ldc.gccbuiltins_x86, std.complex;

alias ComplexD = Complex!double;

ComplexD mult1(in ComplexD x, in ComplexD y) {
    return x * y;
}

double2 mult2(double2 a, double2 b) {
    double2 b_flip = [b.array[1], b.array[0]]; // Swap b.re and b.im
    double2 a_im   = [a.array[1], a.array[1]]; // Imag. part of a in both
    double2 a_re   = [a.array[0], a.array[0]]; // Real part of a in both
    double2 aib    = a_im * b_flip;            // (a.im*b.im, a.im*b.re)
    double2 arb    = a_re * b;                 // (a.re*b.re, a.re*b.im)
    return __builtin_ia32_addsubpd(arb, aib);  // subtract/add
}

struct ComplexS {
    union {
        double2 x;
        struct {
            double re, im;
        }
    }
    alias x this;
}

ComplexS mult3(ComplexS a, ComplexS b) {
    double2 b_flip = [b.im, b.re];
    double2 a_im   = [a.im, a.im];
    double2 a_re   = [a.re, a.re];
    double2 aib    = a_im * b_flip;
    double2 arb    = a_re * b;
    return ComplexS(__builtin_ia32_addsubpd(arb, aib));
}

void main() {
    const c1 = ComplexD(1.0, 30.0);
    const c2 = ComplexD(500.0, 7000.0);
    mult1(c1, c2).writeln;

    double2 x1 = [1.0, 30.0];
    double2 x2 = [500.0, 7000.0];
    mult2(x1, x2).array.writeln;

    auto x1s = ComplexS([1.0, 30.0]);
    auto x2s = ComplexS([500.0, 7000.0]);
    mult3(x1s, x2s).array.writeln;
}

If I compile the code with a compiler designed for floating point performance
(ldc2 v. 0.11.0 based on DMD v2.062 and LLVM 3.3svn) with:

ldmd2 -O -release -inline -noboundscheck -mattr=sse3 -output-s test_complex.d

I get the 32bit x86 with SS3 asm:

__D12test_complex5mult1FxS3std7complex14__T7ComplexTdZ7ComplexxS3std7complex14__T7ComplexTdZ7ComplexZS3std7complex14__T7ComplexTdZ7Complex:
    movsd   20(%esp), %xmm0
    movsd   28(%esp), %xmm3
    movsd   4(%esp),  %xmm1
    movsd   12(%esp), %xmm2
    movaps  %xmm2,    %xmm4
    mulsd   %xmm3,    %xmm4
    movaps  %xmm1,    %xmm5
    mulsd   %xmm0,    %xmm5
    subsd   %xmm4,    %xmm5
    movsd   %xmm5,    (%eax)
    mulsd   %xmm3,    %xmm1
    mulsd   %xmm0,    %xmm2
    addsd   %xmm1,    %xmm2
    movsd   %xmm2,    8(%eax)
    ret $32

__D12test_complex5mult2FNhG2dNhG2dZNhG2d:
    pshufd   $238,  %xmm1, %xmm3
    pshufd   $78,   %xmm0, %xmm2
    mulpd    %xmm3, %xmm2
    pshufd   $68,   %xmm1, %xmm1
    mulpd    %xmm0, %xmm1
    addsubpd %xmm2, %xmm1
    movapd   %xmm1, %xmm0
    ret

__D12test_complex5mult3FS12test_complex8ComplexSS12test_complex8ComplexSZS12test_complex8ComplexS:
    pushl    %ebp
    movl     %esp,     %ebp
    andl     $-16,     %esp
    subl     $16,      %esp
    movsd    16(%ebp), %xmm1
    movhpd   8(%ebp),  %xmm1
    movddup  32(%ebp), %xmm0
    mulpd    %xmm1,    %xmm0
    movddup  24(%ebp), %xmm1
    mulpd    8(%ebp),  %xmm1
    addsubpd %xmm0,    %xmm1
    movupd   %xmm1,    (%eax)
    movl     %ebp,      %esp
    popl     %ebp
    ret $32

Notes:
- mult2 is quite more efficient than mult1.
- Maybe there is a way to remove one asm instruction from mult2, the last
movapd, if the precedent instruction addsubpd is done on %xmm2 and %xmm0 and
other instruction registers are swapped.
- Maybe the struct ComplexS is badly designed.

For that code I have used the SS3 SIMD extension, that today is very common for
all kind of Intel compatible CPUs. Using the AVX2 ldc2 produces:

__D12test_complex5mult1FxS3std7complex14__T7ComplexTdZ7ComplexxS3std7complex14__T7ComplexTdZ7ComplexZS3std7complex14__T7ComplexTdZ7Complex:
    vmovsd  20(%esp), %xmm0
    vmovsd  28(%esp), %xmm1
    vmovsd  4(%esp),  %xmm2
    vmovsd  12(%esp), %xmm3
    vmulsd  %xmm1,    %xmm3,  %xmm4
    vmulsd  %xmm0,    %xmm2,  %xmm5
    vsubsd  %xmm4,    %xmm5,  %xmm4
    vmovsd  %xmm4,    (%eax) 
    vmulsd  %xmm1,    %xmm2,  %xmm1
    vmulsd  %xmm0,    %xmm3,  %xmm0
    vaddsd  %xmm1,    %xmm0,  %xmm0
    vmovsd  %xmm0,    8(%eax)
    ret $32

__D12test_complex5mult2FNhG2dNhG2dZNhG2d:
    vpermilpd       $3, %xmm1, %xmm2
    vpermilpd       $1, %xmm0, %xmm3
    vmulpd       %xmm2, %xmm3, %xmm2
    vpbroadcastq %xmm1, %xmm1
    vmulpd       %xmm0, %xmm1, %xmm0
    vaddsubpd    %xmm2, %xmm0, %xmm0
    ret

If I add -vectorize-slp -vectorize-slp-aggressive to the compilation switches
the asm of mult1 gets a bit better:

__D12test_complex5mult1FxS3std7complex14__T7ComplexTdZ7ComplexxS3std7complex14__T7ComplexTdZ7ComplexZS3std7complex14__T7ComplexTdZ7Complex:
    vmovsd    20(%esp), %xmm0
    vmovsd    28(%esp), %xmm1
    vmovsd     4(%esp), %xmm2
    vmovsd    12(%esp), %xmm3
    vmulsd       %xmm1, %xmm2, %xmm4
    vmulsd       %xmm0, %xmm3, %xmm5
    vaddsd       %xmm4, %xmm5, %xmm4
    vmulsd       %xmm1, %xmm3, %xmm1
    vmulsd       %xmm0, %xmm2, %xmm0
    vsubsd       %xmm1, %xmm0, %xmm0
    vunpcklpd    %xmm4, %xmm0, %xmm0
    vmovupd      %xmm0, (%eax)
    ret $32

So I suggest to add to std.complex some high level SIMD code like mult2, that
gets compiled if the target CPU supports SS3 instructions.

-- 
Configure issuemail: http://d.puremagic.com/issues/userprefs.cgi?tab=email
------- You are receiving this mail because: -------