Good dotProduct

Tue Jun 29 05:29:33 PDT 2010

This is the faster I've done, further loop unrolling is not useful (for arrays of about 1400 doubles it's about 2.9 times faster than std.numeric.dotProduct on a recent Celeron CPU):

double dot5(double[] a, double[] b) {
    size_t len = a.length;
    assert(len == b.length, "dot(): the two array lengths differ.");
    if (len == 0)
        return 0.0;
    assert(len % 4 == 0, "dot: length must be a multiple of 4");

    double tot = void;
    double* a_ptr = a.ptr;
    double* b_ptr = b.ptr;

    assert((cast(size_t)a_ptr & cast(size_t)0b1111) == 0,
           "dot(): the first array is not aligned to 16 bytes.");
    assert((cast(size_t)b_ptr & cast(size_t)0b1111) == 0,
           "dot(): the second array is not aligned to 16 bytes.");

    len = (len / 4) * 32;

    asm {
        mov EAX, a_ptr;
        mov ECX, b_ptr;
        mov EDI, len;
        xorpd XMM0, XMM0;
        xorpd XMM3, XMM3;
        xor EDX, EDX;

        align 8;
    LOOP_START:
        movapd XMM1, [EAX + EDX];
        mulpd XMM1,  [ECX + EDX];
        movapd XMM2, [EAX + EDX + 16];
        mulpd XMM2,  [ECX + EDX + 16];
        addpd XMM0, XMM1;
        add EDX, 32;
        addpd XMM3, XMM2;
        cmp EDI, EDX;
        jne LOOP_START;

        addpd XMM0, XMM3;

        // XMM0[0] += XMM0[1]
        movapd  XMM1, XMM0;
        unpckhpd XMM1, XMM1;
        addsd XMM0, XMM1;

        movsd tot, XMM0;
    }

    return tot;
}

Bye,
bearophile