Good dotProduct
bearophile
bearophileHUGS at lycos.com
Tue Jun 29 05:29:33 PDT 2010
This is the faster I've done, further loop unrolling is not useful (for arrays of about 1400 doubles it's about 2.9 times faster than std.numeric.dotProduct on a recent Celeron CPU):
double dot5(double[] a, double[] b) {
size_t len = a.length;
assert(len == b.length, "dot(): the two array lengths differ.");
if (len == 0)
return 0.0;
assert(len % 4 == 0, "dot: length must be a multiple of 4");
double tot = void;
double* a_ptr = a.ptr;
double* b_ptr = b.ptr;
assert((cast(size_t)a_ptr & cast(size_t)0b1111) == 0,
"dot(): the first array is not aligned to 16 bytes.");
assert((cast(size_t)b_ptr & cast(size_t)0b1111) == 0,
"dot(): the second array is not aligned to 16 bytes.");
len = (len / 4) * 32;
asm {
mov EAX, a_ptr;
mov ECX, b_ptr;
mov EDI, len;
xorpd XMM0, XMM0;
xorpd XMM3, XMM3;
xor EDX, EDX;
align 8;
LOOP_START:
movapd XMM1, [EAX + EDX];
mulpd XMM1, [ECX + EDX];
movapd XMM2, [EAX + EDX + 16];
mulpd XMM2, [ECX + EDX + 16];
addpd XMM0, XMM1;
add EDX, 32;
addpd XMM3, XMM2;
cmp EDI, EDX;
jne LOOP_START;
addpd XMM0, XMM3;
// XMM0[0] += XMM0[1]
movapd XMM1, XMM0;
unpckhpd XMM1, XMM1;
addsd XMM0, XMM1;
movsd tot, XMM0;
}
return tot;
}
Bye,
bearophile
More information about the Digitalmars-d
mailing list