[Issue 4393] Very good dotProduct
d-bugmail at puremagic.com
d-bugmail at puremagic.com
Thu Apr 28 18:58:57 PDT 2011
http://d.puremagic.com/issues/show_bug.cgi?id=4393
--- Comment #1 from bearophile_hugs at eml.cc 2011-04-28 18:55:10 PDT ---
Some code I have written for arrays of floats:
float dot(float[] a, float[] b) {
enum size_t UNROLL_MASK = 0b111;
assert(a.length == b.length, "dot(): the two array lengths differ.");
typeof(a[0]) tot = void;
auto a_ptr = a.ptr;
auto b_ptr = b.ptr;
assert((cast(size_t)a_ptr & cast(size_t)0b1111) == 0,
"dot(): the first array is not aligned to 16 bytes.");
assert((cast(size_t)b_ptr & cast(size_t)0b1111) == 0,
"dot(): the second array is not aligned to 16 bytes.");
size_t len = (a.length & (~UNROLL_MASK)) * a[0].sizeof;
if (len) {
asm {
mov EAX, a_ptr;
mov ECX, b_ptr;
mov EDI, len;
xorps XMM0, XMM0;
xorps XMM3, XMM3;
xor EDX, EDX;
align 8;
LOOP_START:
movaps XMM1, [EAX + EDX];
mulps XMM1, [ECX + EDX];
movaps XMM2, [EAX + EDX + 16];
mulps XMM2, [ECX + EDX + 16];
add EDX, 32;
addps XMM0, XMM1;
cmp EDI, EDX;
addps XMM3, XMM2;
jne LOOP_START;
addps XMM0, XMM3;
// XMM0[0] = XMM0[0] + XMM0[1] + XMM0[2] + XMM0[3]
movhlps XMM1, XMM0;
addps XMM0, XMM1;
pshufd XMM1, XMM0, 0b01_01_01_01;
addss XMM0, XMM1;
movss tot, XMM0;
}
} else
tot = 0.0;
size_t left = a.length & UNROLL_MASK;
for (size_t i = left; i > 0; i--)
tot += a[$ - i] * b[$ - i];
return tot;
}
And for arrays of doubles:
import std.c.stdio;
void main() {
double[] A = [0.7644108908809033, 0.96458177717869509,
0.51166055832523683, 0.098840360055908461,
0.40813780586233483, 0.32285857447088551,
0.59137950751990331, 0.59518178287473289];
double[] B = [0.28061374331187983, 0.85036446787626108,
0.52498124274748326, 0.84170745998075014,
0.55819169392258683, 0.62586351111477134,
0.43021720539707864, 0.652708603473523];
// >>> sum(a*b for a,b in zip(A, B))
// 2.4593435789602185
if (A.length % 4 != 0) throw new Error("");
double tot1 = void,
tot2 = void;
auto a_ptr = cast(double*)A.ptr;
auto b_ptr = cast(double*)B.ptr;
size_t len = A.length * double.sizeof;
asm {
mov EAX, a_ptr;
mov ECX, b_ptr;
mov EDI, len;
xorps XMM0, XMM0;
xorps XMM3, XMM3;
xor EDX, EDX;
align 8;
LOOP_START:
movapd XMM1, [EAX + EDX];
mulpd XMM1, [ECX + EDX];
movapd XMM2, [EAX + EDX + 16];
mulpd XMM2, [ECX + EDX + 16];
add EDX, 32;
addpd XMM0, XMM1;
cmp EDI, EDX;
addpd XMM3, XMM2;
jne LOOP_START;
addpd XMM0, XMM3; // XMM0 += XMM3
movhpd tot1, XMM0;
movlpd tot2, XMM0;
}
printf("%.15lf\n", tot1 + tot2);
}
See bug 5880 too.
--
Configure issuemail: http://d.puremagic.com/issues/userprefs.cgi?tab=email
------- You are receiving this mail because: -------
More information about the Digitalmars-d-bugs
mailing list