/***********************
 * Computes:
 *      a[] = b[] + c[]
 */

T[] _arraySliceSliceAddSliceAssign_d(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 333% faster
        if (sse2() && b.length >= 16)
        {
            // Round b.length down to a multiple of 16 elements; each
            // asm loop iteration below handles 8 doubles (64 bytes).
            auto n = aptr + (b.length & ~15);

            if (((cast(size_t) aptr | cast(size_t) bptr | cast(size_t) cptr) & 15) == 0)
            {
                // Aligned case: all three pointers are 16-byte aligned,
                // so the faster movapd loads/stores can be used.
                asm
                {
                    mov EAX, bptr; // left operand
                    mov ECX, cptr; // right operand
                    mov ESI, aptr; // destination operand
                    mov EDI, n;    // end comparison

                    align 8;
                startsseloopa:
                    movapd XMM0, [EAX];
                    movapd XMM1, [EAX+16];
                    movapd XMM2, [EAX+32];
                    movapd XMM3, [EAX+48];
                    add EAX, 64;
                    movapd XMM4, [ECX];
                    movapd XMM5, [ECX+16];
                    movapd XMM6, [ECX+32];
                    movapd XMM7, [ECX+48];
                    add ESI, 64;
                    addpd XMM0, XMM4;
                    addpd XMM1, XMM5;
                    addpd XMM2, XMM6;
                    addpd XMM3, XMM7;
                    add ECX, 64;
                    movapd [ESI+ 0-64], XMM0;
                    movapd [ESI+16-64], XMM1;
                    movapd [ESI+32-64], XMM2;
                    movapd [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsseloopa;

                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
            else
            {
                // Unaligned case: same loop body, but with movupd
                asm
                {
                    mov EAX, bptr; // left operand
                    mov ECX, cptr; // right operand
                    mov ESI, aptr; // destination operand
                    mov EDI, n;    // end comparison

                    align 8;
                startsseloopu:
                    movupd XMM0, [EAX];
                    movupd XMM1, [EAX+16];
                    movupd XMM2, [EAX+32];
                    movupd XMM3, [EAX+48];
                    add EAX, 64;
                    movupd XMM4, [ECX];
                    movupd XMM5, [ECX+16];
                    movupd XMM6, [ECX+32];
                    movupd XMM7, [ECX+48];
                    add ESI, 64;
                    addpd XMM0, XMM4;
                    addpd XMM1, XMM5;
                    addpd XMM2, XMM6;
                    addpd XMM3, XMM7;
                    add ECX, 64;
                    movupd [ESI+ 0-64], XMM0;
                    movupd [ESI+16-64], XMM1;
                    movupd [ESI+32-64], XMM2;
                    movupd [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsseloopu;

                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
        }
    }

    // Handle remainder (and the whole array when the asm path is not taken)
    while (aptr < aend)
        *aptr++ = *bptr++ + *cptr++;

    return a;
}
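
/**********************************************
 * Minimal sanity-test sketch for the hook above. It assumes T aliases
 * double (as in rt.arraydouble) and that it compiles inside the same
 * module, since sse2() and disjoint() are module-internal helpers.
 * A length of 23 exercises both the SSE2 loop (16 elements) and the
 * scalar remainder (7 elements); the freshly allocated arrays satisfy
 * the disjointness contract. Note the argument order: the destination
 * comes first, then c, then b.
 */
unittest
{
    enum n = 23;
    T[] a = new T[n];
    T[] b = new T[n];
    T[] c = new T[n];

    foreach (i; 0 .. n)
    {
        b[i] = i;
        c[i] = 2 * i;
    }

    // Computes a[] = b[] + c[]
    _arraySliceSliceAddSliceAssign_d(a, c, b);

    foreach (i; 0 .. n)
        assert(a[i] == b[i] + c[i]);
}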