memset and related things
bearophile
bearophileHUGS at lycos.com
Sun Sep 20 10:16:10 PDT 2009
I think this version is a bit better:
/**
 * Fills every element of the slice `a` with `value`.
 *
 * Only element types that are exactly 4 bytes wide are accepted (checked at
 * compile time). Where 32-bit x86 inline assembly is available, the bulk of
 * the slice is written 64 bytes at a time with SSE stores; everything that
 * cannot be covered by a whole, 16-byte aligned 64-byte block is written one
 * element at a time. On other targets the whole slice is filled by the plain
 * element loop.
 *
 * Params:
 *     a     = slice to overwrite; an empty slice is a no-op
 *     value = 4-byte pattern stored into every element (defaults to T.init)
 */
void memset4(T)(T[] a, T value=T.init) {
    static assert (T.sizeof == 4);
    static assert (size_t.sizeof == (T*).sizeof);

    if (!a.length)
        return;

    auto a_ptr = a.ptr;
    auto a_end = a_ptr + a.length;

    version (D_InlineAsm_X86) {
        // The vector path steps in 4-byte elements towards a 16-byte boundary,
        // which is only reachable when the slice itself starts on a multiple
        // of 4. Otherwise movaps/movntps would fault, so stay scalar.
        if ((cast(size_t)a_ptr & (T.sizeof - 1)) == 0) {
            // Scalar head: advance to the next 16-byte boundary, but never
            // past the end of the slice (short slices may end before it).
            size_t aligned_start = (cast(size_t)a_ptr + 15) & ~cast(size_t)15;
            while (cast(size_t)a_ptr < aligned_start && a_ptr < a_end)
                *a_ptr++ = value;

            // End of the last whole 64-byte block, measured from the aligned
            // start rather than as an absolute address. This keeps
            // (block_end - a_ptr) a multiple of 64, so the asm loops below
            // land exactly on block_end and never write beyond it.
            size_t remaining = cast(size_t)a_end - cast(size_t)a_ptr;
            size_t block_end = cast(size_t)a_ptr + (remaining & ~cast(size_t)63);

            // Both asm loops test their condition at the bottom, so they must
            // only be entered when at least one full block exists.
            if (block_end != cast(size_t)a_ptr) {
                if (block_end - cast(size_t)a_ptr > (200_000 * T.sizeof)) {
                    // Large fill: non-temporal stores.
                    // NOTE(review): the 200_000-element threshold is empirical;
                    // presumably it approximates "larger than the cache" - tune per target.
                    asm {
                        mov ESI, a_ptr;
                        mov EDI, block_end;
                        // XMM0 = value, value, value, value
                        movss XMM0, value;
                        shufps XMM0, XMM0, 0;
                        align 8;
                    LOOP1: // writes 4 * 4 * 4 bytes each loop
                        add ESI, 64;
                        movntps [ESI+ 0-64], XMM0;
                        movntps [ESI+16-64], XMM0;
                        movntps [ESI+32-64], XMM0;
                        movntps [ESI+48-64], XMM0;
                        cmp ESI, EDI;
                        jb LOOP1;
                        // movntps stores are weakly ordered; fence them so they
                        // are globally visible before any store that follows.
                        sfence;
                        mov a_ptr, ESI;
                    }
                } else {
                    // Smaller fill: ordinary aligned stores.
                    asm {
                        mov ESI, a_ptr;
                        mov EDI, block_end;
                        // XMM0 = value, value, value, value
                        movss XMM0, value;
                        shufps XMM0, XMM0, 0;
                        align 8;
                    LOOP2: // writes 4 * 4 * 4 bytes each loop
                        add ESI, 64;
                        movaps [ESI+ 0-64], XMM0;
                        movaps [ESI+16-64], XMM0;
                        movaps [ESI+32-64], XMM0;
                        movaps [ESI+48-64], XMM0;
                        cmp ESI, EDI;
                        jb LOOP2;
                        mov a_ptr, ESI;
                    }
                }
            }
        }
    }

    // Scalar tail: whatever the block loops did not cover (fewer than 64
    // bytes after the vector path, or the entire slice when it was skipped).
    while (a_ptr < a_end)
        *a_ptr++ = value;
}
Bye,
bearophile
More information about the Digitalmars-d
mailing list