memset and related things

bearophile bearophileHUGS at lycos.com
Sun Sep 20 10:16:10 PDT 2009


I think this version is a bit better:

/**
 * Fills every element of `a` with `value`, using SSE stores for the bulk
 * of the array.
 *
 * The work is split into three phases:
 *   1. a scalar head loop that advances to the first 16-byte boundary
 *      (never past the end of the slice);
 *   2. an unrolled SSE loop that writes whole 64-byte blocks, using
 *      non-temporal stores (movntps) for large fills and ordinary aligned
 *      stores (movaps) otherwise;
 *   3. a scalar tail loop for whatever is left (fewer than 64 bytes).
 *
 * The inline assembly uses ESI/EDI and XMM0, so this is 32-bit x86 + SSE
 * only. T must be exactly 4 bytes wide.
 *
 * Params:
 *   a     = slice to fill; an empty slice is a no-op
 *   value = value stored into every element (defaults to T.init)
 */
void memset4(T)(T[] a, T value=T.init) {
    static assert (T.sizeof == 4);
    static assert (size_t.sizeof == (T*).sizeof);
    if (!a.length)
        return;
    auto a_ptr = a.ptr;
    auto a_end = a_ptr + a.length;

    // Phase 1: scalar stores up to the first 16-byte boundary. The limit is
    // clamped to a_end so that a short slice is never overrun.
    size_t head_end = (cast(size_t)a_ptr + 15) & ~cast(size_t)15;
    if (head_end > cast(size_t)a_end)
        head_end = cast(size_t)a_end;
    while (cast(size_t)a_ptr < head_end)
        *a_ptr++ = value;

    // Phase 2 bounds: number of bytes coverable by whole 64-byte blocks
    // starting at a_ptr. Computing it relative to a_ptr (instead of rounding
    // a_end down) guarantees the block loop ends exactly on a_end_trimmed
    // and can never underflow or overshoot.
    // If a.ptr was not 4-byte aligned the head loop cannot reach a 16-byte
    // boundary; in that case skip the SSE path, since movaps/movntps require
    // 16-byte aligned addresses.
    size_t block_bytes = 0;
    if ((cast(size_t)a_ptr & 15) == 0)
        block_bytes = (cast(size_t)a_end - cast(size_t)a_ptr) & ~cast(size_t)63;
    size_t a_end_trimmed = cast(size_t)a_ptr + block_bytes;

    if (block_bytes > (200_000 * T.sizeof)) {
        // Large fill: non-temporal stores avoid polluting the cache.
        asm {
            mov ESI, a_ptr;
            mov EDI, a_end_trimmed;

            // XMM0 = value, value, value, value
            movss XMM0, value;
            shufps XMM0, XMM0, 0;

            align 8;
            LOOP1: // writes 4 * 16 = 64 bytes each iteration
                add ESI, 64;
                movntps [ESI+ 0-64], XMM0;
                movntps [ESI+16-64], XMM0;
                movntps [ESI+32-64], XMM0;
                movntps [ESI+48-64], XMM0;
                cmp ESI, EDI;
            jb LOOP1;

            // movntps stores are weakly ordered; fence so they are globally
            // visible before any later store.
            sfence;

            mov a_ptr, ESI;
        }
    } else if (block_bytes > 0) {
        // Small/medium fill: ordinary aligned stores.
        asm {
            mov ESI, a_ptr;
            mov EDI, a_end_trimmed;

            // XMM0 = value, value, value, value
            movss XMM0, value;
            shufps XMM0, XMM0, 0;

            align 8;
            LOOP2: // writes 4 * 16 = 64 bytes each iteration
                add ESI, 64;
                movaps [ESI+ 0-64], XMM0;
                movaps [ESI+16-64], XMM0;
                movaps [ESI+32-64], XMM0;
                movaps [ESI+48-64], XMM0;
                cmp ESI, EDI;
            jb LOOP2;

            mov a_ptr, ESI;
        }
    }

    // Phase 3: scalar stores for the remaining tail (fewer than 64 bytes,
    // or everything after the head when the SSE path was skipped).
    while (a_ptr < a_end)
        *a_ptr++ = value;
}


Bye,
bearophile



More information about the Digitalmars-d mailing list