Array append performance

bearophile bearophileHUGS at lycoc.com
Wed Aug 27 16:03:52 PDT 2008


Fawzi Mohamed:
> > memory ordering probably, looping from 0 to 7 is faster than 7 to 0...

I don't think that's the problem.


> so
...
> should be better...

It is not, it seems. For reference below some code you can run:


import std.c.string: memcpy;

template Tuple(T...) {
    alias T Tuple;
}

long timer() {
    asm {
        naked;
        push ECX;
        push EBX;
        mov EAX, 0;
        cpuid;
        pop EBX;
        pop ECX;
        rdtsc;
        ret;
    }
}

template Range(int stop) {
    static if (stop <= 0)
        alias Tuple!() Range;
    else
        alias Tuple!(Range!(stop-1), stop-1) Range;
}

void main() {
    const int N = 500_000_000;
    alias uint DataType;

    for (int i = 4; i < N; i *= 2) {
        auto b = new DataType[i];
        auto a = new DataType[i];

        a[] = b[]; // pre-cache

        auto count = N / i;
        auto start = timer();
        while (count--)
            a[] = b[];
        auto end = timer();
        auto t1 = end - start;

        count = N / i;
        start = timer();
        while (count--)
            memcpy(a.ptr, b.ptr, a.length * DataType.sizeof);
        end = timer();
        auto t2 = end - start;

        count = N / i;
        start = timer();
        while (count--)
            for (int j; j < i; j++)
                a[j] = b[j];
        end = timer();
        auto t3 = end - start;

        count = N / i;
        start = timer();
        while (count--) {
            auto pa = a.ptr;
            auto pb = b.ptr;
            int nloop = i;
            while (nloop--)
                *pa++ = *pb++;
        }
        end = timer();
        auto t4 = end - start;

        count = N / i;
        start = timer();
        while (count--) { // apparently the faster version
            switch (i) {
                case 0: break;
                case 1: foreach (j; Range!(1)) a[j] = b[j]; break;
                case 2: foreach (j; Range!(2)) a[j] = b[j]; break;
                case 3: foreach (j; Range!(3)) a[j] = b[j]; break;
                case 4: foreach (j; Range!(4)) a[j] = b[j]; break;
                case 5: foreach (j; Range!(5)) a[j] = b[j]; break;
                case 6: foreach (j; Range!(6)) a[j] = b[j]; break;
                case 7: foreach (j; Range!(7)) a[j] = b[j]; break;
                case 8: foreach (j; Range!(8)) a[j] = b[j]; break;
                default: memcpy(a.ptr, b.ptr, a.length * DataType.sizeof);
            }
        }
        end = timer();
        auto t5 = end - start;

        count = N / i;
        start = timer();
        while (count--) {
            int j = i;
            switch(i) {
                case 8: a[j] = b[j]; ++j;
                case 7: a[j] = b[j]; ++j;
                case 6: a[j] = b[j]; ++j;
                case 5: a[j] = b[j]; ++j;
                case 4: a[j] = b[j]; ++j;
                case 3: a[j] = b[j]; ++j;
                case 1: a[j] = b[j];
                case 0: break;
                default: memcpy( a.ptr, b.ptr, a.length * DataType.sizeof );
            }
        }
        end = timer();
        auto t6 = end - start;

        printf("i: %8d,\t[]=[]: %8lld, \tt1/t2: %.2f  \tt1/t3: %.2f  \tt1/t4: %.2f  \tt1/t5: %.2f  \tt1/t6: %.2f\n",
               i, t1, cast(float)t1/t2, cast(float)t1/t3, cast(float)t1/t4, cast(float)t1/t5,
               cast(float)t1/t6);
    }
}

Bye,
bearophile



More information about the Digitalmars-d mailing list