T[new]

Mon Aug 10 15:53:35 PDT 2009

dsimcha:

> import std.stdio, std.perf;
[...]
> Results:
> Direct:  226
> Indirect:  229

Thank you. Following your example I have tried to do a bit more realistic tests.

// test1.d, for D2, Phobos
import std.stdio: writeln;
import std.perf: PerformanceCounter;

final class TNew(T) {
    T* ptr;
    size_t length, capacity;

    this(size_t size) {
        this.ptr = (new T[size]).ptr;
        this.length = size;
        this.capacity = size;
    }

    ref T opIndex(size_t index) {
        return this.ptr[index];
    }
}

void main() {
    // increase N if you have a large L2 cache
    enum int N = 100_000;

    enum int NLOOP = 40_000;
    alias uint T;

    auto pc = new PerformanceCounter;
    auto a = new T[N];
    auto c = new TNew!T(N);

    pc.start;
    T* p = a.ptr;
    for (uint j; j < NLOOP; j++)
        for (uint i; i < N; i++)
            p[i]++;
    pc.stop;
    writeln("Direct: ", pc.milliseconds);

    pc.start;
    for (uint j; j < NLOOP; j++)
        for (uint i; i < N; i++)
            c[i]++;
    pc.stop;
    writeln("Class:  ", pc.milliseconds);

    pc.start;
    for (uint j; j < NLOOP; j++)
        for (uint i; i < N; i++)
            a[i]++;
    pc.stop;
    writeln("Array:  ", pc.milliseconds);
    writeln();

    pc.start;
    for (uint j; j < NLOOP; j++)
        for (uint i = N-1; i--; )
            p[i]++;
    pc.stop;
    writeln("Direct: ", pc.milliseconds);

    pc.start;
    for (uint j; j < NLOOP; j++)
        for (uint i = N-1; i--; )
            c[i]++;
    pc.stop;
    writeln("Class:  ", pc.milliseconds);

    pc.start;
    for (uint j; j < NLOOP; j++)
        for (uint i = N-1; i--; )
            a[i]++;
    pc.stop;
    writeln("Array:  ", pc.milliseconds);
}

Timings on Windows, DMD v2.031, 2 GHz Core2 PC (use a larger N if you have a large L2 CPU Cache):
Direct: 4654
Class:  4670
Array:  4095

Direct: 4650
Class:  4688
Array:  4089

It's curious how the "direct" version is slower than the normal dynamic array (this doesn't happen in LDC, see below).

-----------------------------

Then I've tried to create a version that can be compiled with LDC too (with Tango). The LDC that I have compiles D1 code, so I can't use ref T opIndex(). So I have created the following version, if you spot bugs or mistakes please tell me:

// test2.d, for D1, Phobos and Tango
version (Tango) {
    import tango.stdc.stdio: printf;
    import tango.time.StopWatch: StopWatch;
} else {
    import std.c.stdio: printf;
    import std.perf: PerformanceCounter;
}

final class TNew(T) {
    T* ptr;
    size_t length, capacity;

    this(size_t size) {
        this.ptr = (new T[size]).ptr;
        this.length = size;
        this.capacity = size;
    }

    T* refItem(size_t index) {
        return &(this.ptr[index]);
    }
}

void main() {
    // increase N if you have a large L2 cache
    const int N = 100_000;

    const int NLOOP = 40_000;
    alias uint T;

    version (Tango)
        StopWatch timer;
    else
        auto timer = new PerformanceCounter;
    int deltaTime;

    auto a = new T[N];
    auto c = new TNew!(T)(N);

    timer.start;
    T* p = a.ptr;
    for (uint j; j < NLOOP; j++)
        for (uint i; i < N; i++)
            p[i]++;
    version (Tango) {
        deltaTime = timer.microsec() / 1000;
    } else {
        timer.stop;
        deltaTime = cast(int)timer.milliseconds;
    }
    printf("Direct: %d\n", deltaTime);

    timer.start;
    for (uint j; j < NLOOP; j++)
        for (uint i; i < N; i++)
            (*c.refItem(i))++;
    version (Tango) {
        deltaTime = timer.microsec() / 1000;
    } else {
        timer.stop;
        deltaTime = cast(int)timer.milliseconds;
    }
    printf("Class:  %d\n", deltaTime);

    timer.start;
    for (uint j; j < NLOOP; j++)
        for (uint i; i < N; i++)
            a[i]++;
    version (Tango) {
        deltaTime = timer.microsec() / 1000;
    } else {
        timer.stop;
        deltaTime = cast(int)timer.milliseconds;
    }
    printf("Array:  %d\n", deltaTime);
    printf("\n");

    timer.start;
    for (uint j; j < NLOOP; j++)
        for (uint i = N-1; i--; )
            p[i]++;
    version (Tango) {
        deltaTime = timer.microsec() / 1000;
    } else {
        timer.stop;
        deltaTime = cast(int)timer.milliseconds;
    }
    printf("Direct: %d\n", deltaTime);

    timer.start;
    for (uint j; j < NLOOP; j++)
        for (uint i = N-1; i--; )
            (*c.refItem(i))++;
    version (Tango) {
        deltaTime = timer.microsec() / 1000;
    } else {
        timer.stop;
        deltaTime = cast(int)timer.milliseconds;
    }
    printf("Class:  %d\n", deltaTime);

    timer.start;
    for (uint j; j < NLOOP; j++)
        for (uint i = N-1; i--; )
            a[i]++;
    version (Tango) {
        deltaTime = timer.microsec() / 1000;
    } else {
        timer.stop;
        deltaTime = cast(int)timer.milliseconds;
    }
    printf("Array:  %d\n", deltaTime);
}

Timings on Windows:
Direct: 4654
Class:  4810
Array:  4086

Direct: 4665
Class:  4598
Array:  4084

Timings on a 32 bit Linux (ldc -O3 -release -inline):
Direct: 4280
Class:  5000
Array:  4280

Direct: 4280
Class:  4860
Array:  4270

If such tests are correct, then the new resizable arrays of D2 will indeed have some performance penality (something like 13%-17% in this test, but results may change with other array sizes).

Bye,
bearophile