Array append performance

bearophile bearophileHUGS at lycos.com
Wed Aug 27 06:20:01 PDT 2008


Lionello Lunesu:
> Conclusion: memcpy wins big time for i <= 8192.

I have modified your code a little:

import std.c.string: memcpy;

long timer() {
    asm {
        naked;
        push ECX;
        push EBX;
        mov EAX, 0;
        cpuid;
        pop EBX;
        pop ECX;
        rdtsc;
        ret;
    }
}

void main() {
    const int N = 500_000_000;
    alias ubyte DataType;

    for (int i = 4; i < N; i *= 2) {
        auto b = new DataType[i];
        auto a = new DataType[i];

        a[] = b[]; // pre-cache

        auto count = N / i;
        auto start = timer();
        while (count--)
            a[] = b[];
        auto end = timer();
        auto t1 = end - start;

        count = N / i;
        start = timer();
        while (count--)
            memcpy(a.ptr, b.ptr, a.length * DataType.sizeof);
        end = timer();
        auto t2 = end - start;

        count = N / i;
        start = timer();
        while (count--)
            for (int j; j < i; j++)
                a[j] = b[j];
        end = timer();
        auto t3 = end - start;

        count = N / i;
        start = timer();
        while (count--) {
            auto pa = a.ptr;
            auto pb = b.ptr;
            int nloop = i;
            while (nloop--)
                *pa++ = *pb++;
        }
        end = timer();
        auto t4 = end - start;

        printf("i: %8d,\t[]=[]: %8lld, \tt1/t2: %.2f  \tt1/t3: %.2f  \tt1/t4: %.2f\n",
               i, t1, cast(float)t1/t2, cast(float)t1/t3, cast(float)t1/t4);
    }
}


The timings:

N = 500_000_000, DataType = ubyte:
i:        4,    []=[]: 9919922030,      t1/t2: 4.64     t1/t3: 5.64     t1/t4: 5.27
i:        8,    []=[]: 4957640530,      t1/t2: 4.63     t1/t3: 3.59     t1/t4: 2.93
i:       16,    []=[]: 2478916670,      t1/t2: 4.10     t1/t3: 2.08     t1/t4: 1.55
i:       32,    []=[]: 1537003970,      t1/t2: 2.28     t1/t3: 1.40     t1/t4: 0.99
i:       64,    []=[]: 1019418780,      t1/t2: 2.59     t1/t3: 0.97     t1/t4: 0.67
i:      128,    []=[]: 659281070,       t1/t2: 2.54     t1/t3: 0.60     t1/t4: 0.58
i:      256,    []=[]: 364781180,       t1/t2: 1.90     t1/t3: 0.35     t1/t4: 0.34
i:      512,    []=[]: 217207030,       t1/t2: 1.56     t1/t3: 0.21     t1/t4: 0.21
i:     1024,    []=[]: 143970290,       t1/t2: 1.37     t1/t3: 0.14     t1/t4: 0.14
i:     2048,    []=[]: 107283300,       t1/t2: 1.21     t1/t3: 0.11     t1/t4: 0.11
i:     4096,    []=[]: 88801730,        t1/t2: 1.11     t1/t3: 0.08     t1/t4: 0.08
i:     8192,    []=[]: 79605200,        t1/t2: 1.06     t1/t3: 0.07     t1/t4: 0.07
i:    16384,    []=[]: 77892400,        t1/t2: 0.99     t1/t3: 0.07     t1/t4: 0.07
i:    32768,    []=[]: 156926780,       t1/t2: 1.03     t1/t3: 0.16     t1/t4: 0.16
i:    65536,    []=[]: 154375350,       t1/t2: 1.01     t1/t3: 0.15     t1/t4: 0.15
i:   131072,    []=[]: 154758850,       t1/t2: 1.01     t1/t3: 0.15     t1/t4: 0.15
i:   262144,    []=[]: 155175500,       t1/t2: 1.01     t1/t3: 0.15     t1/t4: 0.15
i:   524288,    []=[]: 159647160,       t1/t2: 1.01     t1/t3: 0.16     t1/t4: 0.16
i:  1048576,    []=[]: 832590470,       t1/t2: 1.00     t1/t3: 0.79     t1/t4: 0.79
i:  2097152,    []=[]: 783795470,       t1/t2: 1.00     t1/t3: 0.76     t1/t4: 0.75
i:  4194304,    []=[]: 828801230,       t1/t2: 1.00     t1/t3: 0.79     t1/t4: 0.78
i:  8388608,    []=[]: 813813550,       t1/t2: 1.00     t1/t3: 0.78     t1/t4: 0.78
i: 16777216,    []=[]: 772143790,       t1/t2: 1.00     t1/t3: 0.77     t1/t4: 0.76
i: 33554432,    []=[]: 772579250,       t1/t2: 1.00     t1/t3: 0.78     t1/t4: 0.78
i: 67108864,    []=[]: 840918700,       t1/t2: 1.00     t1/t3: 0.84     t1/t4: 0.83
i: 134217728,   []=[]: 644072850,       t1/t2: 1.00     t1/t3: 0.77     t1/t4: 0.77
i: 268435456,   []=[]: 446295660,       t1/t2: 0.99     t1/t3: 0.79     t1/t4: 0.78

So for DataType = ubyte:
  i = 4: T3 better
  i in [8, 32768] T2 better
  i > 32768: T1 and T2 equally good


N = 500_000_000, DataType = uint:
i:        4,    []=[]: 9789704410,      t1/t2: 3.72     t1/t3: 6.00     t1/t4: 4.87
i:        8,    []=[]: 6088621920,      t1/t2: 2.25     t1/t3: 4.62     t1/t4: 3.46
i:       16,    []=[]: 4049677930,      t1/t2: 2.58     t1/t3: 3.31     t1/t4: 2.48
i:       32,    []=[]: 2620962840,      t1/t2: 2.53     t1/t3: 2.31     t1/t4: 1.67
i:       64,    []=[]: 1451602120,      t1/t2: 1.89     t1/t3: 1.39     t1/t4: 0.94
i:      128,    []=[]: 867937320,       t1/t2: 1.56     t1/t3: 0.79     t1/t4: 0.74
i:      256,    []=[]: 575548140,       t1/t2: 1.37     t1/t3: 0.55     t1/t4: 0.53
i:      512,    []=[]: 428628560,       t1/t2: 1.21     t1/t3: 0.42     t1/t4: 0.41
i:     1024,    []=[]: 355175800,       t1/t2: 1.11     t1/t3: 0.35     t1/t4: 0.35
i:     2048,    []=[]: 318880440,       t1/t2: 1.06     t1/t3: 0.32     t1/t4: 0.31
i:     4096,    []=[]: 307818740,       t1/t2: 1.02     t1/t3: 0.30     t1/t4: 0.30
i:     8192,    []=[]: 628989600,       t1/t2: 1.03     t1/t3: 0.62     t1/t4: 0.62
i:    16384,    []=[]: 618088340,       t1/t2: 1.01     t1/t3: 0.61     t1/t4: 0.61
i:    32768,    []=[]: 620081160,       t1/t2: 1.01     t1/t3: 0.62     t1/t4: 0.61
i:    65536,    []=[]: 622162940,       t1/t2: 1.01     t1/t3: 0.62     t1/t4: 0.61
i:   131072,    []=[]: 633820200,       t1/t2: 1.00     t1/t3: 0.61     t1/t4: 0.61
i:   262144,    []=[]: 3239626180,      t1/t2: 1.00     t1/t3: 1.05     t1/t4: 1.04
i:   524288,    []=[]: 3145529870,      t1/t2: 1.00     t1/t3: 1.04     t1/t4: 1.04
i:  1048576,    []=[]: 3319243820,      t1/t2: 1.00     t1/t3: 1.05     t1/t4: 1.05
i:  2097152,    []=[]: 3292952240,      t1/t2: 1.00     t1/t3: 1.04     t1/t4: 1.03
i:  4194304,    []=[]: 3159184090,      t1/t2: 1.00     t1/t3: 1.04     t1/t4: 1.04
i:  8388608,    []=[]: 3261583900,      t1/t2: 1.00     t1/t3: 1.04     t1/t4: 1.04
i: 16777216,    []=[]: 3462604700,      t1/t2: 1.00     t1/t3: 1.04     t1/t4: 1.03
i: 33554432,    []=[]: 2971217250,      t1/t2: 0.99     t1/t3: 1.05     t1/t4: 1.05
i: 67108864,    []=[]: 3136278270,      t1/t2: 0.99     t1/t3: 1.03     t1/t4: 1.03

So for DataType = uint:
  i in [4, 16]: T3 better
  i in [32, 8192] T2 better
  i > 16384: T1 and T2 equally good


N = 500_000_000, DataType = ulong:
i:        4,    []=[]: 12176650300,     t1/t2: 2.26     t1/t3: 5.67     t1/t4: 5.10
i:        8,    []=[]: 8095535110,      t1/t2: 2.58     t1/t3: 4.39     t1/t4: 3.69
i:       16,    []=[]: 5241679510,      t1/t2: 2.53     t1/t3: 3.15     t1/t4: 2.49
i:       32,    []=[]: 2904270580,      t1/t2: 1.89     t1/t3: 1.83     t1/t4: 1.41
i:       64,    []=[]: 1733200310,      t1/t2: 1.56     t1/t3: 1.12     t1/t4: 0.85
i:      128,    []=[]: 1149144250,      t1/t2: 1.37     t1/t3: 1.02     t1/t4: 0.70
i:      256,    []=[]: 856914580,       t1/t2: 1.21     t1/t3: 0.80     t1/t4: 0.54
i:      512,    []=[]: 711247630,       t1/t2: 1.11     t1/t3: 0.69     t1/t4: 0.46
i:     1024,    []=[]: 638308500,       t1/t2: 1.06     t1/t3: 0.63     t1/t4: 0.42
i:     2048,    []=[]: 614686480,       t1/t2: 0.99     t1/t3: 0.57     t1/t4: 0.40
i:     4096,    []=[]: 1254471740,      t1/t2: 1.02     t1/t3: 0.70     t1/t4: 0.69
i:     8192,    []=[]: 1238218710,      t1/t2: 1.01     t1/t3: 0.69     t1/t4: 0.68
i:    16384,    []=[]: 1240384550,      t1/t2: 1.01     t1/t3: 0.69     t1/t4: 0.68
i:    32768,    []=[]: 1244624520,      t1/t2: 1.01     t1/t3: 0.69     t1/t4: 0.69
i:    65536,    []=[]: 1272774050,      t1/t2: 1.00     t1/t3: 0.69     t1/t4: 0.69
i:   131072,    []=[]: 6532720530,      t1/t2: 1.00     t1/t3: 1.05     t1/t4: 1.05
i:   262144,    []=[]: 6315104980,      t1/t2: 1.00     t1/t3: 1.04     t1/t4: 1.04
i:   524288,    []=[]: 6663898750,      t1/t2: 1.00     t1/t3: 1.05     t1/t4: 1.05
i:  1048576,    []=[]: 6562860030,      t1/t2: 1.00     t1/t3: 1.04     t1/t4: 1.04
i:  2097152,    []=[]: 6322608930,      t1/t2: 1.00     t1/t3: 1.04     t1/t4: 1.04

So for DataType = ulong:
  i in [4, 16]: T3 better
  i in [32, 4096] T2 better
  i > 8192: T1 and T2 equally good


CPU used: Pentium Dual-Core E2180, with caches 64 KB L1 (32 KB data, 32 KB program) and 1 MB L2.

But there are ways to go much faster:
http://cdrom.amd.com/devconn/events/gdc_2002_amd.pdf
http://files.rsdn.ru/23380/AMD_block_prefetch_paper.pdf

Bye,
bearophile



More information about the Digitalmars-d mailing list