Array append performance
bearophile
bearophileHUGS at lycos.com
Wed Aug 27 06:20:01 PDT 2008
Lionello Lunesu:
> Conclusion: memcpy wins big time for i <= 8192.
I have modified your code a little:
import std.c.string: memcpy;
long timer() {
asm {
naked;
push ECX;
push EBX;
mov EAX, 0;
cpuid;
pop EBX;
pop ECX;
rdtsc;
ret;
}
}
void main() {
const int N = 500_000_000;
alias ubyte DataType;
for (int i = 4; i < N; i *= 2) {
auto b = new DataType[i];
auto a = new DataType[i];
a[] = b[]; // pre-cache
auto count = N / i;
auto start = timer();
while (count--)
a[] = b[];
auto end = timer();
auto t1 = end - start;
count = N / i;
start = timer();
while (count--)
memcpy(a.ptr, b.ptr, a.length * DataType.sizeof);
end = timer();
auto t2 = end - start;
count = N / i;
start = timer();
while (count--)
for (int j; j < i; j++)
a[j] = b[j];
end = timer();
auto t3 = end - start;
count = N / i;
start = timer();
while (count--) {
auto pa = a.ptr;
auto pb = b.ptr;
int nloop = i;
while (nloop--)
*pa++ = *pb++;
}
end = timer();
auto t4 = end - start;
printf("i: %8d,\t[]=[]: %8lld, \tt1/t2: %.2f \tt1/t3: %.2f \tt1/t4: %.2f\n",
i, t1, cast(float)t1/t2, cast(float)t1/t3, cast(float)t1/t4);
}
}
The timings:
N = 500_000_000, DataType = ubyte:
i: 4, []=[]: 9919922030, t1/t2: 4.64 t1/t3: 5.64 t1/t4: 5.27
i: 8, []=[]: 4957640530, t1/t2: 4.63 t1/t3: 3.59 t1/t4: 2.93
i: 16, []=[]: 2478916670, t1/t2: 4.10 t1/t3: 2.08 t1/t4: 1.55
i: 32, []=[]: 1537003970, t1/t2: 2.28 t1/t3: 1.40 t1/t4: 0.99
i: 64, []=[]: 1019418780, t1/t2: 2.59 t1/t3: 0.97 t1/t4: 0.67
i: 128, []=[]: 659281070, t1/t2: 2.54 t1/t3: 0.60 t1/t4: 0.58
i: 256, []=[]: 364781180, t1/t2: 1.90 t1/t3: 0.35 t1/t4: 0.34
i: 512, []=[]: 217207030, t1/t2: 1.56 t1/t3: 0.21 t1/t4: 0.21
i: 1024, []=[]: 143970290, t1/t2: 1.37 t1/t3: 0.14 t1/t4: 0.14
i: 2048, []=[]: 107283300, t1/t2: 1.21 t1/t3: 0.11 t1/t4: 0.11
i: 4096, []=[]: 88801730, t1/t2: 1.11 t1/t3: 0.08 t1/t4: 0.08
i: 8192, []=[]: 79605200, t1/t2: 1.06 t1/t3: 0.07 t1/t4: 0.07
i: 16384, []=[]: 77892400, t1/t2: 0.99 t1/t3: 0.07 t1/t4: 0.07
i: 32768, []=[]: 156926780, t1/t2: 1.03 t1/t3: 0.16 t1/t4: 0.16
i: 65536, []=[]: 154375350, t1/t2: 1.01 t1/t3: 0.15 t1/t4: 0.15
i: 131072, []=[]: 154758850, t1/t2: 1.01 t1/t3: 0.15 t1/t4: 0.15
i: 262144, []=[]: 155175500, t1/t2: 1.01 t1/t3: 0.15 t1/t4: 0.15
i: 524288, []=[]: 159647160, t1/t2: 1.01 t1/t3: 0.16 t1/t4: 0.16
i: 1048576, []=[]: 832590470, t1/t2: 1.00 t1/t3: 0.79 t1/t4: 0.79
i: 2097152, []=[]: 783795470, t1/t2: 1.00 t1/t3: 0.76 t1/t4: 0.75
i: 4194304, []=[]: 828801230, t1/t2: 1.00 t1/t3: 0.79 t1/t4: 0.78
i: 8388608, []=[]: 813813550, t1/t2: 1.00 t1/t3: 0.78 t1/t4: 0.78
i: 16777216, []=[]: 772143790, t1/t2: 1.00 t1/t3: 0.77 t1/t4: 0.76
i: 33554432, []=[]: 772579250, t1/t2: 1.00 t1/t3: 0.78 t1/t4: 0.78
i: 67108864, []=[]: 840918700, t1/t2: 1.00 t1/t3: 0.84 t1/t4: 0.83
i: 134217728, []=[]: 644072850, t1/t2: 1.00 t1/t3: 0.77 t1/t4: 0.77
i: 268435456, []=[]: 446295660, t1/t2: 0.99 t1/t3: 0.79 t1/t4: 0.78
So for DataType = ubyte:
i = 4: T3 better
i in [8, 32768] T2 better
i > 32768: T1 and T2 equally good
N = 500_000_000, DataType = uint:
i: 4, []=[]: 9789704410, t1/t2: 3.72 t1/t3: 6.00 t1/t4: 4.87
i: 8, []=[]: 6088621920, t1/t2: 2.25 t1/t3: 4.62 t1/t4: 3.46
i: 16, []=[]: 4049677930, t1/t2: 2.58 t1/t3: 3.31 t1/t4: 2.48
i: 32, []=[]: 2620962840, t1/t2: 2.53 t1/t3: 2.31 t1/t4: 1.67
i: 64, []=[]: 1451602120, t1/t2: 1.89 t1/t3: 1.39 t1/t4: 0.94
i: 128, []=[]: 867937320, t1/t2: 1.56 t1/t3: 0.79 t1/t4: 0.74
i: 256, []=[]: 575548140, t1/t2: 1.37 t1/t3: 0.55 t1/t4: 0.53
i: 512, []=[]: 428628560, t1/t2: 1.21 t1/t3: 0.42 t1/t4: 0.41
i: 1024, []=[]: 355175800, t1/t2: 1.11 t1/t3: 0.35 t1/t4: 0.35
i: 2048, []=[]: 318880440, t1/t2: 1.06 t1/t3: 0.32 t1/t4: 0.31
i: 4096, []=[]: 307818740, t1/t2: 1.02 t1/t3: 0.30 t1/t4: 0.30
i: 8192, []=[]: 628989600, t1/t2: 1.03 t1/t3: 0.62 t1/t4: 0.62
i: 16384, []=[]: 618088340, t1/t2: 1.01 t1/t3: 0.61 t1/t4: 0.61
i: 32768, []=[]: 620081160, t1/t2: 1.01 t1/t3: 0.62 t1/t4: 0.61
i: 65536, []=[]: 622162940, t1/t2: 1.01 t1/t3: 0.62 t1/t4: 0.61
i: 131072, []=[]: 633820200, t1/t2: 1.00 t1/t3: 0.61 t1/t4: 0.61
i: 262144, []=[]: 3239626180, t1/t2: 1.00 t1/t3: 1.05 t1/t4: 1.04
i: 524288, []=[]: 3145529870, t1/t2: 1.00 t1/t3: 1.04 t1/t4: 1.04
i: 1048576, []=[]: 3319243820, t1/t2: 1.00 t1/t3: 1.05 t1/t4: 1.05
i: 2097152, []=[]: 3292952240, t1/t2: 1.00 t1/t3: 1.04 t1/t4: 1.03
i: 4194304, []=[]: 3159184090, t1/t2: 1.00 t1/t3: 1.04 t1/t4: 1.04
i: 8388608, []=[]: 3261583900, t1/t2: 1.00 t1/t3: 1.04 t1/t4: 1.04
i: 16777216, []=[]: 3462604700, t1/t2: 1.00 t1/t3: 1.04 t1/t4: 1.03
i: 33554432, []=[]: 2971217250, t1/t2: 0.99 t1/t3: 1.05 t1/t4: 1.05
i: 67108864, []=[]: 3136278270, t1/t2: 0.99 t1/t3: 1.03 t1/t4: 1.03
So for DataType = uint:
i in [4, 16]: T3 better
i in [32, 8192] T2 better
i > 16384: T1 and T2 equally good
N = 500_000_000, DataType = ulong:
i: 4, []=[]: 12176650300, t1/t2: 2.26 t1/t3: 5.67 t1/t4: 5.10
i: 8, []=[]: 8095535110, t1/t2: 2.58 t1/t3: 4.39 t1/t4: 3.69
i: 16, []=[]: 5241679510, t1/t2: 2.53 t1/t3: 3.15 t1/t4: 2.49
i: 32, []=[]: 2904270580, t1/t2: 1.89 t1/t3: 1.83 t1/t4: 1.41
i: 64, []=[]: 1733200310, t1/t2: 1.56 t1/t3: 1.12 t1/t4: 0.85
i: 128, []=[]: 1149144250, t1/t2: 1.37 t1/t3: 1.02 t1/t4: 0.70
i: 256, []=[]: 856914580, t1/t2: 1.21 t1/t3: 0.80 t1/t4: 0.54
i: 512, []=[]: 711247630, t1/t2: 1.11 t1/t3: 0.69 t1/t4: 0.46
i: 1024, []=[]: 638308500, t1/t2: 1.06 t1/t3: 0.63 t1/t4: 0.42
i: 2048, []=[]: 614686480, t1/t2: 0.99 t1/t3: 0.57 t1/t4: 0.40
i: 4096, []=[]: 1254471740, t1/t2: 1.02 t1/t3: 0.70 t1/t4: 0.69
i: 8192, []=[]: 1238218710, t1/t2: 1.01 t1/t3: 0.69 t1/t4: 0.68
i: 16384, []=[]: 1240384550, t1/t2: 1.01 t1/t3: 0.69 t1/t4: 0.68
i: 32768, []=[]: 1244624520, t1/t2: 1.01 t1/t3: 0.69 t1/t4: 0.69
i: 65536, []=[]: 1272774050, t1/t2: 1.00 t1/t3: 0.69 t1/t4: 0.69
i: 131072, []=[]: 6532720530, t1/t2: 1.00 t1/t3: 1.05 t1/t4: 1.05
i: 262144, []=[]: 6315104980, t1/t2: 1.00 t1/t3: 1.04 t1/t4: 1.04
i: 524288, []=[]: 6663898750, t1/t2: 1.00 t1/t3: 1.05 t1/t4: 1.05
i: 1048576, []=[]: 6562860030, t1/t2: 1.00 t1/t3: 1.04 t1/t4: 1.04
i: 2097152, []=[]: 6322608930, t1/t2: 1.00 t1/t3: 1.04 t1/t4: 1.04
So for DataType = ulong:
i in [4, 16]: T3 better
i in [32, 4096] T2 better
i > 8192: T1 and T2 equally good
CPU used: Pentium Dual-Core E2180, with caches 64 KB L1 (32 KB data, 32 KB program) and 1 MB L2.
But there are ways to go much faster:
http://cdrom.amd.com/devconn/events/gdc_2002_amd.pdf
http://files.rsdn.ru/23380/AMD_block_prefetch_paper.pdf
Bye,
bearophile
More information about the Digitalmars-d
mailing list