T[new]
bearophile
bearophileHUGS at lycos.com
Mon Aug 10 15:53:35 PDT 2009
dsimcha:
> import std.stdio, std.perf;
[...]
> Results:
> Direct: 226
> Indirect: 229
Thank you. Following your example I have tried to do a bit more realistic tests.
// test1.d, for D2, Phobos
import std.stdio: writeln;
import std.perf: PerformanceCounter;
final class TNew(T) {
T* ptr;
size_t length, capacity;
this(size_t size) {
this.ptr = (new T[size]).ptr;
this.length = size;
this.capacity = size;
}
ref T opIndex(size_t index) {
return this.ptr[index];
}
}
void main() {
// increase N if you have a large L2 cache
enum int N = 100_000;
enum int NLOOP = 40_000;
alias uint T;
auto pc = new PerformanceCounter;
auto a = new T[N];
auto c = new TNew!T(N);
pc.start;
T* p = a.ptr;
for (uint j; j < NLOOP; j++)
for (uint i; i < N; i++)
p[i]++;
pc.stop;
writeln("Direct: ", pc.milliseconds);
pc.start;
for (uint j; j < NLOOP; j++)
for (uint i; i < N; i++)
c[i]++;
pc.stop;
writeln("Class: ", pc.milliseconds);
pc.start;
for (uint j; j < NLOOP; j++)
for (uint i; i < N; i++)
a[i]++;
pc.stop;
writeln("Array: ", pc.milliseconds);
writeln();
pc.start;
for (uint j; j < NLOOP; j++)
for (uint i = N-1; i--; )
p[i]++;
pc.stop;
writeln("Direct: ", pc.milliseconds);
pc.start;
for (uint j; j < NLOOP; j++)
for (uint i = N-1; i--; )
c[i]++;
pc.stop;
writeln("Class: ", pc.milliseconds);
pc.start;
for (uint j; j < NLOOP; j++)
for (uint i = N-1; i--; )
a[i]++;
pc.stop;
writeln("Array: ", pc.milliseconds);
}
Timings on Windows, DMD v2.031, 2 GHz Core2 PC (use a larger N if you have a large L2 CPU Cache):
Direct: 4654
Class: 4670
Array: 4095
Direct: 4650
Class: 4688
Array: 4089
It's curious how the "direct" version is slower than the normal dynamic array (this doesn't happen in LDC, see below).
-----------------------------
Then I've tried to create a version that can be compiled with LDC too (with Tango). The LDC that I have compiles D1 code, so I can't use ref T opIndex(). So I have created the following version, if you spot bugs or mistakes please tell me:
// test2.d, for D1, Phobos and Tango
version (Tango) {
import tango.stdc.stdio: printf;
import tango.time.StopWatch: StopWatch;
} else {
import std.c.stdio: printf;
import std.perf: PerformanceCounter;
}
final class TNew(T) {
T* ptr;
size_t length, capacity;
this(size_t size) {
this.ptr = (new T[size]).ptr;
this.length = size;
this.capacity = size;
}
T* refItem(size_t index) {
return &(this.ptr[index]);
}
}
void main() {
// increase N if you have a large L2 cache
const int N = 100_000;
const int NLOOP = 40_000;
alias uint T;
version (Tango)
StopWatch timer;
else
auto timer = new PerformanceCounter;
int deltaTime;
auto a = new T[N];
auto c = new TNew!(T)(N);
timer.start;
T* p = a.ptr;
for (uint j; j < NLOOP; j++)
for (uint i; i < N; i++)
p[i]++;
version (Tango) {
deltaTime = timer.microsec() / 1000;
} else {
timer.stop;
deltaTime = cast(int)timer.milliseconds;
}
printf("Direct: %d\n", deltaTime);
timer.start;
for (uint j; j < NLOOP; j++)
for (uint i; i < N; i++)
(*c.refItem(i))++;
version (Tango) {
deltaTime = timer.microsec() / 1000;
} else {
timer.stop;
deltaTime = cast(int)timer.milliseconds;
}
printf("Class: %d\n", deltaTime);
timer.start;
for (uint j; j < NLOOP; j++)
for (uint i; i < N; i++)
a[i]++;
version (Tango) {
deltaTime = timer.microsec() / 1000;
} else {
timer.stop;
deltaTime = cast(int)timer.milliseconds;
}
printf("Array: %d\n", deltaTime);
printf("\n");
timer.start;
for (uint j; j < NLOOP; j++)
for (uint i = N-1; i--; )
p[i]++;
version (Tango) {
deltaTime = timer.microsec() / 1000;
} else {
timer.stop;
deltaTime = cast(int)timer.milliseconds;
}
printf("Direct: %d\n", deltaTime);
timer.start;
for (uint j; j < NLOOP; j++)
for (uint i = N-1; i--; )
(*c.refItem(i))++;
version (Tango) {
deltaTime = timer.microsec() / 1000;
} else {
timer.stop;
deltaTime = cast(int)timer.milliseconds;
}
printf("Class: %d\n", deltaTime);
timer.start;
for (uint j; j < NLOOP; j++)
for (uint i = N-1; i--; )
a[i]++;
version (Tango) {
deltaTime = timer.microsec() / 1000;
} else {
timer.stop;
deltaTime = cast(int)timer.milliseconds;
}
printf("Array: %d\n", deltaTime);
}
Timings on Windows:
Direct: 4654
Class: 4810
Array: 4086
Direct: 4665
Class: 4598
Array: 4084
Timings on a 32 bit Linux (ldc -O3 -release -inline):
Direct: 4280
Class: 5000
Array: 4280
Direct: 4280
Class: 4860
Array: 4270
If such tests are correct, then the new resizable arrays of D2 will indeed have some performance penality (something like 13%-17% in this test, but results may change with other array sizes).
Bye,
bearophile
More information about the Digitalmars-d
mailing list