Array assign
bearophile
bearophileHUGS at lycos.com
Fri Sep 18 17:57:59 PDT 2009
I've found that when lenght is small (about 50-250 4-byte items or less) the array filling operation is quite slow compared to a normal loop:
a[] = x;
So I suggest DMD frontend to inline little loop when a.length is small.
To further improve the a[]=x; I have tried to speed up the larger case too, so I've used the movntps instruction. See info:
http://www.gamedev.net/community/forums/topic.asp?topic_id=532112
http://www.sesp.cse.clrc.ac.uk/html/SoftwareTools/vtune/users_guide/mergedProjects/analyzer_ec/mergedProjects/reference_olh/mergedProjects/instructions/instruct32_hh/vc197.htm
The following is my attempt, my experience of x86 asm programming is minimal still (I'll try to learn more) so while this program seems to somehow work, it's a pessimization compared to a normal loop, so probably I've done several mistakes/bugs. Can someone help me improve the code? If the result is fast enough, something similar can be added to d runtime.
This array4Set() is designed for 4 bytes long items, but I can write something similar for other data sizes too.
version (Tango) {
import tango.stdc.stdio: printf;
import tango.stdc.time: clock, CLOCKS_PER_SEC;
import tango.math.Math: sqrt;
import tango.stdc.string: memset;
} else {
import std.c.stdio: printf;
import std.c.time: clock, CLOCKS_PER_SEC;
import std.math: sqrt;
import std.c.string: memset;
}
double myclock() {
return cast(double)clock() / CLOCKS_PER_SEC;
}
void array4Set(T)(T[] a, T value) {
static assert(T.sizeof == 4);
if (!a.length)
return;
auto a_ptr = a.ptr;
auto a_end = a_ptr + a.length;
// align pointer
size_t aux = ((cast(size_t)a_ptr + 15) & ~15);
auto n = cast(T*)aux;
while (a_ptr < n)
*a_ptr++ = value;
n = cast(T*)((cast(size_t)a_end) & ~15);
if (a_ptr < n && (a_end - a_ptr) >= 16) {
// Aligned case
asm {
mov ESI, a_ptr;
mov EDI, n;
movss XMM0, value; // XMM0 = value,value,value,value
shufps XMM0, XMM0, 0;
align 8;
LOOP:
add ESI, 64;
movntps [ESI+ 0-64], XMM0;
movntps [ESI+16-64], XMM0;
movntps [ESI+32-64], XMM0;
movntps [ESI+48-64], XMM0;
cmp ESI, EDI;
jb LOOP;
mov a_ptr, ESI;
}
}
// trailing ones
while (a_ptr < a_end)
*a_ptr++ = value;
}
T test(T, bool withLoop)(int len, int nloops) {
auto a = new T[len];
for (int i; i < nloops; i++) {
static if (withLoop)
for (int j; j < a.length; j++)
a[j] = T.init;
else
//a[] = T.init;
//memset(a.ptr, 0, a.length * T.sizeof);
array4Set(a, 0);
a[i % len] = i;
}
return a[0];
}
void show(float[] a) {
printf("[");
if (a.length > 1)
foreach (el; a[0 .. $-1])
printf("%.1f, ", el);
if (a.length)
printf("%.1f", a[$-1]);
printf("]\n");
}
void main() {
// small test
auto a = new float[20];
foreach (i, ref el; a)
el = i;
// [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
show(a);
array4Set(a[2 .. 2], 0.5f);
// [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
show(a);
array4Set(a[2 .. 9], 0.5f);
// [0, 1, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
show(a);
printf("---------------------\n\n");
auto lens = [2, 4, 5, 8, 15, 16, 25, 32, 50, 64, 100, 250, 256, 1000, 2048, 2500];
//auto lens = [1 << 12, 1 << 13, 1 << 15, 1 << 16, 1 << 17];
foreach (len; lens) {
int nloops = cast(int)(cast(double)60_000_000 / sqrt(cast(double)len));
auto t0 = myclock();
alias int T;
test!(T, true)(len, nloops);
auto t1 = myclock();
auto t2 = myclock();
test!(T, false)(len, nloops);
auto t3 = myclock();
printf("len=%d, nloops=%d, time with loop=%.3f, time without loop=%.3f, ratio=%.3f\n",
len, nloops, t1-t0, t3-t2, (t1-t0) / (t3-t2));
}
}
Bye,
bearophile
More information about the Digitalmars-d-learn
mailing list