memset and related things
bearophile
bearophileHUGS at lycos.com
Sat Sep 19 17:18:43 PDT 2009
In a program I've seen that in the inner loop an array cleaning was taking too much time. To solve the problem I've done many experiments, and I've also produced the following testing program.
The short summary is, to set array of 4 byte integers to a certain constant the best was are:
- if len <~ 20, then just use an inlined loop.
- if 20 < len < 200_000 it's better to use a loop unrolled 4 times with the movaps instruction (8 times unrolled is a little worse).
- if n > 200_000 a loop with the movntps instruction is better.
Generally such solutions are better than the memset() (only when len is about 150_000 memset is a bit better than four movaps).
See also:
http://www.gamedev.net/community/forums/topic.asp?topic_id=532112
http://www.sesp.cse.clrc.ac.uk/html/SoftwareTools/vtune/users_guide/mergedProjects/analyzer_ec/mergedProjects/reference_olh/mergedProjects/instructions/instruct32_hh/vc197.htm
This is a possible function that can be used if a.length > 20, otherwise an inlined loop is faster:
void memset4(T)(T[] a, T value) {
// to be used for len(a) >~ 20
static assert(T.sizeof == 4);
if (!a.length)
return;
auto a_ptr = a.ptr;
auto a_end = a_ptr + a.length;
// align pointer
size_t aux = ((cast(size_t)a_ptr + 15) & ~15);
auto n = cast(T*)aux;
while (a_ptr < n)
*a_ptr++ = value;
n = cast(T*)((cast(size_t)a_end) & ~15);
if (a_ptr < n && (a_end - a_ptr) >= 16) {
// Aligned case
if ((a_end - a_ptr) >= 200_000) {
asm {
mov ESI, a_ptr;
mov EDI, n;
movss XMM0, value; // XMM0 = value,value,value,value
shufps XMM0, XMM0, 0;
align 8;
LOOP1:
add ESI, 64;
movntps [ESI+ 0-64], XMM0;
movntps [ESI+16-64], XMM0;
movntps [ESI+32-64], XMM0;
movntps [ESI+48-64], XMM0;
cmp ESI, EDI;
jb LOOP1;
mov a_ptr, ESI;
}
} else {
asm {
mov ESI, a_ptr;
mov EDI, n;
movss XMM0, value; // XMM0 = value,value,value,value
shufps XMM0, XMM0, 0;
align 8;
LOOP2:
add ESI, 64;
movaps [ESI+ 0-64], XMM0;
movaps [ESI+16-64], XMM0;
movaps [ESI+32-64], XMM0;
movaps [ESI+48-64], XMM0;
cmp ESI, EDI;
jb LOOP2;
mov a_ptr, ESI;
}
}
}
// trailing ones
while (a_ptr < a_end)
*a_ptr++ = value;
}
So the D front end can implement:
int[] a = ...;
a[] = k;
as:
int[] a = ...;
if (a.length < 20)
for (int i; i < a.length; i++)
a[i] = k;
else
memset4(a, k);
Arrays of other types of values need a little different code. Today most CPUs support SSE, but in the uncommon situations where it's not available it can be used C memset() instead of memset4 (the inline for loop is useful anyway when you use C memset).
I am ignorant of SSE asm still, so the code I have written can contain bugs, naive things, etc.
If such code can be debugged, and my timings are meaningful, then the code can be put inside the d front-end, so LDC too can use it (another good thing is for LLVM to improve its memset intrinsic, so LDC front-end doesn't need to perform such thing).
----------------------------------------
// benchmark code
version (Tango) {
import tango.stdc.stdio: printf;
import tango.stdc.time: clock, CLOCKS_PER_SEC;
import tango.math.Math: sqrt;
import tango.stdc.string: memset;
} else {
import std.c.stdio: printf;
import std.c.time: clock, CLOCKS_PER_SEC;
import std.math: sqrt;
import std.c.string: memset;
}
double myclock() {
return cast(double)clock() / CLOCKS_PER_SEC;
}
void memset4_movaps(T)(T[] a, T value) {
// to be used if a.length >~ 20, otherwise an inlined loop is faster
static assert(T.sizeof == 4);
if (!a.length)
return;
auto a_ptr = a.ptr;
auto a_end = a_ptr + a.length;
// align pointer
size_t aux = ((cast(size_t)a_ptr + 15) & ~15);
auto n = cast(T*)aux;
while (a_ptr < n)
*a_ptr++ = value;
n = cast(T*)((cast(size_t)a_end) & ~15);
if (a_ptr < n && (a_end - a_ptr) >= 16) {
// Aligned case
asm {
mov ESI, a_ptr;
mov EDI, n;
movss XMM0, value; // XMM0 = value,value,value,value
shufps XMM0, XMM0, 0;
align 8;
LOOP2:
add ESI, 64;
movaps [ESI+ 0-64], XMM0;
movaps [ESI+16-64], XMM0;
movaps [ESI+32-64], XMM0;
movaps [ESI+48-64], XMM0;
cmp ESI, EDI;
jb LOOP2;
mov a_ptr, ESI;
}
}
// trailing ones
while (a_ptr < a_end)
*a_ptr++ = value;
}
void memset4_movntps(T)(T[] a, T value) {
// to be used if a.length >~ 25, otherwise an inlined loop is faster
static assert(T.sizeof == 4);
if (!a.length)
return;
auto a_ptr = a.ptr;
auto a_end = a_ptr + a.length;
// align pointer
size_t aux = ((cast(size_t)a_ptr + 15) & ~15);
auto n = cast(T*)aux;
while (a_ptr < n)
*a_ptr++ = value;
n = cast(T*)((cast(size_t)a_end) & ~15);
if (a_ptr < n && (a_end - a_ptr) >= 16) {
// Aligned case
asm {
mov ESI, a_ptr;
mov EDI, n;
movss XMM0, value; // XMM0 = value,value,value,value
shufps XMM0, XMM0, 0;
align 8;
LOOP2:
add ESI, 64;
movntps [ESI+ 0-64], XMM0;
movntps [ESI+16-64], XMM0;
movntps [ESI+32-64], XMM0;
movntps [ESI+48-64], XMM0;
cmp ESI, EDI;
jb LOOP2;
mov a_ptr, ESI;
}
}
// trailing ones
while (a_ptr < a_end)
*a_ptr++ = value;
}
T test(T, int ntest)(int len, int nloops) {
auto a = new T[len];
for (int i; i < nloops; i++) {
static if (ntest == 0)
for (int j; j < a.length; j++)
a[j] = T.init;
else static if (ntest == 1)
memset(a.ptr, 0, a.length * T.sizeof);
else static if (ntest == 2)
memset4_movaps(a, 0);
else static if (ntest == 3)
memset4_movntps(a, 0);
a[i % len] = i;
}
return a[0];
}
void show(float[] a) {
printf("[");
if (a.length > 1)
foreach (el; a[0 .. $-1])
printf("%.1f, ", el);
if (a.length)
printf("%.1f", a[$-1]);
printf("]\n");
}
void main() {
// small test
auto a = new float[20];
foreach (i, ref el; a)
el = i;
// [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
show(a);
memset4_movaps(a[2 .. 2], 0.5f);
// [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
show(a);
memset4_movaps(a[2 .. 9], 0.5f);
// [0, 1, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
show(a);
printf("---------------------\n\n");
auto lens = [2, 4, 5, 8, 15, 16, 20, 25, 32, 50, 64, 100, 250, 256, 1000, 2048, 2500,
1 << 12, 1 << 15, 1 << 16, 1 << 17, 210_000, 1 << 18, 1 << 19, 1 << 20];
alias int T;
printf("len, nloops, time with loop, time with memset, time with movaps, time with movntps:\n");
foreach (len; lens) {
int nloops;
if (len >= 8000)
nloops = cast(int)(cast(double)4_000_000 / sqrt(cast(double)len));
else
nloops = cast(int)(cast(double)60_000_000 / sqrt(cast(double)len));
auto t0 = myclock();
//test!(T, 0)(len, nloops);
auto a2 = new T[len];
for (int i; i < nloops; i++) {
for (int j; j < a2.length; j++)
a2[j] = T.init;
a2[i % len] = i;
}
auto t1 = myclock();
auto t2 = myclock();
test!(T, 1)(len, nloops);
auto t3 = myclock();
auto t4 = myclock();
test!(T, 2)(len, nloops);
auto t5 = myclock();
auto t6 = myclock();
test!(T, 3)(len, nloops);
auto t7 = myclock();
printf("%8d %9d %.3f %.3f %.2f %.3f\n", len, nloops, t1-t0, t3-t2, t5-t4, t7-t6);
}
}
----------------------------------------
Timings taken on a Celeron at 2.13 MHz (other CPUs will give different results, I'll take the timings on a Core 2 too):
2 42426406 0.850 1.510 1.020 1.740
4 30000000 0.570 1.180 0.850 0.860
5 26832815 0.480 0.940 0.980 1.000
8 21213203 0.380 0.720 0.860 0.850
15 15491933 0.340 0.580 0.960 0.940
16 15000000 0.350 0.560 0.400 4.420
20 13416407 0.430 0.570 0.370 3.940
25 12000000 0.480 0.540 0.320 3.550
32 10606601 0.420 0.530 0.280 3.140
50 8485281 0.500 0.530 0.360 2.450
64 7500000 0.560 0.510 0.320 2.220
100 6000000 0.690 0.490 0.290 2.020
250 3794733 1.020 0.470 0.280 2.140
256 3750000 0.990 0.460 0.260 2.080
1000 1897366 2.070 0.640 0.340 3.460
2048 1325825 2.890 0.880 0.470 4.750
2500 1200000 3.240 0.920 0.500 5.300
4096 937500 4.070 1.150 0.630 6.660
32768 22097 0.570 0.320 0.250 1.300
65536 15625 0.850 0.450 0.380 1.760
131072 11048 1.630 1.150 1.240 2.500
210000 8728 3.740 2.630 3.150 3.160
262144 7812 5.480 4.050 5.040 3.550
524288 5524 11.120 9.200 11.020 5.110
1048576 3906 15.850 13.190 15.820 7.210
Graph in PNG:
http://i37.tinypic.com/v836e1.png
Bye,
bearophile
More information about the Digitalmars-d
mailing list