Matrix mul
bearophile
bearophileHUGS at lycos.com
Sun Nov 23 00:33:24 PST 2008
Tom S:
> Nah, it's about NaNs :)
> Version #1 initializes C to NaN, Version #2 initializes it to 0. The
> 'init mats randomly' loop doesn't touch C at all, thus all the latter
> additions leave C at NaN, causing lots of FP exceptions.
You are right, and that was stupid of me. Most of my benchmarks have silly bugs like this at the beginning. Thank you.
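For reference, the root of the trap: D default-initializes floating point values to NaN (while C leaves malloc'd memory arbitrary), so every += into an untouched matrix stays NaN. A minimal demonstration:

import std.stdio: writefln;

void main() {
    auto v = new double[](3); // every element starts as double.init, i.e. NaN
    v[0] += 1.0;              // NaN + 1.0 is still NaN
    writefln("%s", v[0]);     // prints nan
}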
The new code:
import std.conv: toInt;
import std.c.stdlib: rand, malloc;

const int RAND_MAX = short.max; // RAND_MAX seems missing from std.c.stdlib

T[][] NewVoidCMatrix(T)(int nr, int nc) {
    // Part of the code by Marius Muja <mariusm at cs.ubc.ca>
    // Allocates the row index and the payload in a single malloc'd block.
    assert(nr > 0, "NewVoidCMatrix: nr must be > 0.");
    assert(nc > 0, "NewVoidCMatrix: nc must be > 0.");
    void* mem = cast(void*)malloc(nr * (T[]).sizeof + nr * nc * T.sizeof);
    T[]* index = cast(T[]*)mem;
    T* mat = cast(T*)(mem + nr * (T[]).sizeof);
    for (int i = 0; i < nr; ++i) {
        index[i] = mat[0 .. nc];
        mat += nc;
    }
    return index[0 .. nr];
}
void main(string[] args) {
    int i, j, k;
    int N = args.length == 2 ? toInt(args[1]) : 400;

    // alloc A, B, C
    static if (0) {
        // Version #1: GC-allocated jagged dynamic arrays
        auto A = new double[][](N, N);
        auto B = new double[][](N, N);
        auto C = new double[][](N, N);
    } static if (0) {
        // Version #2: dynamic arrays over a single malloc'd block
        auto A = NewVoidCMatrix!(double)(N, N);
        auto B = NewVoidCMatrix!(double)(N, N);
        auto C = NewVoidCMatrix!(double)(N, N);
    } static if (1) {
        // Version #3: C-style arrays, one malloc per row
        double** A = cast(double**)malloc(N * (double*).sizeof);
        for (i = 0; i < N; i++)
            A[i] = cast(double*)malloc(N * (double).sizeof);
        double** B = cast(double**)malloc(N * (double*).sizeof);
        for (i = 0; i < N; i++)
            B[i] = cast(double*)malloc(N * (double).sizeof);
        double** C = cast(double**)malloc(N * (double*).sizeof);
        for (i = 0; i < N; i++)
            C[i] = cast(double*)malloc(N * (double).sizeof);
    } static if (0) {
        // Version #4: C-style arrays, all rows inside one contiguous block
        double** A = cast(double**)malloc(N * (double*).sizeof);
        A[0] = cast(double*)malloc(N * N * (double).sizeof);
        for (i = 1; i < N; i++)
            A[i] = A[0] + i * N;
        double** B = cast(double**)malloc(N * (double*).sizeof);
        B[0] = cast(double*)malloc(N * N * (double).sizeof);
        for (i = 1; i < N; i++)
            B[i] = B[0] + i * N;
        double** C = cast(double**)malloc(N * (double*).sizeof);
        C[0] = cast(double*)malloc(N * N * (double).sizeof);
        for (i = 1; i < N; i++)
            C[i] = C[0] + i * N;
    }

    // init mats randomly (C is zeroed, so the additions below start from 0.0)
    for (i = 0; i < N; i++)
        for (j = 0; j < N; j++) {
            A[i][j] = cast(double)rand() / RAND_MAX;
            B[i][j] = cast(double)rand() / RAND_MAX;
            C[i][j] = 0.0;
        }

    // C = A * B, naive way
    for (i = 0; i < N; i++)
        for (j = 0; j < N; j++)
            for (k = 0; k < N; k++)
                C[i][k] += A[i][j] * B[j][k];
}
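By the way, since NewVoidCMatrix keeps the row index and the payload in a single malloc'd block, one free releases the whole matrix. A possible cleanup helper (hypothetical, not used in the benchmark above):

import std.c.stdlib: free;

void DelVoidCMatrix(T)(T[][] m) {
    // The slice returned by NewVoidCMatrix points at the start of the
    // single malloc'd block, so one free releases index and data together.
    if (m.ptr !is null)
        free(m.ptr);
}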
I have added two more versions for Don, to show how much speed is lost to the spreading of memory caused by malloc (one malloc per row in Version #3 versus a single contiguous block in Version #4). The effect is visible but not significant.
Timings, N = 400:
V1: 0.42 s
V2: 0.41 s (repeatable difference of 0.01 s)
V3: 0.25 s
V4: 0.25 s
So the remaining difference in speed has to come from something else; it may be that DMD doesn't optimize dynamic-array indexing as well as raw C-style arrays.
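One way to test this hypothesis without changing the allocation is to hoist the row pointers out of the inner loops, so the indexing goes through raw pointers as in the C-style versions. An untested sketch (for Versions #1/#2) that replaces the naive loop:

void matMulHoisted(double[][] A, double[][] B, double[][] C, int N) {
    for (int i = 0; i < N; i++) {
        double* Ci = C[i].ptr;         // hoist the row of C
        for (int j = 0; j < N; j++) {
            double aij = A[i][j];
            double* Bj = B[j].ptr;     // hoist the row of B
            for (int k = 0; k < N; k++)
                Ci[k] += aij * Bj[k];
        }
    }
}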
A test I haven't done yet is to change the representation of the dynamic arrays: instead of allocating an array of structs that each hold a len + ptr pair, allocate two "parallel arrays", one holding only the lengths and the other only the pointers. I think this may improve the locality of reference of the pointers, maybe leading to performance similar to the C arrays.
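The layout I have in mind is roughly this (hypothetical, not benchmarked): the N row pointers sit in one contiguous block, like the C versions, while the lengths live in a separate array that the multiplication loop never touches:

import std.c.stdlib: malloc;

double** NewParallelMatrix(int nr, int nc, out int* lens) {
    double** ptrs = cast(double**)malloc(nr * (double*).sizeof);
    double* data = cast(double*)malloc(nr * nc * (double).sizeof);
    lens = cast(int*)malloc(nr * int.sizeof); // lengths kept apart from the pointers
    for (int r = 0; r < nr; r++) {
        ptrs[r] = data + r * nc;
        lens[r] = nc;
    }
    return ptrs;
}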
Bye,
bearophile