Matrix mul
bearophile
bearophileHUGS at lycos.com
Sat Nov 22 03:16:55 PST 2008
While writing code that works on matrices I have found something curious, so I have written the following little benchmark. As usual keep eyes open for possible bugs and mistakes of mine:
import std.conv: toInt;
import std.c.stdlib: rand, malloc;
const int RAND_MAX = short.max; // RAND_MAX seems missing from std.c.stdlib
void main(string[] args) {
int i, j, k;
int N = args.length == 2 ? toInt(args[1]) : 400;
// alloc A, B, C
static if (1) {
// Version #1
auto A = new double[][](N, N);
auto B = new double[][](N, N);
auto C = new double[][](N, N);
} else {
// Version #2
double** A = cast(double**)malloc(N * (double*).sizeof);
for (i = 0; i < N; i++)
A[i] = cast(double*)malloc(N * (double).sizeof);
double** B = cast(double**)malloc(N * (double*).sizeof);
for (i = 0; i < N; i++)
B[i] = cast(double*)malloc(N * (double).sizeof);
double** C = cast(double**)malloc(N * (double*).sizeof);
for (i = 0; i < N; i++)
C[i] = cast(double*)malloc(N * (double).sizeof);
}
// init mats randomly
for (i = 0; i < N; i++)
for (j = 0; j < N; j++) {
A[i][j] = cast(double)rand() / RAND_MAX;
B[i][j] = cast(double)rand() / RAND_MAX;
}
// C = A * B, naive way
for (i = 0; i < N; i++)
for (j = 0; j < N; j++)
for (k = 0; k < N; k++)
C[i][k] += A[i][j] * B[j][k];
}
Timings, Core2, 2 GHz, DMD V.1.036:
N = 100:
Version 1: 0.17 s
Version 2: 0.03 s
N = 400:
Version 1: 9.25 s
Version 2: 0.26 s
N = 512:
Version 1: 19.3 s
Version 2: 1.0 s
--------------------------
Note there are much faster ways to perform A * B:
// Alternative mul code
double[] bj = new double[N];
for (int j = 0; j < N; j++) {
for (int k = 0; k < N; k++)
bj[k] = B[k][j];
for (int i = 0; i < N; i++) {
double[] ai = A[i];
double s = 0;
for (int k = 0; k < N; k++)
s += ai[k] * bj[k];
C[i][j] = s;
}
}
--------------------------------
Inner loop asm, version #1:
L164:
mov ECX,[EAX]
fld qword ptr [ESI*8][ECX]
mov ECX,0[EBP]
fmul qword ptr [EBX*8][ECX]
mov ECX,[EDX]
fadd qword ptr [EBX*8][ECX]
fstp qword ptr [EBX*8][ECX]
inc EBX
cmp EBX,EDI
jl L164
Inner loop asm, version #2:
L12B:
mov EDX,4[EBX]
mov EAX,[EBX]
fld qword ptr [EDI*8][EDX]
mov ECX,010h[ESP]
mov EDX,4[ECX]
mov EAX,[ECX]
fmul qword ptr [ESI*8][EDX]
mov ECX,014h[ESP]
mov EDX,4[ECX]
mov EAX,[ECX]
fadd qword ptr [ESI*8][EDX]
fstp qword ptr [ESI*8][EDX]
inc ESI
cmp ESI,EBP
jl L12B
Note: compiling the alternative mul code written in C with GCC the running speed is about twice still compared to the Version2.
Bye,
bearophile
More information about the Digitalmars-d
mailing list