How to tune numerical D? (matrix multiplication is faster in g++ vs gdc)
J
private at private-dont-email-dont-spam.com
Sun Mar 3 19:48:44 PST 2013
Dear D pros,
As a fan of D, I was hoping to be able to get similar results as
this fellow on stack overflow, by noting his tuning steps;
http://stackoverflow.com/questions/5142366/how-fast-is-d-compared-to-c
Sadly however, when I pull out a simple matrix multiplication
benchmark from the old language shootout (back when it had D), it
is disturbingly slower in D when pitted against C++.
Details? I ran with very recent gdc (gcc 4.7.2, gdc on the 4.7.2
branch, pullreq #51, commit
b8f5c22b0e7afa7e68a287ed788597e783540063), and the exact same gcc
c++ compiler.
How would I tune this to be more competitive? I'm comparing gdc
vs g++ both built using the exact same gcc-4.7.2 back end, so it
has to be something in the front end. I've disabled GC after the
matrices are made in D, so that doesn't explain it.
What is going on? I'm hoping I'm making a silly, naive, obvious
beginner mistake, but could that be? I'm not sure how to apply
the 'in' argument advice given on stackoverflow; if that is the
answer, could someone summarise the best practice for 'in' use?
Thank you!
- J
$ g++ --version #shows: g++ (GCC) 4.7.2
$ uname -a
Linux gofast 2.6.35-24-generic #42-Ubuntu SMP Thu Dec 2 02:41:37
UTC 2010 x86_64 GNU/Linux
# first, g++, two runs:
$ g++ -O3 matrix.cpp -ocppmatrix
$ time ./cppmatrix
-1015380632 859379360 -367726792 -1548829944
real 1m31.941s
user 1m31.920s
sys 0m0.010s
$ time ./cppmatrix
-1015380632 859379360 -367726792 -1548829944
real 1m32.068s
user 1m32.010s
sys 0m0.050s
# second, gdc, two runs:
$ gdmd -O -inline -release -noboundscheck -m64 matrix.d -ofdmatrix
$ time ./dmatrix
-1015380632 859379360 -367726792 -1548829944
real 2m10.677s
user 2m10.650s
sys 0m0.020s
$
$ time ./dmatrix
-1015380632 859379360 -367726792 -1548829944
real 2m12.664s
user 2m12.600s
sys 0m0.030s
# SIZE = 2000 results:
# It appears D (gdc) is 30% slower than C++ (g++); using the
exact same backend compiler.
# it doesn't even appear to help to request O3 directly: it goes
slower--
$ gdmd -O -q,-O3 -inline -release -noboundscheck -m64 matrix.d
-ofdmatrix
$ time ./dmatrix
-1015380632 859379360 -367726792 -1548829944
real 2m17.107s
user 2m17.080s
sys 0m0.020s
jaten at afarm:~/tmp$
# Though still beating java, but not by much. (Java code not
shown; it's the same source as all of these; the historical
http://shootout.alioth.debian.org/ code from when D was in the
shootout.)
$ time java matrix
-1015380632 859379360 -367726792 -1548829944
real 2m23.739s
user 2m23.650s
sys 0m0.130s
$
Slightly bigger matrix?
SIZE = 2500 results: 25% slower in D
$ time ./cpp.O3.matrix
-1506465222 -119774408 -1600478274 1285663906
real 3m1.340s
user 3m1.290s
sys 0m0.040s
$ time ./dmatrix
-1506465222 -119774408 -1600478274 1285663906
real 4m2.109s
user 4m2.050s
sys 0m0.050s
//////// D version
import core.memory;
import std.stdio, std.string, std.array, std.conv;
const int SIZE = 2000;
int main(string[] args)
{
    // Iteration count comes from argv[1]; default is a single pass.
    immutable int iterations = args.length > 1 ? to!int(args[1]) : 1;

    int[][] m1 = mkmatrix(SIZE, SIZE);
    int[][] m2 = mkmatrix(SIZE, SIZE);
    int[][] mm = mkmatrix(SIZE, SIZE);

    // All allocation is finished; stop the collector so it cannot
    // interfere with the timed multiplication loop.
    GC.disable();

    foreach (_; 0 .. iterations)
    {
        mmult(m1, m2, mm);
    }

    // Print a few sample entries so the work cannot be optimized away.
    writefln("%d %d %d %d", mm[0][0], mm[2][3], mm[3][2], mm[4][4]);
    return 0;
}
/// Build a rows-by-cols matrix whose elements are 1, 2, 3, ...
/// in row-major order.
int[][] mkmatrix(int rows, int cols)
{
    // Allocate all rows in one shot rather than growing incrementally.
    auto m = new int[][](rows, cols);
    int count = 1;
    for (int r = 0; r < rows; r++)
    {
        for (int c = 0; c < cols; c++)
        {
            m[r][c] = count++;
        }
    }
    return m;
}
/// Store the product m1 * m2 into m3 (all assumed square and of equal
/// dimension, as produced by mkmatrix).
///
/// Fix: the original declared the foreach index variables as `int`,
/// forcing a narrowing size_t -> int conversion on a 64-bit build;
/// that conversion is deprecated (now rejected) by modern D compilers.
/// Let the indices take their natural size_t type instead.
void mmult(int[][] m1, int[][] m2, int[][] m3)
{
    foreach (i, const(int)[] m1i; m1)           // i : size_t row index
    {
        foreach (j, ref int m3ij; m3[i])        // j : size_t column index
        {
            int val = 0;
            // Accumulate the dot product of row i of m1 with column j of m2.
            foreach (k, const(int)[] m2k; m2)
            {
                val += m1i[k] * m2k[j];
            }
            m3ij = val;
        }
    }
}
////// C++ version
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#define SIZE 2000
/* Allocate a rows-by-cols matrix and fill it with 1, 2, 3, ... in
 * row-major order.  Fix: the original never checked the malloc
 * results, so an allocation failure would dereference NULL; abort
 * with a diagnostic instead. */
int **mkmatrix(int rows, int cols) {
    int i, j, count = 1;
    int **m = (int **) malloc(rows * sizeof(int *));
    if (m == NULL) {
        fprintf(stderr, "mkmatrix: out of memory\n");
        exit(EXIT_FAILURE);
    }
    for (i = 0; i < rows; i++) {
        m[i] = (int *) malloc(cols * sizeof(int));
        if (m[i] == NULL) {
            fprintf(stderr, "mkmatrix: out of memory\n");
            exit(EXIT_FAILURE);
        }
        for (j = 0; j < cols; j++) {
            m[i][j] = count++;
        }
    }
    return m;
}
/* Set every element of the rows-by-cols matrix m to zero. */
void zeromatrix(int rows, int cols, int **m) {
    for (int r = 0; r < rows; r++) {
        int *row = m[r];
        for (int c = 0; c < cols; c++) {
            row[c] = 0;
        }
    }
}
/* Release each row of m, then the row-pointer array itself. */
void freematrix(int rows, int **m) {
    for (int r = rows - 1; r >= 0; r--) {
        free(m[r]);
    }
    free(m);
}
/* Store the product m1 * m2 into m3 and return m3.  The shared inner
 * dimension is taken to be cols, matching the square matrices this
 * benchmark builds. */
int **mmult(int rows, int cols, int **m1, int **m2, int **m3) {
    for (int r = 0; r < rows; r++) {
        int *m1r = m1[r];
        int *m3r = m3[r];
        for (int c = 0; c < cols; c++) {
            int sum = 0;
            for (int k = 0; k < cols; k++) {
                sum += m1r[k] * m2[k][c];
            }
            m3r[c] = sum;
        }
    }
    return m3;
}
/* Run the SIZE x SIZE multiply n times (n from argv[1], default 1)
 * and print four sample entries so the work is observable. */
int main(int argc, char *argv[]) {
    int n = (argc == 2) ? atoi(argv[1]) : 1;
    int **m1 = mkmatrix(SIZE, SIZE);
    int **m2 = mkmatrix(SIZE, SIZE);
    int **mm = mkmatrix(SIZE, SIZE);
    for (int i = 0; i < n; i++) {
        mm = mmult(SIZE, SIZE, m1, m2, mm);
    }
    printf("%d %d %d %d\n", mm[0][0], mm[2][3], mm[3][2], mm[4][4]);
    freematrix(SIZE, m1);
    freematrix(SIZE, m2);
    freematrix(SIZE, mm);
    return 0;
}
More information about the Digitalmars-d
mailing list