How to tune numerical D? (matrix multiplication is faster in g++ vs gdc)
J
private at private-dont-email-dont-spam.com
Sun Mar 3 19:48:44 PST 2013
Dear D pros,
As a fan of D, I was hoping to be able to get similar results as
this fellow on stack overflow, by noting his tuning steps;
http://stackoverflow.com/questions/5142366/how-fast-is-d-compared-to-c
Sadly however, when I pull out a simple matrix multiplication
benchmark from the old language shootout (back when it had D), it
is disturbingly slower in D when pitted against C++.
Details? I ran with very recent gdc (gcc 4.7.2, gdc on the 4.7.2
branch, pullreq #51, commit
b8f5c22b0e7afa7e68a287ed788597e783540063), and the exact same gcc
c++ compiler.
How would I tune this to be more competitive? I'm comparing gdc
vs g++ both built using the exact same gcc-4.7.2 back end, so it
has to be something in the front end. I've disabled GC after the
matrices are made in D, so that doesn't explain it.
What is going on? I'm hoping I'm making a silly, naive, obvious
beginner mistake, but could that be? I'm not sure how to apply
the 'in' argument advice given on stackoverflow; if that is the
answer, could someone summarise the best practice for 'in' use?
Thank you!
- J
$ g++ --version #shows: g++ (GCC) 4.7.2
$ uname -a
Linux gofast 2.6.35-24-generic #42-Ubuntu SMP Thu Dec 2 02:41:37
UTC 2010 x86_64 GNU/Linux
# first, g++, two runs:
$ g++ -O3 matrix.cpp -ocppmatrix
$ time ./cppmatrix
-1015380632 859379360 -367726792 -1548829944
real 1m31.941s
user 1m31.920s
sys 0m0.010s
$ time ./cppmatrix
-1015380632 859379360 -367726792 -1548829944
real 1m32.068s
user 1m32.010s
sys 0m0.050s
# second, gdc, two runs:
$ gdmd -O -inline -release -noboundscheck -m64 matrix.d -ofdmatrix
$ time ./dmatrix
-1015380632 859379360 -367726792 -1548829944
real 2m10.677s
user 2m10.650s
sys 0m0.020s
$
$ time ./dmatrix
-1015380632 859379360 -367726792 -1548829944
real 2m12.664s
user 2m12.600s
sys 0m0.030s
# SIZE = 2000 results:
# It appears D (gdc) is 30% slower than C++ (g++); using the
exact same backend compiler.
# it doesn't even appear to help to request O3 directly: it goes
slower--
$ gdmd -O -q,-O3 -inline -release -noboundscheck -m64 matrix.d
-ofdmatrix
$ time ./dmatrix
-1015380632 859379360 -367726792 -1548829944
real 2m17.107s
user 2m17.080s
sys 0m0.020s
jaten at afarm:~/tmp$
# Though still beating java, but not by much. (Java code not
shown; it's the same source as all of these; the historical
http://shootout.alioth.debian.org/ code from when D was in the
shootout.)
$ time java matrix
-1015380632 859379360 -367726792 -1548829944
real 2m23.739s
user 2m23.650s
sys 0m0.130s
$
Slightly bigger matrix?
SIZE = 2500 results: 25% slower in D
$ time ./cpp.O3.matrix
-1506465222 -119774408 -1600478274 1285663906
real 3m1.340s
user 3m1.290s
sys 0m0.040s
$ time ./dmatrix
-1506465222 -119774408 -1600478274 1285663906
real 4m2.109s
user 4m2.050s
sys 0m0.050s
//////// D version
import core.memory;
import std.stdio, std.string, std.array, std.conv;
const int SIZE = 2000;
int main(string[] args)
{
    // Iteration count comes from argv[1]; default is a single pass.
    immutable int iterations = args.length > 1 ? to!int(args[1]) : 1;

    int[][] m1 = mkmatrix(SIZE, SIZE);
    int[][] m2 = mkmatrix(SIZE, SIZE);
    int[][] mm = mkmatrix(SIZE, SIZE);

    // All allocation is finished; stop the collector so it cannot
    // interfere with the timed multiplication loop.
    GC.disable();

    foreach (_; 0 .. iterations)
    {
        mmult(m1, m2, mm);
    }

    // Print a few sample entries so the work cannot be optimized away.
    writefln("%d %d %d %d", mm[0][0], mm[2][3], mm[3][2], mm[4][4]);
    return 0;
}
/// Build a rows-by-cols matrix whose elements are 1, 2, 3, ...
/// in row-major order.
int[][] mkmatrix(int rows, int cols)
{
    // Allocate all rows in one shot rather than growing incrementally.
    auto m = new int[][](rows, cols);
    int count = 1;
    for (int r = 0; r < rows; r++)
    {
        for (int c = 0; c < cols; c++)
        {
            m[r][c] = count++;
        }
    }
    return m;
}
/// Store the product m1 * m2 into m3 (all assumed square and of equal
/// dimension, as produced by mkmatrix).
///
/// Fix: the original declared the foreach index variables as `int`,
/// forcing a narrowing size_t -> int conversion on a 64-bit build;
/// that conversion is deprecated (now rejected) by modern D compilers.
/// Let the indices take their natural size_t type instead.
void mmult(int[][] m1, int[][] m2, int[][] m3)
{
    foreach (i, const(int)[] m1i; m1)           // i : size_t row index
    {
        foreach (j, ref int m3ij; m3[i])        // j : size_t column index
        {
            int val = 0;
            // Accumulate the dot product of row i of m1 with column j of m2.
            foreach (k, const(int)[] m2k; m2)
            {
                val += m1i[k] * m2k[j];
            }
            m3ij = val;
        }
    }
}
////// C++ version
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#define SIZE 2000
/* Allocate a rows-by-cols matrix and fill it with 1, 2, 3, ... in
 * row-major order.  Fix: the original never checked the malloc
 * results, so an allocation failure would dereference NULL; abort
 * with a diagnostic instead. */
int **mkmatrix(int rows, int cols) {
    int i, j, count = 1;
    int **m = (int **) malloc(rows * sizeof(int *));
    if (m == NULL) {
        fprintf(stderr, "mkmatrix: out of memory\n");
        exit(EXIT_FAILURE);
    }
    for (i = 0; i < rows; i++) {
        m[i] = (int *) malloc(cols * sizeof(int));
        if (m[i] == NULL) {
            fprintf(stderr, "mkmatrix: out of memory\n");
            exit(EXIT_FAILURE);
        }
        for (j = 0; j < cols; j++) {
            m[i][j] = count++;
        }
    }
    return m;
}
/* Set every element of the rows-by-cols matrix m to zero. */
void zeromatrix(int rows, int cols, int **m) {
    for (int r = 0; r < rows; r++) {
        int *row = m[r];
        for (int c = 0; c < cols; c++) {
            row[c] = 0;
        }
    }
}
/* Release each row of m, then the row-pointer array itself. */
void freematrix(int rows, int **m) {
    for (int r = rows - 1; r >= 0; r--) {
        free(m[r]);
    }
    free(m);
}
/* Store the product m1 * m2 into m3 and return m3.  The shared inner
 * dimension is taken to be cols, matching the square matrices this
 * benchmark builds. */
int **mmult(int rows, int cols, int **m1, int **m2, int **m3) {
    for (int r = 0; r < rows; r++) {
        int *m1r = m1[r];
        int *m3r = m3[r];
        for (int c = 0; c < cols; c++) {
            int sum = 0;
            for (int k = 0; k < cols; k++) {
                sum += m1r[k] * m2[k][c];
            }
            m3r[c] = sum;
        }
    }
    return m3;
}
/* Run the SIZE x SIZE multiply n times (n from argv[1], default 1)
 * and print four sample entries so the work is observable. */
int main(int argc, char *argv[]) {
    int n = (argc == 2) ? atoi(argv[1]) : 1;
    int **m1 = mkmatrix(SIZE, SIZE);
    int **m2 = mkmatrix(SIZE, SIZE);
    int **mm = mkmatrix(SIZE, SIZE);
    for (int i = 0; i < n; i++) {
        mm = mmult(SIZE, SIZE, m1, m2, mm);
    }
    printf("%d %d %d %d\n", mm[0][0], mm[2][3], mm[3][2], mm[4][4]);
    freematrix(SIZE, m1);
    freematrix(SIZE, m2);
    freematrix(SIZE, mm);
    return 0;
}
More information about the Digitalmars-d
mailing list