D code optimization

Guillaume Piolat via Digitalmars-d-learn digitalmars-d-learn at puremagic.com
Thu Sep 22 13:04:39 PDT 2016


Hi,

Interesting question, so I took your examples and made them do
the same thing with regard to allocation (using malloc instead
of new in both languages). I also removed the stopwatch and
used "time" instead.
Now the two programs should do the very same thing. Will they
be equally fast?
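
(As an aside, the stopwatch measurement would look something 
like this in D; a minimal sketch using std.datetime.stopwatch 
from recent Phobos, not the exact code from the original post:)

------------------------ timing sketch

import std.stdio;
import std.datetime.stopwatch : StopWatch, AutoStart;

void main() {
    auto sw = StopWatch(AutoStart.yes); // start timing immediately
    // ... benchmark body goes here ...
    sw.stop();
    writeln("elapsed: ", sw.peek.total!"msecs", " ms");
}

------------------------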


D code:

------------------------ bench.d

import std.stdio, std.math;
import core.stdc.stdlib;
import core.stdc.stdio;

int main() {

     double C=0.0;

     for (int k = 0; k < 10000; ++k) { // iterate 10000 times

         double S0 = 100.0;
         double r = 0.03;
         double alpha = 0.07;
         double sigma = 0.2;
         double T = 1.0;
         double strike = 100.0;
         double S = 0.0;


         const int n = 252;

         double dt = T / n;
         double R = exp(r*dt);

         double u = exp(alpha*dt + sigma*sqrt(dt));
         double d = exp(alpha*dt - sigma*sqrt(dt));

         double qU = (R - d) / (R*(u - d));
         double qD = (1 - R*qU) / R;

         double* call = cast(double*)malloc(double.sizeof * (n+1));

         // payoffs at the leaves of the binomial tree
         for (int i = 0; i <= n; ++i)
             call[i] = fmax(S0*pow(u, n-i)*pow(d, i) - strike, 0.0);

         // backward induction: fold each level into the one below it
         for (int i = n-1; i >= 0; --i) {
             for (int j = 0; j <= i; ++j) {
                 call[j] = qU * call[j] + qD * call[j+1];
             }
         }

         C = call[0];
         free(call); // release the per-iteration buffer
     }
     printf("%f\n", C);

     return 0;
}

------------------------
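
(For context: both programs price a European call on an n-step 
binomial tree. The leaves get the payoff 
fmax(S0*u^(n-i)*d^i - strike, 0), and the backward loop folds 
each level into the one below it with the discounted 
risk-neutral weights, call[j] = qU*call[j] + qD*call[j+1], so 
that call[0] ends up holding the price at the root.)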


C++ code


------------------------ bench.cpp

#include <cmath>
#include <cstdlib>
#include <cstdio>

int main() {

     double C=0.0;

     for (int k = 0; k < 10000; ++k) { // iterate 10000 times

         double S0 = 100.0;
         double r = 0.03;
         double alpha = 0.07;
         double sigma = 0.2;
         double T = 1.0;
         double strike = 100.0;
         double S = 0.0;


         const int n = 252;

         double dt = T / n;
         double R = exp(r*dt);

         double u = exp(alpha*dt + sigma*sqrt(dt));
         double d = exp(alpha*dt - sigma*sqrt(dt));

         double qU = (R - d) / (R*(u - d));
         double qD = (1 - R*qU) / R;

         double* call = (double*)malloc(sizeof(double) * (n+1));

         // payoffs at the leaves of the binomial tree
         for (int i = 0; i <= n; ++i)
             call[i] = fmax(S0*pow(u, n-i)*pow(d, i) - strike, 0.0);

         // backward induction: fold each level into the one below it
         for (int i = n-1; i >= 0; --i) {
             for (int j = 0; j <= i; ++j) {
                 call[j] = qU * call[j] + qD * call[j+1];
             }
         }

         C = call[0];
         free(call); // release the per-iteration buffer
     }
     printf("%f\n", C);

     return 0;
}

------------------------


Here is the bench script:


------------------------ bench.sh

#!/bin/sh
ldc2 -O2 bench.d
clang++ -O2 bench.cpp -o bench-cpp
time ./bench
time ./bench-cpp
time ./bench
time ./bench-cpp
time ./bench
time ./bench-cpp
time ./bench
time ./bench-cpp



------------------------

Note that I'm using clang-703.0.31, the one that comes with 
Xcode 7.3, which from what I can gather is based on LLVM 3.8.0.
ldc 1.0.0-b2 is on LLVM 3.8.0 too, so the backend may well be 
out of the equation.


The results at -O2 (minimum over 4 runs):

// C++
real	0m0.484s
user	0m0.466s
sys	0m0.011s

// D
real	0m0.390s
user	0m0.373s
sys	0m0.012s


Why is the D code about 1.25x as fast as the C++ code when they 
do the very same thing?
Well, I don't know; I haven't analyzed it further.
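
If anyone wants to dig further, a reasonable next step would be 
to diff the generated assembly; something like this sketch, 
assuming -output-s (ldc2) and -S (clang++) for emitting 
assembly:

------------------------ asm.sh

#!/bin/sh
# emit assembly from both compilers, then compare the hot loops
ldc2 -O2 -output-s bench.d           # writes bench.s
clang++ -O2 -S bench.cpp -o bench-cpp.s
diff bench.s bench-cpp.s | less

------------------------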







