How to sum multidimensional arrays?

Fri Feb 28 03:48:59 UTC 2020

On Thursday, 27 February 2020 at 23:15:28 UTC, p.shkadzko wrote:
> And it works effortlessly!
> Sum of two 5000 x 6000 int arrays is just 0.105 sec! (on a 
> Windows machine though but with weaker CPU).
>
> I bet using mir.ndslice instead of D arrays would be even 
> faster.

Yes, the output for the following benchmark shows that Mir is 43% 
faster.
However, when I checked the assembly output, both Mir and 
Std (really LDC in both cases) generate almost the same, best 
possible loops with AVX instructions for the summation.

On the other hand, Mir is faster because it generates the random 
matrices faster and uses uninitialized memory for the summation 
target.

Output:
```
std: 426 ms, 432 μs, and 1 hnsec |10
mir: 297 ms, 694 μs, and 3 hnsecs |10
```

Run command:

`dub --build=release --single --compiler=ldc2 test.d`

Note that the `-mcpu=native` flag is passed to LDC (via the `dflags` line in the dub.sdl header of the source below).

Source:
```
/+dub.sdl:
dependency "mir-algorithm" version="~>3.7.17"
dependency "mir-random" version="~>2.2.10"
dflags "-mcpu=native" platform="ldc"
+/

// Result sink shared by both benchmarks: each test function stores the last
// element of its summed matrix here, and main() prints it next to the timing.
// NOTE(review): presumably this also keeps the optimizer from discarding the
// summation as dead code — confirm.
int val;

/// Benchmark body for the built-in-array / Phobos variant.
///
/// Builds two random 5000 x 6000 `int` matrices stored as flat arrays, adds
/// them element-wise with an array operation, converts the sum to an array of
/// rows, and stores the very last element of the result in the global `val`.
void testStd()
{
     // Ask the compiler not to inline this function into its caller.
     pragma(inline, false);

     enum rowCount = 5000;
     enum colCount = 6000;

     /// Flat element storage plus the number of columns per row.
     static struct Matrix(T)
     {
         import std.range;
         T[] elems;
         int cols;

         /// Splits `elems` into consecutive chunks of `cols` elements and
         /// collects those chunks into an array of rows.
         T[][] to2D()
         {
             return array(chunks(elems, cols));
         }
     }

     /// Element-wise sum of two matrices, returned as an array of rows.
     /// The shape of the result is taken from `lhs`.
     static auto matrixSum(Matrix!int lhs, Matrix!int rhs)
     {
         Matrix!int total;
         total.cols = lhs.cols;
         // Growing the empty array allocates the target buffer
         // (default-initialised) before the array operation fills it.
         total.elems.length = lhs.elems.length;
         total.elems[] = lhs.elems[] + rhs.elems[];
         return total.to2D;
     }

     /// Returns `count` values drawn with `uniform(0, upperBound, ...)`
     /// from a default-constructed `Xorshift` engine.
     static T[] rndArr(T)(in T upperBound, in int count)
     {
         import std.random;
         import std.range;
         Xorshift engine;
         auto stream = generate(() => uniform(0, upperBound, engine));
         return array(take(stream, count));
     }

     auto a = Matrix!int(rndArr!int(10, rowCount * colCount), colCount);
     auto b = Matrix!int(rndArr!int(10, rowCount * colCount), colCount);
     auto rows = matrixSum(a, b);

     // Publish the bottom-right element of the sum through the global sink.
     auto lastRow = rows[$ - 1];
     val = lastRow[$ - 1];
}

/// Benchmark body for the Mir variant.
///
/// Builds two random 5000 x 6000 `int` slices with `randomSlice`, adds them,
/// materialises the sum with `slice`, and stores the last element of the
/// result in the global `val`.
void testMir()
{
     // Ask the compiler not to inline this function into its caller.
     pragma(inline, false);
     import mir.ndslice;
     import mir.random: threadLocal;
     import mir.random.variable: uniformVar;
     import mir.random.algorithm: randomSlice;
     import mir.random.engine.xorshift;

     // Both operands draw from the same `threadLocal!Xorshift` engine using
     // `uniformVar!int(0, 10)`.
     // NOTE(review): whether the upper bound 10 is inclusive is not visible
     // here — confirm against the mir-random documentation.
     auto m1 = threadLocal!Xorshift.randomSlice(uniformVar!int(0, 
10), [5000, 6000]);
     auto m2 = threadLocal!Xorshift.randomSlice(uniformVar!int(0, 
10), [5000, 6000]);
     // NOTE(review): `m1 + m2` is presumably a lazy element-wise expression
     // that `slice` evaluates into a newly allocated result — confirm against
     // the mir.ndslice documentation.
     auto m3 = slice(m1 + m2);
     // Publish the bottom-right element of the sum through the global sink.
     val = m3[$-1][$-1];
}

/// Runs each benchmark once with the GC disabled and prints, per benchmark,
/// its label, the elapsed wall-clock time, and the value left in `val`.
void main()
{
     import std.datetime.stopwatch;
     import std.stdio;
     import core.memory;

     // Disable GC collections for the whole run.
     GC.disable();

     StopWatch timer;

     // Times one benchmark from a freshly reset stopwatch and reports it.
     void measure(string label, void function() benchmark)
     {
         timer.reset();
         timer.start();
         benchmark();
         timer.stop();
         writeln(label, timer.peek(), " |", val);
     }

     measure("std: ", &testStd);
     measure("mir: ", &testMir);
}
```