How to tune numerical D? (matrix multiplication is faster in g++ vs gdc)
J
private at private-dont-email-dont-spam.com
Sun Mar 3 20:50:59 PST 2013
On Monday, 4 March 2013 at 04:22:01 UTC, bearophile wrote:
> So this should be better:
>
> http://codepad.org/B5b4uyBM
>
> Bye,
> bearophile
@bearophile: Thank you! Unfortunately the
http://codepad.org/B5b4uyBM code runs a bit *slower* than the
original D code. Yikes!
$ gdmd -O -inline -release -noboundscheck -m64 bear.d -ofdbear
$ time ./dbear
-1015380632 859379360 -367726792 -1548829944
real 2m36.971s
user 2m36.910s
sys 0m0.030s
$ time ./dbear
-1015380632 859379360 -367726792 -1548829944
real 2m34.425s
user 2m34.370s
sys 0m0.020s
@John Colvin: here is the disassembly of mmult() in both
languages. Unfortunately I'm not literate in x86_64 assembly.
Perhaps the problem is obvious to you? All I can really tell is
that the g++ version is shorter.
The memory allocation, when timed separately (with mmult commented
out), takes less than 60 msec for either version, so I don't think
it's a memory allocation issue, although it could be a caching issue
since the matrix layouts are different.
### gdc version of mmult:
(gdb) disas /m _D6matrix5mmultFAAiAAiAAiZv
Dump of assembler code for function _D6matrix5mmultFAAiAAiAAiZv:
56 void mmult(int[][] m1, int[][] m2, int[][] m3)
0x00000000004352a0 <+0>: push %r15
0x00000000004352a5 <+5>: push %r14
0x00000000004352a7 <+7>: push %r13
0x00000000004352a9 <+9>: push %r12
0x00000000004352ab <+11>: mov %rdx,%r12
0x00000000004352ae <+14>: push %rbp
0x00000000004352af <+15>: mov %rsi,%rbp
0x00000000004352b2 <+18>: push %rbx
0x00000000004352b3 <+19>: mov %r9,-0x40(%rsp)
0x00000000004352b8 <+24>: mov %rdi,-0x10(%rsp)
0x00000000004352bd <+29>: mov %rsi,-0x8(%rsp)
0x00000000004352c2 <+34>: mov %rdx,-0x20(%rsp)
0x00000000004352c7 <+39>: mov %rcx,-0x18(%rsp)
0x00000000004352cc <+44>: mov %r8,-0x30(%rsp)
0x00000000004352d1 <+49>: mov %r9,-0x28(%rsp)
0x00000000004352dc <+60>: add $0x1,%rdi
0x00000000004352e0 <+64>: lea 0x1(%rdx),%rdx
0x00000000004352e4 <+68>: mov $0x1,%r15d
0x00000000004352ea <+74>: mov %rdi,-0x38(%rsp)
0x0000000000435315 <+117>: add $0x1,%r13
0x0000000000435319 <+121>: mov $0x1,%r11d
0x0000000000435330 <+144>: mov $0x1,%esi
57 {
58 foreach(int i, int[] m1i; m1)
0x00000000004352a2 <+2>: test %rdi,%rdi
0x00000000004352d6 <+54>: je 0x4353aa
<_D6matrix5mmultFAAiAAiAAiZv+266>
0x00000000004352ef <+79>: xor %esi,%esi
0x00000000004352f1 <+81>: mov %rsi,%rax
0x00000000004352f4 <+84>: shl $0x4,%rax
0x00000000004352f8 <+88>: mov 0x8(%rbp,%rax,1),%r10
0x0000000000435398 <+248>: cmp -0x38(%rsp),%rax
0x000000000043539d <+253>: mov %r15,%rsi
0x00000000004353a0 <+256>: je 0x4353aa
<_D6matrix5mmultFAAiAAiAAiZv+266>
0x00000000004353a2 <+258>: mov %rax,%r15
0x00000000004353a5 <+261>: jmpq 0x4352f1
<_D6matrix5mmultFAAiAAiAAiZv+81>
59 {
60 foreach(int j, ref int m3ij; m3[i])
0x00000000004352fd <+93>: add -0x40(%rsp),%rax
0x0000000000435302 <+98>: mov (%rax),%r13
0x0000000000435305 <+101>: mov 0x8(%rax),%r14
0x0000000000435309 <+105>: test %r13,%r13
0x000000000043530c <+108>: je 0x435394
<_D6matrix5mmultFAAiAAiAAiZv+244>
0x0000000000435312 <+114>: xor %r9d,%r9d
0x000000000043531f <+127>: shl $0x2,%r9
0x0000000000435326 <+134>: lea (%r14,%r9,1),%rbx
0x000000000043536c <+204>: mov %r11,%r9
0x000000000043536f <+207>: cmp %r13,%rax
0x0000000000435372 <+210>: je 0x435394
<_D6matrix5mmultFAAiAAiAAiZv+244>
0x0000000000435374 <+212>: shl $0x2,%r9
0x000000000043537b <+219>: mov %rax,%r11
0x000000000043537e <+222>: lea (%r14,%r9,1),%rbx
0x000000000043538a <+234>: mov %r11,%r9
0x000000000043538f <+239>: cmp %r13,%rax
0x0000000000435392 <+242>: jne 0x435374
<_D6matrix5mmultFAAiAAiAAiZv+212>
0x0000000000435394 <+244>: lea 0x1(%r15),%rax
61 {
62 int val;
0x0000000000435337 <+151>: xor %edi,%edi
0x0000000000435339 <+153>: jmp 0x435343
<_D6matrix5mmultFAAiAAiAAiZv+163>
0x000000000043533b <+155>: nopl 0x0(%rax,%rax,1)
0x0000000000435388 <+232>: xor %edi,%edi
63 foreach(int k, int[] m2k; m2)
0x0000000000435323 <+131>: test %r12,%r12
0x000000000043532a <+138>: je 0x435384
<_D6matrix5mmultFAAiAAiAAiZv+228>
0x000000000043532c <+140>: nopl 0x0(%rax)
0x0000000000435335 <+149>: xor %eax,%eax
0x0000000000435340 <+160>: mov %r8,%rsi
0x0000000000435343 <+163>: mov %rax,%r8
0x000000000043534a <+170>: shl $0x4,%r8
0x000000000043535e <+190>: cmp %rdx,%r8
0x0000000000435361 <+193>: mov %rsi,%rax
0x0000000000435364 <+196>: jne 0x435340
<_D6matrix5mmultFAAiAAiAAiZv+160>
0x0000000000435366 <+198>: lea 0x1(%r11),%rax
0x0000000000435378 <+216>: test %r12,%r12
0x0000000000435382 <+226>: jne 0x435330
<_D6matrix5mmultFAAiAAiAAiZv+144>
0x0000000000435384 <+228>: lea 0x1(%r11),%rax
64 {
65 val += m1i[k] * m2k[j];
0x0000000000435346 <+166>: mov (%r10,%rax,4),%eax
0x000000000043534e <+174>: mov 0x8(%rcx,%r8,1),%r8
0x0000000000435353 <+179>: imul (%r8,%r9,1),%eax
0x0000000000435358 <+184>: lea 0x1(%rsi),%r8
0x000000000043535c <+188>: add %eax,%edi
66 }
67 m3ij = val;
0x000000000043536a <+202>: mov %edi,(%rbx)
0x000000000043538d <+237>: mov %edi,(%rbx)
68 }
69 }
70 }
0x00000000004353aa <+266>: pop %rbx
0x00000000004353ab <+267>: pop %rbp
0x00000000004353ac <+268>: pop %r12
0x00000000004353ae <+270>: pop %r13
0x00000000004353b0 <+272>: pop %r14
0x00000000004353b2 <+274>: pop %r15
0x00000000004353b4 <+276>: retq
0x00000000004353b5: data32 nopw %cs:0x0(%rax,%rax,1)
End of assembler dump.
(gdb)
### g++ version of mmult:
(gdb) disas /m mmult
Dump of assembler code for function mmult(int, int, int**, int**,
int**):
36 int **mmult(int rows, int cols, int **m1, int **m2, int **m3) {
0x0000000000400a10 <+0>: push %r14
0x0000000000400a14 <+4>: push %r13
0x0000000000400a16 <+6>: mov %r8,%r13
0x0000000000400a19 <+9>: push %r12
0x0000000000400a1b <+11>: push %rbp
0x0000000000400a1c <+12>: push %rbx
0x0000000000400a1d <+13>: mov %edi,%ebx
0x0000000000400a21 <+17>: lea -0x1(%rsi),%eax
0x0000000000400a24 <+20>: mov %rdx,%r12
0x0000000000400a27 <+23>: xor %ebp,%ebp
0x0000000000400a29 <+25>: lea 0x4(,%rax,4),%rdi
0x0000000000400a40 <+48>: xor %r9d,%r9d
0x0000000000400a43 <+51>: xor %r11d,%r11d
0x0000000000400a46 <+54>: nopw %cs:0x0(%rax,%rax,1)
37 int i, j, k, val;
38 for (i=0; i<rows; i++) {
0x0000000000400a12 <+2>: test %edi,%edi
0x0000000000400a1f <+15>: jle 0x400a7e <mmult(int, int,
int**, int**, int**)+110>
0x0000000000400a7a <+106>: cmp %ebp,%ebx
0x0000000000400a7c <+108>: jg 0x400a31 <mmult(int, int,
int**, int**, int**)+33>
39 for (j=0; j<cols; j++) {
0x0000000000400a31 <+33>: test %esi,%esi
0x0000000000400a33 <+35>: jle 0x400a76 <mmult(int, int,
int**, int**, int**)+102>
0x0000000000400a35 <+37>: mov (%r12,%rbp,8),%r8
0x0000000000400a39 <+41>: mov 0x0(%r13,%rbp,8),%rdx
0x0000000000400a3e <+46>: xor %eax,%eax
0x0000000000400a71 <+97>: cmp %rdi,%rax
0x0000000000400a74 <+100>: jne 0x400a40 <mmult(int, int,
int**, int**, int**)+48>
0x0000000000400a76 <+102>: add $0x1,%rbp
40 val = 0;
41 for (k=0; k<cols; k++) {
0x0000000000400a64 <+84>: cmp %r9d,%esi
0x0000000000400a67 <+87>: jg 0x400a50 <mmult(int, int,
int**, int**, int**)+64>
42 val += m1[i][k] * m2[k][j];
0x0000000000400a50 <+64>: mov (%rcx,%r9,8),%r14
0x0000000000400a54 <+68>: mov (%r8,%r9,4),%r10d
0x0000000000400a58 <+72>: add $0x1,%r9
0x0000000000400a5c <+76>: imul (%r14,%rax,1),%r10d
0x0000000000400a61 <+81>: add %r10d,%r11d
43 }
44 m3[i][j] = val;
0x0000000000400a69 <+89>: mov %r11d,(%rdx,%rax,1)
0x0000000000400a6d <+93>: add $0x4,%rax
45 }
46 }
47 return(m3);
48 }
0x0000000000400a7e <+110>: pop %rbx
0x0000000000400a7f <+111>: pop %rbp
0x0000000000400a80 <+112>: pop %r12
0x0000000000400a82 <+114>: mov %r13,%rax
0x0000000000400a85 <+117>: pop %r13
0x0000000000400a87 <+119>: pop %r14
0x0000000000400a89 <+121>: retq
End of assembler dump.
(gdb)
More information about the Digitalmars-d
mailing list