A small problem
bearophile
bearophileHUGS at lycos.com
Sun Dec 1 15:14:08 PST 2013
I have found a small performance problem porting some C code to D
and compiling it with LDC2. This post shows a reduced version of
the D code. In this post I am not using LTO (link time
optimization).
This version uses a private module-level N:
private __gshared private uint N = 9;
__gshared int[16 * 16] M;
int bar() nothrow {
int tot = 0;
foreach (immutable i; 0 .. N)
foreach (immutable j; 0 .. N)
tot += M[i * N + j];
return tot;
}
int main(in string[] args) nothrow {
N = 9;
M[0] = args.length;
return bar();
}
The asm of the two functions:
__D5test13barFNbZi:
pushl %ebp
pushl %ebx
pushl %edi
pushl %esi
xorl %eax, %eax
movl __D5test11Nk, %ecx
testl %ecx, %ecx
je LBB0_5
leal (,%ecx,4), %edx
xorl %eax, %eax
movl $__D5test11MG256i, %esi
xorl %edi, %edi
.align 16, 0x90
LBB0_4:
movl %esi, %ebx
movl %ecx, %ebp
.align 16, 0x90
LBB0_2:
addl (%ebx), %eax
addl $4, %ebx
decl %ebp
jne LBB0_2
addl %edx, %esi
incl %edi
cmpl %ecx, %edi
jne LBB0_4
LBB0_5:
popl %esi
popl %edi
popl %ebx
popl %ebp
ret
__Dmain:
pushl %edi
pushl %esi
movl $9, __D5test11Nk
movl 12(%esp), %eax
movl %eax, __D5test11MG256i
xorl %eax, %eax
movl $__D5test11MG256i, %ecx
xorl %edx, %edx
.align 16, 0x90
LBB1_3:
movl $9, %esi
movl %ecx, %edi
.align 16, 0x90
LBB1_1:
addl (%edi), %eax
addl $4, %edi
decl %esi
jne LBB1_1
addl $36, %ecx
incl %edx
cmpl $9, %edx
jne LBB1_3
popl %esi
popl %edi
ret
A modified program, now N is a compile-time constant:
enum uint N = 9;
__gshared int[16 * 16] M;
int bar() nothrow {
int tot = 0;
foreach (immutable i; 0 .. N)
foreach (immutable j; 0 .. N)
tot += M[i * N + j];
return tot;
}
int main(in string[] args) nothrow {
//N = 9;
M[0] = args.length;
return bar();
}
And the asm of the two functions:
__D5test23barFNbZi:
xorl %eax, %eax
movl $-324, %ecx
.align 16, 0x90
LBB0_1:
addl __D5test21MG256i+324(%ecx), %eax
addl __D5test21MG256i+328(%ecx), %eax
addl __D5test21MG256i+332(%ecx), %eax
addl __D5test21MG256i+336(%ecx), %eax
addl __D5test21MG256i+340(%ecx), %eax
addl __D5test21MG256i+344(%ecx), %eax
addl __D5test21MG256i+348(%ecx), %eax
addl __D5test21MG256i+352(%ecx), %eax
addl __D5test21MG256i+356(%ecx), %eax
addl $36, %ecx
jne LBB0_1
ret
__Dmain:
movl 4(%esp), %eax
movl %eax, __D5test21MG256i
xorl %eax, %eax
movl $-324, %ecx
.align 16, 0x90
LBB1_1:
addl __D5test21MG256i+324(%ecx), %eax
addl __D5test21MG256i+328(%ecx), %eax
addl __D5test21MG256i+332(%ecx), %eax
addl __D5test21MG256i+336(%ecx), %eax
addl __D5test21MG256i+340(%ecx), %eax
addl __D5test21MG256i+344(%ecx), %eax
addl __D5test21MG256i+348(%ecx), %eax
addl __D5test21MG256i+352(%ecx), %eax
addl __D5test21MG256i+356(%ecx), %eax
addl $36, %ecx
jne LBB1_1
ret
C code, with N global variable:
unsigned int N = 9;
int M [16 * 16];
int bar() {
int tot = 0;
int i, j;
for (i = 0; i < N; i++)
for (j = 0; j < N; j++)
tot += M[i * N + j];
return tot;
}
int main(int argc, char** argv) {
N = 9;
M[0] = argc;
return bar();
}
gcc 4.8.0 with -Ofast -S gives:
_bar:
pushl %edi
pushl %esi
pushl %ebx
movl _N, %ebx
testl %ebx, %ebx
je L6
xorl %esi, %esi
xorl %edi, %edi
xorl %eax, %eax
L3:
imull %ebx, %esi
xorl %ecx, %ecx
xorl %edx, %edx
.p2align 4,,7
L5:
addl %esi, %ecx
addl $1, %edx
addl _M(,%ecx,4), %eax
cmpl %ebx, %edx
movl %edx, %ecx
jne L5
addl $1, %edi
cmpl %ebx, %edi
movl %edi, %esi
jne L3
popl %ebx
popl %esi
popl %edi
ret
L6:
popl %ebx
xorl %eax, %eax
popl %esi
popl %edi
ret
_main:
pushl %ebp
movl %esp, %ebp
pushl %esi
movl 8(%ebp), %esi
pushl %ebx
movl $_M+36, %ebx
andl $-16, %esp
call ___main
xorl %eax, %eax
xorl %ecx, %ecx
movl $9, _N
xorl %edx, %edx
movl %esi, _M
jmp L11
.p2align 4,,7
L13:
movl (%ebx), %esi
addl $36, %ebx
L11:
leal (%edx,%edx,8), %edx
addl $1, %ecx
addl %esi, %eax
addl _M+4(,%edx,4), %eax
addl _M+8(,%edx,4), %eax
addl _M+12(,%edx,4), %eax
addl _M+16(,%edx,4), %eax
addl _M+20(,%edx,4), %eax
addl _M+24(,%edx,4), %eax
addl _M+28(,%edx,4), %eax
addl _M+32(,%edx,4), %eax
cmpl $9, %ecx
movl %ecx, %edx
jne L13
leal -8(%ebp), %esp
popl %ebx
popl %esi
popl %ebp
ret
As you see the asm of the function bar is comparable, but in the
main the inlined bar is more efficient.
I have seen that the performance difference in my translation is
removed if I replace a global variable with a global constant.
But I don't know know if the asm code I've shown in this post is
really showing the problem that caused the slowdown in the D
version of the code.
Is ldc2 able to perform LTO with just a handy compilation switch
(like -flto of GCC)?
Bye,
bearophile
More information about the digitalmars-d-ldc
mailing list