A small problem

Sun Dec 1 15:14:08 PST 2013

I have found a small performance problem porting some C code to D 
and compiling it with LDC2. This post shows a reduced version of 
the D code. In this post I am not using LTO (link time 
optimization).

This version uses a private module-level N:

private __gshared private uint N = 9;
__gshared int[16 * 16] M;

int bar() nothrow {
     int tot = 0;
     foreach (immutable i; 0 .. N)
         foreach (immutable j; 0 .. N)
             tot += M[i * N + j];
     return tot;
}

int main(in string[] args) nothrow {
     N = 9;
     M[0] = args.length;
     return bar();
}

The asm of the two functions:

__D5test13barFNbZi:
     pushl   %ebp
     pushl   %ebx
     pushl   %edi
     pushl   %esi
     xorl    %eax, %eax
     movl    __D5test11Nk, %ecx
     testl   %ecx, %ecx
     je  LBB0_5
     leal    (,%ecx,4), %edx
     xorl    %eax, %eax
     movl    $__D5test11MG256i, %esi
     xorl    %edi, %edi
     .align  16, 0x90
LBB0_4:
     movl    %esi, %ebx
     movl    %ecx, %ebp
     .align  16, 0x90
LBB0_2:
     addl    (%ebx), %eax
     addl    $4, %ebx
     decl    %ebp
     jne LBB0_2
     addl    %edx, %esi
     incl    %edi
     cmpl    %ecx, %edi
     jne LBB0_4
LBB0_5:
     popl    %esi
     popl    %edi
     popl    %ebx
     popl    %ebp
     ret

__Dmain:
     pushl   %edi
     pushl   %esi
     movl    $9, __D5test11Nk
     movl    12(%esp), %eax
     movl    %eax, __D5test11MG256i
     xorl    %eax, %eax
     movl    $__D5test11MG256i, %ecx
     xorl    %edx, %edx
     .align  16, 0x90
LBB1_3:
     movl    $9, %esi
     movl    %ecx, %edi
     .align  16, 0x90
LBB1_1:
     addl    (%edi), %eax
     addl    $4, %edi
     decl    %esi
     jne LBB1_1
     addl    $36, %ecx
     incl    %edx
     cmpl    $9, %edx
     jne LBB1_3
     popl    %esi
     popl    %edi
     ret

A modified program, now N is a compile-time constant:

enum uint N = 9;
__gshared int[16 * 16] M;

int bar() nothrow {
     int tot = 0;
     foreach (immutable i; 0 .. N)
         foreach (immutable j; 0 .. N)
             tot += M[i * N + j];
     return tot;
}

int main(in string[] args) nothrow {
     //N = 9;
     M[0] = args.length;
     return bar();
}

And the asm of the two functions:

__D5test23barFNbZi:
     xorl    %eax, %eax
     movl    $-324, %ecx
     .align  16, 0x90
LBB0_1:
     addl    __D5test21MG256i+324(%ecx), %eax
     addl    __D5test21MG256i+328(%ecx), %eax
     addl    __D5test21MG256i+332(%ecx), %eax
     addl    __D5test21MG256i+336(%ecx), %eax
     addl    __D5test21MG256i+340(%ecx), %eax
     addl    __D5test21MG256i+344(%ecx), %eax
     addl    __D5test21MG256i+348(%ecx), %eax
     addl    __D5test21MG256i+352(%ecx), %eax
     addl    __D5test21MG256i+356(%ecx), %eax
     addl    $36, %ecx
     jne LBB0_1
     ret

__Dmain:
     movl    4(%esp), %eax
     movl    %eax, __D5test21MG256i
     xorl    %eax, %eax
     movl    $-324, %ecx
     .align  16, 0x90
LBB1_1:
     addl    __D5test21MG256i+324(%ecx), %eax
     addl    __D5test21MG256i+328(%ecx), %eax
     addl    __D5test21MG256i+332(%ecx), %eax
     addl    __D5test21MG256i+336(%ecx), %eax
     addl    __D5test21MG256i+340(%ecx), %eax
     addl    __D5test21MG256i+344(%ecx), %eax
     addl    __D5test21MG256i+348(%ecx), %eax
     addl    __D5test21MG256i+352(%ecx), %eax
     addl    __D5test21MG256i+356(%ecx), %eax
     addl    $36, %ecx
     jne LBB1_1
     ret

C code, with N global variable:

unsigned int N = 9;
int M [16 * 16];

int bar() {
     int tot = 0;
     int i, j;
     for (i = 0; i < N; i++)
         for (j = 0; j < N; j++)
             tot += M[i * N + j];
     return tot;
}

int main(int argc, char** argv) {
     N = 9;
     M[0] = argc;
     return bar();
}

gcc 4.8.0 with -Ofast -S gives:

_bar:
	pushl	%edi
	pushl	%esi
	pushl	%ebx
	movl	_N, %ebx
	testl	%ebx, %ebx
	je	L6
	xorl	%esi, %esi
	xorl	%edi, %edi
	xorl	%eax, %eax
L3:
	imull	%ebx, %esi
	xorl	%ecx, %ecx
	xorl	%edx, %edx
	.p2align 4,,7
L5:
	addl	%esi, %ecx
	addl	$1, %edx
	addl	_M(,%ecx,4), %eax
	cmpl	%ebx, %edx
	movl	%edx, %ecx
	jne	L5
	addl	$1, %edi
	cmpl	%ebx, %edi
	movl	%edi, %esi
	jne	L3
	popl	%ebx
	popl	%esi
	popl	%edi
	ret
L6:
	popl	%ebx
	xorl	%eax, %eax
	popl	%esi
	popl	%edi
	ret

_main:
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%esi
	movl	8(%ebp), %esi
	pushl	%ebx
	movl	$_M+36, %ebx
	andl	$-16, %esp
	call	___main
	xorl	%eax, %eax
	xorl	%ecx, %ecx
	movl	$9, _N
	xorl	%edx, %edx
	movl	%esi, _M
	jmp	L11
	.p2align 4,,7
L13:
	movl	(%ebx), %esi
	addl	$36, %ebx
L11:
	leal	(%edx,%edx,8), %edx
	addl	$1, %ecx
	addl	%esi, %eax
	addl	_M+4(,%edx,4), %eax
	addl	_M+8(,%edx,4), %eax
	addl	_M+12(,%edx,4), %eax
	addl	_M+16(,%edx,4), %eax
	addl	_M+20(,%edx,4), %eax
	addl	_M+24(,%edx,4), %eax
	addl	_M+28(,%edx,4), %eax
	addl	_M+32(,%edx,4), %eax
	cmpl	$9, %ecx
	movl	%ecx, %edx
	jne	L13
	leal	-8(%ebp), %esp
	popl	%ebx
	popl	%esi
	popl	%ebp
	ret

As you see the asm of the function bar is comparable, but in the 
main the inlined bar is more efficient.

I have seen that the performance difference in my translation is 
removed if I replace a global variable with a global constant. 
But I don't know know if the asm code I've shown in this post is 
really showing the problem that caused the slowdown in the D 
version of the code.

Is ldc2 able to perform LTO with just a handy compilation switch 
(like -flto of GCC)?

Bye,
bearophile