One document about Go

Tue Jun 8 12:05:40 PDT 2010

I have tested dmd v2.047beta and it works. I have seen that this bug fix improves the performance of my code:
http://d.puremagic.com/issues/show_bug.cgi?id=2008

------------------

Walter Bright:

>it would be fair to provide examples when saying things like LLVM does a better job on X.<

This is a D2 program. I compile it with dmd v2.047beta, and the total() function doesn't get inlined:

import std.c.stdio: printf;

// Returns the sum of all elements of `data`.
// An empty slice yields 0, because the loop body never runs and `res`
// keeps its initial value.
int total(int[] data) {
    // No explicit initializer: D default-initializes an int to 0.
    int res;
    // Visit every element in order, accumulating into res.
    foreach (x; data)
        res += x;
    return res;
}

// Test driver: sums a fixed ten-element array and prints the result.
void main() {
    // Declared with `enum`, i.e. as a compile-time constant; the elements
    // add up to 45.
    enum int[] data = [7, 6, 5, 9, 8, 4, 3, 1, 2, 0];
    // Print the sum as a decimal integer followed by a newline.
    printf("%d\n", total(data));
}

asm generated by DMD, -O -release -inline (cleaned up a little):

; DMD output for total(int[] data): sums the array and returns it in EAX.
; Stack layout after the two pushes below:
;   [ESP]     saved EBX
;   4[ESP]    scratch slot (created by "push EAX")
;   8[ESP]    return address
;   0Ch[ESP]  array length
;   010h[ESP] array pointer
; ECX holds the running sum, EDX the element index, EBX the array base.
; The callee removes the 8 bytes of arguments itself ("ret 8").
_D4test5totalFAiZi	comdat
		; push EAX only reserves the 4-byte scratch slot; the pushed value is never read
		push	EAX
		; ECX = sum = 0, EDX = 0
		xor	ECX,ECX
		xor	EDX,EDX
		push	EBX
		; length == 0 ? skip the loop and return 0
		cmp	0Ch[ESP],ECX
		je	L28
		; the zero in EDX is spilled to the scratch slot and reloaded below
		; as the initial index, because EDX is used to load the pointer
		mov	4[ESP],EDX
		mov	EDX,010h[ESP]
		mov	EBX,EDX
		; EAX = length, but EAX is not read again before being overwritten at L28
		mov	EAX,0Ch[ESP]
		mov	EDX,4[ESP]
		; loop: sum += base[index]; the length is re-read from the stack each iteration
L1E:		add	ECX,[EDX*4][EBX]
		inc	EDX
		cmp	EDX,0Ch[ESP]
		jb	L1E
		; epilogue: restore EBX, return the sum in EAX, drop the scratch slot
L28:		pop	EBX
		mov	EAX,ECX
		pop	ECX
		ret	8

; DMD output for main(): builds the array at run time, calls total()
; out of line (it is not inlined), then prints the result with printf.
__Dmain	comdat
		; two pushes reserve 8 bytes of stack, released by "add ESP,8" before ret
L0:		push	EAX
		push	EAX
		; EAX = address of a TypeInfo symbol (name suggests TypeInfo for int[] -- assumption from the mangled name)
		mov	EAX,offset FLAT:_D11TypeInfo_Ai6__initZ
		push	EBX
		; the ten array elements are pushed in reverse order (last element first)
		push	0
		push	2
		push	1
		push	3
		push	4
		push	8
		push	9
		push	5
		push	6
		push	7
		; element count (10), then the TypeInfo address
		push	0Ah
		push	EAX
		; runtime helper that materializes the array literal; result comes back in EAX
		call	near ptr __d_arrayliteralT
		; caller pops the 12 dwords (030h bytes) pushed for the helper
		add	ESP,030h
		; arguments for total(): array pointer first, then length 10
		mov	ECX,EAX
		push	ECX
		mov	EBX,0Ah
		push	EBX
		; total() cleans up its own arguments (it ends in "ret 8")
		call	near ptr _D4test5totalFAiZi
		; printf arguments: EAX = sum, EDX = address in _DATA (presumably the "%d\n" format string)
		mov	EDX,offset FLAT:_DATA
		push	EAX
		push	EDX
		call	near ptr _printf
		; caller pops printf's two arguments
		add	ESP,8
		; return 0, restore EBX, release the 8 bytes reserved at entry
		xor	EAX,EAX
		pop	EBX
		add	ESP,8
		ret

------------------

This is the same program translated to D1 for Tango:

import tango.stdc.stdio: printf;

// Returns the sum of all elements of `data`.
// An empty array yields 0, because the loop body never runs and `res`
// keeps its initial value.
int total(int[] data) {
    // No explicit initializer: D default-initializes an int to 0.
    int res;
    // Visit every element in order, accumulating into res.
    foreach (x; data)
        res += x;
    return res;
}

// Test driver: sums a fixed ten-element array and prints the result.
void main() {
    // Constant array whose elements add up to 45.
    const int[] data = [7, 6, 5, 9, 8, 4, 3, 1, 2, 0];
    // Print the sum as a decimal integer followed by a newline.
    printf("%d\n", total(data));
}

The asm generated by LDC, -O3 -release -inline:

# LDC output for total(int[] data): sums the array and returns it in %eax.
# After "pushl %esi": 8(%esp) = array length, 12(%esp) = array pointer.
# %ecx = length, %edx = base pointer, %esi = index, %eax = running sum.
# The callee removes the 8 bytes of arguments itself ("ret $8").
_D4temp5totalFAiZi:
	pushl	%esi
	# load the length once into a register; empty array -> return 0
	movl	8(%esp), %ecx
	testl	%ecx, %ecx
	je	.LBB1_4
	movl	12(%esp), %edx
	# sum = 0, index = 0
	xorl	%eax, %eax
	movl	%eax, %esi
	.align	16
	# loop: sum += base[index]; runs until index == length
.LBB1_2:
	addl	(%edx,%esi,4), %eax
	incl	%esi
	cmpl	%ecx, %esi
	jne	.LBB1_2
	# common exit: restore %esi and return
.LBB1_3:
	popl	%esi
	ret	$8
	# empty-array path: result is 0
.LBB1_4:
	xorl	%eax, %eax
	jmp	.LBB1_3

	.type	.constarray,@object
	.data
	.align	16
.constarray:
	.long	7
	.long	6
	.long	5
	.long	9
	.long	8
	.long	4
	.long	3
	.long	1
	.long	2
	.zero	4
	.size	.constarray, 40

# LDC output for main(): total() has been inlined and its loop fully
# unrolled into ten loads/adds from .constarray. The sum is still computed
# at run time rather than folded into a single constant.
_Dmain:
	# reserve 12 bytes of stack; printf's two arguments are stored at (%esp) and 4(%esp)
	subl	$12, %esp
	# %eax = element 1, then add the remaining nine elements one by one
	movl	.constarray+4, %eax
	addl	.constarray, %eax
	addl	.constarray+8, %eax
	addl	.constarray+12, %eax
	addl	.constarray+16, %eax
	addl	.constarray+20, %eax
	addl	.constarray+24, %eax
	addl	.constarray+28, %eax
	addl	.constarray+32, %eax
	addl	.constarray+36, %eax
	# printf arguments: the sum, and the address .str (presumably the "%d\n" format string)
	movl	%eax, 4(%esp)
	movl	$.str, (%esp)
	call	printf
	# return 0 and release the stack reserved at entry
	xorl	%eax, %eax
	addl	$12, %esp
	ret	$8

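You can see that LDC inlines total() and, in this case, also unrolls the loop because the array is known at compile time. But it doesn't perform the final optimization: the ten constants are still added at run time instead of being folded into a single value at compile time.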

---------------------------

If I use link-time optimization with LDC, it optimizes the code better; this is the disassembly of main:

08049620 <_Dmain>:
 8049620:	83 ec 0c             	sub    $0xc,%esp
 8049623:	c7 44 24 04 2d 00 00 	movl   $0x2d,0x4(%esp)
 804962a:	00 
 804962b:	c7 04 24 48 35 06 08 	movl   $0x8063548,(%esp)
 8049632:	e8 85 fd ff ff       	call   80493bc <printf@plt>
 8049637:	31 c0                	xor    %eax,%eax
 8049639:	83 c4 0c             	add    $0xc,%esp
 804963c:	c2 08 00             	ret    $0x8
 804963f:	90                   	nop    

That movl $0x2d,0x4(%esp) is the result, 45 in base 10, fully computed.

Bye,
bearophile