Misc questions:- licensing, VC++ IDE compatible, GPGPU, LTCG, QT, SDL

Tue May 18 12:54:56 PDT 2010

%u:
> One issue I have with the Visual C++ compiler is that it doesn't seem to support
> loop unswitching (i.e. doubling up code with boolean If statements). I wonder if
> one of the D compilers supports it. I started a thread over at cprogramming
> about it here: http://cboard.cprogramming.com/c-programming/126756-lack-compiler-loop-optimization-loop-unswitching.html

In LDC (LLVM) this optimization is named -loop-unswitch, and it's enabled by default at -O3 and higher.

--------------------------

Your C++ code cleaned up a bit:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

// Benchmark kernel: adds u to d one billion times and returns d.
// When b is true, u is replaced by sin(n) after each addition; when b is
// false, u stays at 0.0. The value of b never changes inside the loop, so the
// `if (b)` test is loop-invariant -- the situation loop unswitching targets.
double test(bool b) {
	double d = 0.0;
	double u = 0.0;
	for (int n = 0; n < 1000000000; n++) {
		// Uses the u computed on the previous iteration (0.0 on the first).
		d += u;
		if (b)
		    u = sin((double)n);		
	}	
	return d;
}

// Driver: prints the result of test(b) with "%f".
int main() {
    // b comes from atoi("1") instead of a literal -- presumably so the
    // compiler cannot treat it as a compile-time constant.
    bool b = (bool)atoi("1");
    printf("%f\n", test(b));    
}

The asm generated for just the test() function:
g++ -O3 -S

# test(bool) in 32-bit x86 AT&T syntax, using the x87 FPU.
# b is tested once at the top; each outcome gets its own loop, and neither
# loop tests b again (loop unswitching).
__Z4testb:
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%ebx
	subl	$36, %esp
	# b is read from 8(%ebp); a non-zero value selects the loop at L2.
	cmpb	$0, 8(%ebp)
	jne	L2
	# b == false: push 0.0 twice and count %eax down from 1000000000.
	fldz
	movl	$1000000000, %eax
	fld	%st(0)
	.p2align 4,,7
L3:
	# Loop body with no test of b and no call to _sin.
	# fadd leaves the integer flags alone, so jne tests the result of subl.
	subl	$1, %eax
	fadd	%st(1), %st
	jne	L3
	# Keep st(0) as the only x87 value left for the return.
	fstp	%st(1)
	addl	$36, %esp
	popl	%ebx
	popl	%ebp
	ret
	.p2align 4,,7
L2:
	# b == true: push 0.0 twice and keep the loop counter in %ebx, from 0.
	fldz
	xorl	%ebx, %ebx
	fld	%st(0)
	jmp	L5
	.p2align 4,,7
L9:
	# Swap the two x87 values before the next faddp.
	fxch	%st(1)
L5:
	# Add the two x87 values, leaving the running sum in st(0).
	faddp	%st, %st(1)
	# Store the counter to memory so fildl can load it as a floating-point
	# value; fstpl (%esp) then places it as the argument for _sin.
	movl	%ebx, -12(%ebp)
	addl	$1, %ebx
	fildl	-12(%ebp)
	fstpl	(%esp)
	# The running sum is spilled to -24(%ebp) across the call and reloaded
	# afterwards.
	fstpl	-24(%ebp)
	call	_sin
	# fldl leaves the integer flags alone, so jne tests the result of cmpl.
	cmpl	$1000000000, %ebx
	fldl	-24(%ebp)
	jne	L9
	# Keep st(0) as the only x87 value left for the return.
	fstp	%st(1)
	addl	$36, %esp
	popl	%ebx
	popl	%ebp
	ret

-------------------

More aggressive compilation:
g++ -O3 -s -fomit-frame-pointer -msse3 -march=native -ffast-math -S

# test(bool) in 32-bit x86 AT&T syntax, using the x87 FPU, with no frame
# pointer. b is tested once at the top and each outcome has its own loop.
__Z4testb:
	subl	$4, %esp
	# b is addressed relative to %esp; a non-zero value selects L2.
	cmpb	$0, 8(%esp)
	jne	L2
	# b == false: no floating-point work remains in the loop -- it is an
	# empty countdown of %eax, after which 0.0 is loaded as the result.
	movl	$1000000000, %eax
	.p2align 4,,10
L3:
	decl	%eax
	jne	L3
	fldz
	addl	$4, %esp
	ret
	.p2align 4,,10
L2:
	# b == true: push 0.0 twice and count %eax up from 0.
	fldz
	xorl	%eax, %eax
	fld	%st(0)
	.p2align 4,,10
L5:
	# The counter is stored to (%esp) so fildl can load it as a
	# floating-point value.
	movl	%eax, (%esp)
	# Add the two x87 values, leaving the running sum in st(0).
	faddp	%st, %st(1)
	incl	%eax
	fildl	(%esp)
	cmpl	$1000000000, %eax
	# The sine is computed inline with fsin; there is no call to _sin here.
	# fsin leaves the integer flags alone, so jne tests the result of cmpl.
	fsin
	jne	L5
	# Pop the last fsin result, leaving the running sum in st(0).
	fstp	%st(0)
	addl	$4, %esp
	ret

--------------------------

This is a D1 translation:

import tango.math.Math: sin;
import tango.stdc.stdio: printf;
import tango.stdc.stdlib: atoi;

// Benchmark kernel: adds u to d one billion times and returns d.
// When b is true, u is replaced by sin(n) after each addition; when b is
// false, u stays at 0.0. b never changes inside the loop, so the `if (b)`
// test is loop-invariant -- the situation loop unswitching targets.
double test(bool b) {
    double d = 0.0;
    double u = 0.0;
    // n has no explicit initializer; D default-initializes an int to 0.
    for (int n; n < 1_000_000_000; n++) {
        // Uses the u computed on the previous iteration (0.0 on the first).
        d += u;
        if (b)
            u = sin(cast(double)n);
    }

    return d;
}

// Driver: prints the result of test(b) with "%f".
void main() {
    // b comes from atoi("1") instead of a literal -- presumably so the
    // compiler cannot treat it as a compile-time constant.
    bool b = cast(bool)atoi("1");
    printf("%f\n", test(b));    
}

Compiled with:
ldc -O3 -release -inline test.d
Asm produced; note the je .LBB1_4 near the top:

## test(bool) in 32-bit x86 AT&T syntax, using SSE2 scalar doubles plus x87.
## b is tested once at the top; each outcome has its own loop and neither
## loop tests b again (loop unswitching).
_D5test54testFbZd:
	pushl	%esi
	subl	$64, %esp
	## b is tested in %al; when its low bit is clear, control goes to the
	## loop at .LBB1_4, which contains no call to sinl.
	testb	$1, %al
	je	.LBB1_4
	## b == true: zero the stack slots at 32, 24 and 16(%esp) and set the
	## countdown in %esi.
	pxor	%xmm0, %xmm0
	movsd	%xmm0, 32(%esp)
	movl	$1000000000, %esi
	movsd	%xmm0, 24(%esp)
	movsd	%xmm0, 16(%esp)
	.align	16
.LBB1_2:
	## The double at 32(%esp) is copied to 56(%esp), loaded onto the x87
	## stack and stored with fstpt (80-bit extended) at (%esp) as the
	## argument for sinl; the result is stored as a double at 48(%esp).
	movsd	32(%esp), %xmm0
	movsd	%xmm0, 56(%esp)
	fldl	56(%esp)
	fstpt	(%esp)
	call	sinl
	fstpl	48(%esp)
	## 24(%esp) += 16(%esp): the running sum absorbs the previous result.
	movsd	24(%esp), %xmm1
	addsd	16(%esp), %xmm1
	movsd	%xmm1, 24(%esp)
	decl	%esi
	## The floating-point value at 32(%esp) is advanced by the constant
	## .LCPI1_0 (its definition is not part of this listing).
	movsd	32(%esp), %xmm0
	addsd	.LCPI1_0, %xmm0
	movsd	%xmm0, 32(%esp)
	## The sinl result becomes the value added on the next pass.
	movsd	48(%esp), %xmm0
	movsd	%xmm0, 16(%esp)
	##FP_REG_KILL
	## The SSE moves and adds leave the integer flags alone, so jne tests
	## the result of decl.
	jne	.LBB1_2
.LBB1_3:
	## Common exit: move the sum from 24(%esp) onto the x87 stack via
	## 40(%esp) before returning.
	movsd	24(%esp), %xmm0
	movsd	%xmm0, 40(%esp)
	fldl	40(%esp)
	addl	$64, %esp
	popl	%esi
	ret
.LBB1_4:
	## b == false: an empty countdown of %eax, after which 24(%esp) is
	## zeroed and control joins the common exit.
	movl	$1000000000, %eax
	.align	16
.LBB1_5:
	decl	%eax
	jne	.LBB1_5
	pxor	%xmm0, %xmm0
	movsd	%xmm0, 24(%esp)
	jmp	.LBB1_3

This runs in about 86 seconds.

--------------------------

Aggressive compilation with LDC:
ldc -O3 -release -inline -enable-unsafe-fp-math -unroll-allow-partial test.d

## test(bool) in 32-bit x86 AT&T syntax, using SSE2 scalar doubles plus x87
## fsin. b is tested once at the top; each outcome has its own loop.
## The loop for b == true handles five counter values per pass.
_D5test54testFbZd:
	subl	$92, %esp
	## b is tested in %al; when its low bit is clear, control goes to the
	## loop at .LBB1_4, which contains no fsin.
	testb	$1, %al
	je	.LBB1_4
	## b == true: %xmm0, %xmm1, %xmm2 start at 0.0 and %eax at 0.
	pxor	%xmm0, %xmm0
	xorl	%eax, %eax
	movapd	%xmm0, %xmm1
	movapd	%xmm0, %xmm2
	.align	16
.LBB1_2:
	## Convert %eax+1 .. %eax+4 to double and store them at 40, 48, 56 and
	## 64(%esp) as inputs for fsin.
	leal	1(%eax), %ecx
	cvtsi2sd	%ecx, %xmm3
	movsd	%xmm3, 40(%esp)
	leal	2(%eax), %ecx
	cvtsi2sd	%ecx, %xmm3
	movsd	%xmm3, 48(%esp)
	leal	3(%eax), %ecx
	cvtsi2sd	%ecx, %xmm3
	movsd	%xmm3, 56(%esp)
	leal	4(%eax), %ecx
	cvtsi2sd	%ecx, %xmm3
	movsd	%xmm3, 64(%esp)
	## The fifth input is the floating-point value kept in %xmm0.
	movsd	%xmm0, 80(%esp)
	## Five inline fsin evaluations; results go to 72, 8, 16, 24, 32(%esp).
	fldl	80(%esp)
	fsin
	fstpl	72(%esp)
	fldl	40(%esp)
	fsin
	fstpl	8(%esp)
	fldl	48(%esp)
	fsin
	fstpl	16(%esp)
	fldl	56(%esp)
	fsin
	fstpl	24(%esp)
	fldl	64(%esp)
	fsin
	fstpl	32(%esp)
	## Accumulate: the running sum (%xmm1) plus the value carried in %xmm2
	## from the previous pass, plus the first four results of this pass.
	## The new sum ends up in %xmm1.
	addsd	%xmm1, %xmm2
	addsd	72(%esp), %xmm2
	addsd	8(%esp), %xmm2
	addsd	16(%esp), %xmm2
	movapd	%xmm2, %xmm1
	addsd	24(%esp), %xmm1
	addl	$5, %eax
	cmpl	$1000000000, %eax
	## %xmm0 is advanced by the constant .LCPI1_0 (its definition is not
	## part of this listing).
	addsd	.LCPI1_0, %xmm0
	## The fifth result is carried in %xmm2 into the next pass.
	movsd	32(%esp), %xmm2
	##FP_REG_KILL
	## addsd and movsd leave the integer flags alone, so jne tests the
	## result of cmpl.
	jne	.LBB1_2
.LBB1_3:
	## Common exit: move the sum in %xmm1 onto the x87 stack via (%esp)
	## before returning.
	movsd	%xmm1, (%esp)
	fldl	(%esp)
	addl	$92, %esp
	ret
.LBB1_4:
	## b == false: an empty loop that advances %eax by 10 per pass, after
	## which %xmm1 is zeroed and control joins the common exit.
	xorl	%eax, %eax
	.align	16
.LBB1_5:
	addl	$10, %eax
	cmpl	$1000000000, %eax
	jne	.LBB1_5
	pxor	%xmm1, %xmm1
	jmp	.LBB1_3

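This runs in about 58 seconds. Note also that the loop is partially unrolled: the body is replicated four more times, so each pass handles five iterations (the counter advances by 5 and there are five fsin instructions).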

Here both G++ and LDC are performing loop unswitching.

Bye,
bearophile