Misc questions:- licensing, VC++ IDE compatible, GPGPU, LTCG, QT, SDL
bearophile
bearophileHUGS at lycos.com
Tue May 18 12:54:56 PDT 2010
- Previous message: Misc questions:- licensing, VC++ IDE compatible, GPGPU, LTCG, QT, SDL
- Next message: Misc questions:- licensing, VC++ IDE compatible, GPGPU, LTCG, QT, SDL
- Messages sorted by:
[ date ]
[ thread ]
[ subject ]
[ author ]
%u:
> One issue I have with the Visual C++ compiler is that it doesn't seem to support
> loop unswitching (i.e. doubling up code with boolean If statements). I wonder if
> one of the D compilers supports it. I started a thread over at cprogramming
> about it here: http://cboard.cprogramming.com/c-programming/126756-lack-compiler-loop-optimization-loop-unswitching.html
In LDC (LLVM) this optimization pass is named -loop-unswitch, and it is enabled by default at -O3 and higher.
--------------------------
Your C++ code cleaned up a bit:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
// Benchmark kernel used to check for loop unswitching.
// The condition `b` never changes inside the loop, so an optimizer may hoist
// the branch and emit two separate loops (one with the sin() call, one without).
//
// b: when true, u is reassigned to sin(n) on every iteration; when false,
//    u keeps its initial value 0.0 for the whole loop.
// Returns the accumulated sum d.
double test(bool b) {
double d = 0.0;
double u = 0.0;
for (int n = 0; n < 1000000000; n++) {
// d is updated before u, so each iteration adds the u assigned on the
// previous iteration; the u assigned on the last iteration is never added.
d += u;
if (b)
u = sin((double)n);
}
return d;
}
// Driver: calls test() once and prints the result with printf("%f").
int main() {
// b is derived from atoi("1") instead of being written as a literal;
// presumably so the compiler cannot treat b as a compile-time constant
// (the post does not state this intent).
bool b = (bool)atoi("1");
printf("%f\n", test(b));
}
The asm generated for just the test() function:
g++ -O3 -S
# test(bool) as emitted by g++ -O3 (32-bit x86, AT&T syntax, x87 floating point).
# The check of b is done once at entry and selects one of two loops:
#   L3     - b == 0: loop without any call to sin
#   L5/L9  - b != 0: loop that calls _sin every iteration
# Both paths leave the result in %st(0) before ret.
__Z4testb:
pushl %ebp
movl %esp, %ebp
# %ebx is saved here because the L2 path uses it as the loop counter across the call.
pushl %ebx
subl $36, %esp
# 8(%ebp) is the first stack argument, i.e. b.
cmpb $0, 8(%ebp)
jne L2
# --- b == 0 path: two copies of 0.0 on the x87 stack, %eax counts down from 1e9 ---
fldz
movl $1000000000, %eax
fld %st(0)
.p2align 4,,7
L3:
subl $1, %eax
# st(0) += st(1); corresponds to d += u with u still 0.0.
fadd %st(1), %st
# jne tests the zero flag left by subl (the fadd in between is an x87 instruction).
jne L3
# Overwrite st(1) with st(0) and pop: only the sum remains, in st(0).
fstp %st(1)
addl $36, %esp
popl %ebx
popl %ebp
ret
.p2align 4,,7
# --- b != 0 path: %ebx counts n upward from 0 ---
L2:
fldz
xorl %ebx, %ebx
fld %st(0)
jmp L5
.p2align 4,,7
L9:
# On the back edge st(0) holds the reloaded sum and st(1) the latest sin result;
# swap them so the faddp below pops the sin result and keeps the sum.
fxch %st(1)
L5:
# st(1) += st(0), then pop: corresponds to d += u.
faddp %st, %st(1)
# Spill n to memory so it can be loaded as a floating-point value by fildl.
movl %ebx, -12(%ebp)
addl $1, %ebx
fildl -12(%ebp)
# Store (double)n at the top of the stack frame as the argument to _sin.
fstpl (%esp)
# Save the running sum in the frame across the call.
fstpl -24(%ebp)
call _sin
cmpl $1000000000, %ebx
# Reload the running sum; the value returned by _sin is now in st(1).
fldl -24(%ebp)
# jne tests the flags set by cmpl above.
jne L9
# Drop the last sin result (never added to the sum) and leave the sum in st(0).
fstp %st(1)
addl $36, %esp
popl %ebx
popl %ebp
ret
-------------------
More aggressive compilation:
g++ -O3 -s -fomit-frame-pointer -msse3 -march=native -ffast-math -S
# test(bool) as emitted by g++ with the more aggressive flags listed above
# (no frame pointer, -ffast-math). The check of b is again done once at entry:
#   L3 - b == 0: integer countdown only, then 0.0 is returned
#   L5 - b != 0: loop using the inline x87 fsin instruction (no call)
__Z4testb:
subl $4, %esp
# After the 4-byte adjustment above, b (first stack argument) is at 8(%esp).
cmpb $0, 8(%esp)
jne L2
movl $1000000000, %eax
.p2align 4,,10
L3:
# The loop body contains no floating-point work at all; only the counter remains.
decl %eax
jne L3
# Result for the b == 0 path: 0.0 in st(0).
fldz
addl $4, %esp
ret
.p2align 4,,10
L2:
# Two copies of 0.0 on the x87 stack; %eax is n, counting up from 0.
fldz
xorl %eax, %eax
fld %st(0)
.p2align 4,,10
L5:
# Spill n so fildl can load it as a floating-point value.
movl %eax, (%esp)
# st(1) += st(0), then pop: corresponds to d += u.
faddp %st, %st(1)
incl %eax
fildl (%esp)
cmpl $1000000000, %eax
# st(0) = sin(st(0)), computed in place; the sum stays in st(1).
fsin
# jne tests the flags set by cmpl above.
jne L5
# Pop the last sin result (never added to the sum); the sum is left in st(0).
fstp %st(0)
addl $4, %esp
ret
--------------------------
This is a D1 translation:
import tango.math.Math: sin;
import tango.stdc.stdio: printf;
import tango.stdc.stdlib: atoi;
// D1 translation of the C++ benchmark kernel for loop unswitching.
// The condition `b` never changes inside the loop, so an optimizer may hoist
// the branch and emit two separate loops.
//
// b: when true, u is reassigned to sin(n) on every iteration; when false,
//    u keeps its initial value 0.0 for the whole loop.
// Returns the accumulated sum d.
double test(bool b) {
double d = 0.0;
double u = 0.0;
// n has no explicit initializer; D default-initializes an int to 0.
for (int n; n < 1_000_000_000; n++) {
// d is updated before u, so each iteration adds the u assigned on the
// previous iteration; the u assigned on the last iteration is never added.
d += u;
if (b)
u = sin(cast(double)n);
}
return d;
}
// Driver: calls test() once and prints the result with printf("%f").
void main() {
// b is derived from atoi("1") instead of being written as a literal;
// presumably so the compiler cannot treat b as a compile-time constant
// (the post does not state this intent).
bool b = cast(bool)atoi("1");
printf("%f\n", test(b));
}
Compiled with:
ldc -O3 -release -inline test.d
Asm produced, note the je .LBB1_4 near the top:
# test(bool) as emitted by ldc -O3 -release -inline (32-bit x86, AT&T syntax).
# The check of b is done once at entry and selects one of two loops:
#   .LBB1_2 - b set:   loop that calls sinl every iteration
#   .LBB1_5 - b clear: integer countdown only
# Both paths join at .LBB1_3, which returns the double at 24(%esp) in st(0).
# Stack slots used by the sin loop (all doubles):
#   16(%esp) = u, 24(%esp) = d, 32(%esp) = n kept as a double.
_D5test54testFbZd:
pushl %esi
subl $64, %esp
# b arrives in %al: it is read here before anything in this function writes %eax.
testb $1, %al
je .LBB1_4
# Zero-initialize n, d and u in their stack slots; %esi counts down from 1e9.
pxor %xmm0, %xmm0
movsd %xmm0, 32(%esp)
movl $1000000000, %esi
movsd %xmm0, 24(%esp)
movsd %xmm0, 16(%esp)
.align 16
.LBB1_2:
# Move n from its SSE slot onto the x87 stack via memory, then store it as an
# 80-bit extended value at (%esp) as the argument to sinl.
movsd 32(%esp), %xmm0
movsd %xmm0, 56(%esp)
fldl 56(%esp)
fstpt (%esp)
call sinl
# Store the x87 result as a double at 48(%esp).
fstpl 48(%esp)
# d += u, using the u from the previous iteration.
movsd 24(%esp), %xmm1
addsd 16(%esp), %xmm1
movsd %xmm1, 24(%esp)
decl %esi
# Advance n by the constant at .LCPI1_0 (not shown in this listing; presumably 1.0).
movsd 32(%esp), %xmm0
addsd .LCPI1_0, %xmm0
movsd %xmm0, 32(%esp)
# u = the sin result just computed.
movsd 48(%esp), %xmm0
movsd %xmm0, 16(%esp)
##FP_REG_KILL
# jne tests the zero flag left by decl %esi above.
jne .LBB1_2
.LBB1_3:
# Common exit: move d from its stack slot to st(0) via memory.
movsd 24(%esp), %xmm0
movsd %xmm0, 40(%esp)
fldl 40(%esp)
addl $64, %esp
popl %esi
ret
.LBB1_4:
# b clear: the loop body contains no floating-point work, only the counter.
movl $1000000000, %eax
.align 16
.LBB1_5:
decl %eax
jne .LBB1_5
# Result for this path is 0.0, stored where the common exit expects d.
pxor %xmm0, %xmm0
movsd %xmm0, 24(%esp)
jmp .LBB1_3
This runs in about 86 seconds.
--------------------------
Aggressive compilation with LDC:
ldc -O3 -release -inline -enable-unsafe-fp-math -unroll-allow-partial test.d
# test(bool) as emitted by ldc with the aggressive flags listed above
# (32-bit x86, AT&T syntax). The check of b is done once at entry:
#   .LBB1_2 - b set:   partially unrolled loop using inline fsin (no call)
#   .LBB1_5 - b clear: integer counter only
# Both paths join at .LBB1_3, which returns %xmm1 in st(0).
# Register roles in the sin loop:
#   %eax  = n as an integer, %xmm0 = n as a double,
#   %xmm1 = running sum, %xmm2 = sin value not yet added to the sum.
_D5test54testFbZd:
subl $92, %esp
# b arrives in %al: it is read here before anything in this function writes %eax.
testb $1, %al
je .LBB1_4
pxor %xmm0, %xmm0
xorl %eax, %eax
movapd %xmm0, %xmm1
movapd %xmm0, %xmm2
.align 16
.LBB1_2:
# Convert n+1 .. n+4 to double and spill them to 40/48/56/64(%esp).
leal 1(%eax), %ecx
cvtsi2sd %ecx, %xmm3
movsd %xmm3, 40(%esp)
leal 2(%eax), %ecx
cvtsi2sd %ecx, %xmm3
movsd %xmm3, 48(%esp)
leal 3(%eax), %ecx
cvtsi2sd %ecx, %xmm3
movsd %xmm3, 56(%esp)
leal 4(%eax), %ecx
cvtsi2sd %ecx, %xmm3
movsd %xmm3, 64(%esp)
# Five fsin computations per trip, each going SSE -> memory -> x87 -> memory:
# sin(n) -> 72(%esp)
movsd %xmm0, 80(%esp)
fldl 80(%esp)
fsin
fstpl 72(%esp)
# sin(n+1) -> 8(%esp)
fldl 40(%esp)
fsin
fstpl 8(%esp)
# sin(n+2) -> 16(%esp)
fldl 48(%esp)
fsin
fstpl 16(%esp)
# sin(n+3) -> 24(%esp)
fldl 56(%esp)
fsin
fstpl 24(%esp)
# sin(n+4) -> 32(%esp)
fldl 64(%esp)
fsin
fstpl 32(%esp)
# Accumulate: pending value + running sum + sin(n) + sin(n+1) + sin(n+2) + sin(n+3),
# with the final total placed in %xmm1.
addsd %xmm1, %xmm2
addsd 72(%esp), %xmm2
addsd 8(%esp), %xmm2
addsd 16(%esp), %xmm2
movapd %xmm2, %xmm1
addsd 24(%esp), %xmm1
# The integer counter advances by 5 per trip.
addl $5, %eax
cmpl $1000000000, %eax
# Advance the double copy of n by the constant at .LCPI1_0
# (not shown in this listing; presumably 5.0 to match the addl above).
addsd .LCPI1_0, %xmm0
# sin(n+4) is carried into the next trip as the pending value; after the last
# trip it is never added to the sum.
movsd 32(%esp), %xmm2
##FP_REG_KILL
# jne tests the flags set by cmpl above.
jne .LBB1_2
.LBB1_3:
# Common exit: move the sum from %xmm1 to st(0) via memory.
movsd %xmm1, (%esp)
fldl (%esp)
addl $92, %esp
ret
.LBB1_4:
# b clear: only the counter remains, and it advances by 10 per trip.
xorl %eax, %eax
.align 16
.LBB1_5:
addl $10, %eax
cmpl $1000000000, %eax
jne .LBB1_5
# Result for this path is 0.0, placed where the common exit expects the sum.
pxor %xmm1, %xmm1
jmp .LBB1_3
This runs in about 58 seconds. Note also that the loop is partially unrolled: each trip through the sin loop performs five fsin computations and advances the counter by 5.
Here both G++ and LDC are performing loop unswitching.
Bye,
bearophile
- Previous message: Misc questions:- licensing, VC++ IDE compatible, GPGPU, LTCG, QT, SDL
- Next message: Misc questions:- licensing, VC++ IDE compatible, GPGPU, LTCG, QT, SDL
- Messages sorted by:
[ date ]
[ thread ]
[ subject ]
[ author ]
More information about the Digitalmars-d
mailing list