floor operation problem
bearophile
bearophileHUGS at lycos.com
Wed Dec 18 16:15:27 PST 2013
While debugging a performance problem, I found that the cause
is the floor operation. Below is a small benchmark that
demonstrates it.
I have compiled the code with:
gcc -Ofast -std=c99 -s -flto -mfpmath=sse -ffast-math -msse3
test1.c -o test1
ldmd2 -O -release -inline -noboundscheck test2.d
ldmd2 -O -release -inline -noboundscheck test3.d
32 bit system
gcc version 4.8.0
LDC 0.12.1 (based on DMD v2.063.2 and LLVM 3.3.1)
Run-time, seconds:
test1.c: 1.01
test2.d: 4.14
test3.d: 6.62
---------------------
// test1.c
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
static inline float foo(const float x, const float y) {
return floorf(x) + floorf(y);
}
int main() {
float total = 0.0f;
for (int i = 0; i < 1000; i++)
for (int y = 0; y < 256; y++)
for (int x = 0; x < 256; x++)
total += foo(x * 0.1f, y * 0.1f);
printf("%f\n", total);
return 0;
}
---------------------
// test2.d
import core.stdc.stdio, core.stdc.math;
float foo(const float x, const float y) nothrow {
return floorf(x) + floorf(y);
}
int main() {
float total = 0.0f;
for (int i = 0; i < 1000; i++)
for (int y = 0; y < 256; y++)
for (int x = 0; x < 256; x++)
total += foo(x * 0.1f, y * 0.1f);
printf("%f\n", total);
return 0;
}
---------------------
// test3.d
import core.stdc.stdio, std.math;
float foo(const float x, const float y) nothrow {
return floor(x) + floor(y);
}
int main() {
float total = 0.0f;
for (int i = 0; i < 1000; i++)
for (int y = 0; y < 256; y++)
for (int x = 0; x < 256; x++)
total += foo(x * 0.1f, y * 0.1f);
printf("%f\n", total);
return 0;
}
---------------------
test1.c asm:
_main:
pushl %ebp
movl %esp, %ebp
pushl %ebx
movl $1000, %ebx
andl $-16, %esp
subl $16, %esp
call ___main
xorps %xmm1, %xmm1
movss LC3, %xmm5
L2:
movss LC1, %xmm6
xorps %xmm3, %xmm3
xorl %ecx, %ecx
.p2align 4,,7
L9:
movss LC2, %xmm4
xorps %xmm0, %xmm0
xorl %eax, %eax
.p2align 4,,7
L7:
addss %xmm0, %xmm1
addl $1, %eax
cmpl $256, %eax
addss %xmm3, %xmm1
je L12
cvtsi2ss %eax, %xmm0
mulss %xmm6, %xmm0
movaps %xmm0, %xmm2
andps %xmm5, %xmm2
ucomiss %xmm2, %xmm4
jbe L7
cvttss2si %xmm0, %edx
cvtsi2ss %edx, %xmm2
movaps %xmm2, %xmm7
cmpnless %xmm0, %xmm7
movaps %xmm7, %xmm0
movss LC4, %xmm7
andps %xmm7, %xmm0
subss %xmm0, %xmm2
movaps %xmm2, %xmm0
jmp L7
.p2align 4,,7
L12:
addl $1, %ecx
cmpl $256, %ecx
je L5
cvtsi2ss %ecx, %xmm3
movss LC6, %xmm0
movss LC2, %xmm2
mulss LC1, %xmm3
andps %xmm3, %xmm0
ucomiss %xmm0, %xmm2
jbe L9
cvttss2si %xmm3, %eax
cvtsi2ss %eax, %xmm0
movaps %xmm0, %xmm2
cmpnless %xmm3, %xmm2
movss LC4, %xmm3
andps %xmm3, %xmm2
movaps %xmm0, %xmm3
subss %xmm2, %xmm3
jmp L9
L5:
subl $1, %ebx
jne L2
unpcklps %xmm1, %xmm1
movl $LC5, (%esp)
cvtps2pd %xmm1, %xmm5
movsd %xmm5, 4(%esp)
call _printf
xorl %eax, %eax
movl -4(%ebp), %ebx
leave
ret
---------------------
test2.d asm:
__Dmain:
pushl %ebx
pushl %edi
pushl %esi
subl $28, %esp
xorps %xmm0, %xmm0
xorl %esi, %esi
movss LCPI1_0, %xmm1
.align 16, 0x90
LBB1_1:
xorl %edi, %edi
.align 16, 0x90
LBB1_2:
xorps %xmm2, %xmm2
cvtsi2ssl %edi, %xmm2
mulss %xmm1, %xmm2
movss %xmm2, 12(%esp)
xorl %ebx, %ebx
.align 16, 0x90
LBB1_3:
movss %xmm0, 16(%esp)
xorps %xmm0, %xmm0
cvtsi2ssl %ebx, %xmm0
mulss %xmm1, %xmm0
movss %xmm0, (%esp)
calll _floorf
movss 12(%esp), %xmm0
movss %xmm0, (%esp)
fstps 24(%esp)
calll _floorf
movss LCPI1_0, %xmm1
fstps 20(%esp)
movss 24(%esp), %xmm0
addss 20(%esp), %xmm0
movss 16(%esp), %xmm2
addss %xmm0, %xmm2
movss %xmm2, 16(%esp)
movss 16(%esp), %xmm0
incl %ebx
cmpl $256, %ebx
jne LBB1_3
incl %edi
cmpl $256, %edi
jne LBB1_2
incl %esi
cmpl $1000, %esi
jne LBB1_1
cvtss2sd %xmm0, %xmm0
movsd %xmm0, 4(%esp)
movl $_.str, (%esp)
calll ___mingw_printf
xorl %eax, %eax
addl $28, %esp
popl %esi
popl %edi
popl %ebx
ret
---------------------
test3.d asm:
__Dmain:
pushl %ebx
pushl %edi
pushl %esi
subl $52, %esp
xorps %xmm1, %xmm1
xorl %esi, %esi
movss LCPI1_0, %xmm2
.align 16, 0x90
LBB1_1:
xorl %edi, %edi
.align 16, 0x90
LBB1_2:
xorps %xmm0, %xmm0
cvtsi2ssl %edi, %xmm0
mulss %xmm2, %xmm0
movss %xmm0, 48(%esp)
xorl %ebx, %ebx
flds 48(%esp)
fstpt 12(%esp)
movaps %xmm1, %xmm0
.align 16, 0x90
LBB1_3:
movss %xmm0, 36(%esp)
xorps %xmm0, %xmm0
cvtsi2ssl %ebx, %xmm0
mulss %xmm2, %xmm0
movss %xmm0, 44(%esp)
flds 44(%esp)
fstpt (%esp)
calll __D3std4math5floorFNbNeeZe
subl $12, %esp
fstpt 24(%esp)
fldt 12(%esp)
fstpt (%esp)
calll __D3std4math5floorFNbNeeZe
subl $12, %esp
movss 36(%esp), %xmm0
movss LCPI1_0, %xmm2
fldt 24(%esp)
faddp %st(1)
fstps 40(%esp)
addss 40(%esp), %xmm0
incl %ebx
cmpl $256, %ebx
jne LBB1_3
movaps %xmm0, %xmm1
incl %edi
cmpl $256, %edi
jne LBB1_2
incl %esi
cmpl $1000, %esi
jne LBB1_1
xorps %xmm0, %xmm0
cvtss2sd %xmm1, %xmm0
movsd %xmm0, 4(%esp)
movl $_.str, (%esp)
calll ___mingw_printf
xorl %eax, %eax
addl $52, %esp
popl %esi
popl %edi
popl %ebx
ret
---------------------
Bye,
bearophile
More information about the digitalmars-d-ldc
mailing list