# floor operation problem

bearophile bearophileHUGS at lycos.com
Wed Dec 18 16:15:27 PST 2013

```
While debugging a performance problem, I found that the cause is
the floor operation. Below is a small benchmark that shows it.

I have compiled the code with:

gcc -Ofast -std=c99 -s -flto -mfpmath=sse -ffast-math -msse3
test1.c -o test1
ldmd2 -O -release -inline -noboundscheck test2.d
ldmd2 -O -release -inline -noboundscheck test3.d

32 bit system

gcc version 4.8.0
LDC 0.12.1 (based on DMD v2.063.2 and LLVM 3.3.1)

Run-time, seconds:
test1.c: 1.01
test2.d: 4.14
test3.d: 6.62

---------------------

// test1.c
#include <stdio.h>
#include <math.h>
#include <stdlib.h>

static inline float foo(const float x, const float y) {
return floorf(x) + floorf(y);
}

/* Benchmark driver: sums foo() over a 256 x 256 grid of coordinates
   scaled by 0.1f, repeats the whole sweep 1000 times, and prints the
   accumulated total. */
int main() {
    float total = 0.0f;
    int pass = 0;

    while (pass < 1000) {
        for (int row = 0; row < 256; row++) {
            for (int col = 0; col < 256; col++) {
                total += foo(col * 0.1f, row * 0.1f);
            }
        }
        pass++;
    }

    printf("%f\n", total);
    return 0;
}

---------------------

// test2.d
import core.stdc.stdio, core.stdc.math;

// Adds the floor of x to the floor of y using the C library's floorf.
float foo(const float x, const float y) nothrow {
    const floorX = floorf(x);
    const floorY = floorf(y);

    return floorX + floorY;
}

// Benchmark driver: sums foo() over a 256 x 256 grid of coordinates
// scaled by 0.1f, repeats the whole sweep 1000 times, and prints the
// accumulated total.
int main() {
    float total = 0.0f;

    foreach (pass; 0 .. 1000) {
        foreach (row; 0 .. 256) {
            foreach (col; 0 .. 256) {
                total += foo(col * 0.1f, row * 0.1f);
            }
        }
    }

    printf("%f\n", total);
    return 0;
}

---------------------

// test3.d
import core.stdc.stdio, std.math;

// Adds the floor of x to the floor of y using std.math.floor.
// The temporaries use inferred types so the addition happens at
// whatever precision floor() returns, exactly as in a single expression.
float foo(const float x, const float y) nothrow {
    const floorX = floor(x);
    const floorY = floor(y);

    return floorX + floorY;
}

// Benchmark driver: sums foo() over a 256 x 256 grid of coordinates
// scaled by 0.1f, repeats the whole sweep 1000 times, and prints the
// accumulated total.
int main() {
    float total = 0.0f;

    foreach (pass; 0 .. 1000) {
        foreach (row; 0 .. 256) {
            foreach (col; 0 .. 256) {
                total += foo(col * 0.1f, row * 0.1f);
            }
        }
    }

    printf("%f\n", total);
    return 0;
}

---------------------

test1.c asm:

# Compiler output for main() of test1.c (AT&T syntax, 32-bit x86, scalar SSE).
# There is no call to floorf in this listing; the rounding is done inline
# with cvttss2si / cvtsi2ss / cmpnless / andps / subss sequences.
# NOTE(review): as shown, no instruction increments %eax or %ecx and nothing
# writes %xmm1 between its zeroing and the final printf, so some instructions
# appear to be missing from this listing - confirm against the real output.
_main:
# Prologue: set up the frame, save %ebx, and use %ebx as a countdown from 1000.
pushl   %ebp
movl    %esp, %ebp
pushl   %ebx
movl    \$1000, %ebx
# Align the stack to 16 bytes and reserve 16 bytes for outgoing arguments.
andl    \$-16, %esp
subl    \$16, %esp
call    ___main
# %xmm1 is zeroed here and is the value converted and printed at the end.
xorps   %xmm1, %xmm1
# LC3 is used below as an andps mask (presumably the abs-value mask; the
# constant's definition is not part of this listing).
movss   LC3, %xmm5
# L2: top of the loop controlled by %ebx.
L2:
# LC1 is the factor multiplied into the converted counters (presumably 0.1f).
movss   LC1, %xmm6
xorps   %xmm3, %xmm3
xorl    %ecx, %ecx
.p2align 4,,7
L9:
movss   LC2, %xmm4
xorps   %xmm0, %xmm0
xorl    %eax, %eax
.p2align 4,,7
# L7: loop on %eax; leaves to L12 once %eax reaches 256.
L7:
cmpl    \$256, %eax
je  L12
# %xmm0 = (float)%eax * %xmm6
cvtsi2ss    %eax, %xmm0
mulss   %xmm6, %xmm0
# Compare the masked copy of the value against LC2; when LC2 is below or
# equal to it (or the compare is unordered) the rounding sequence is skipped.
movaps  %xmm0, %xmm2
andps   %xmm5, %xmm2
ucomiss %xmm2, %xmm4
jbe L7
# Inline rounding: truncate to integer and convert back, build a mask where
# the truncated value is not <= the original, AND it with LC4 (presumably
# 1.0f), and subtract that from the truncated value.
cvttss2si   %xmm0, %edx
cvtsi2ss    %edx, %xmm2
movaps  %xmm2, %xmm7
cmpnless    %xmm0, %xmm7
movaps  %xmm7, %xmm0
movss   LC4, %xmm7
andps   %xmm7, %xmm0
subss   %xmm0, %xmm2
movaps  %xmm2, %xmm0
jmp L7
.p2align 4,,7
# L12: loop on %ecx; leaves to L5 once %ecx reaches 256.
L12:
cmpl    \$256, %ecx
je  L5
# %xmm3 = (float)%ecx * LC1, then the same rounding sequence as above,
# here using LC6 as the mask and leaving the result in %xmm3.
cvtsi2ss    %ecx, %xmm3
movss   LC6, %xmm0
movss   LC2, %xmm2
mulss   LC1, %xmm3
andps   %xmm3, %xmm0
ucomiss %xmm0, %xmm2
jbe L9
cvttss2si   %xmm3, %eax
cvtsi2ss    %eax, %xmm0
movaps  %xmm0, %xmm2
cmpnless    %xmm3, %xmm2
movss   LC4, %xmm3
andps   %xmm3, %xmm2
movaps  %xmm0, %xmm3
subss   %xmm2, %xmm3
jmp L9
# L5: count %ebx down and repeat from L2 until it reaches zero.
L5:
subl    \$1, %ebx
jne L2
# Widen %xmm1 to double and pass it, with the address LC5, on the stack to printf.
unpcklps    %xmm1, %xmm1
movl    \$LC5, (%esp)
cvtps2pd    %xmm1, %xmm5
movsd   %xmm5, 4(%esp)
call    _printf
# Return 0; restore %ebx from the frame and tear the frame down.
xorl    %eax, %eax
movl    -4(%ebp), %ebx
leave
ret

---------------------

test2.d asm:

# Compiler output for main() of test2.d (AT&T syntax, 32-bit x86).
# Unlike an inlined rounding sequence, each rounding here is a real call to
# _floorf, with the argument passed on the stack and the result returned on
# the x87 stack (popped with fstps).
# NOTE(review): as shown, there is no addss combining the two results into
# the running value and no stack release before the final pops, so some
# instructions appear to be missing from this listing - confirm.
__Dmain:
# Prologue: save the three registers used as loop counters, reserve 28 bytes.
pushl	%ebx
pushl	%edi
pushl	%esi
subl	\$28, %esp
# %xmm0 starts at zero and is the value converted and printed at the end.
xorps	%xmm0, %xmm0
# %esi counts up to 1000 (see the compare before "jne LBB1_1").
xorl	%esi, %esi
# LCPI1_0 is the factor multiplied into the converted counters (presumably 0.1f).
movss	LCPI1_0, %xmm1
.align	16, 0x90
LBB1_1:
# %edi counts up to 256 (see the compare before "jne LBB1_2").
xorl	%edi, %edi
.align	16, 0x90
LBB1_2:
# (float)%edi * %xmm1 is computed once per LBB1_2 iteration and kept at 12(%esp).
xorps	%xmm2, %xmm2
cvtsi2ssl	%edi, %xmm2
mulss	%xmm1, %xmm2
movss	%xmm2, 12(%esp)
# %ebx counts up to 256 (see the compare before "jne LBB1_3").
xorl	%ebx, %ebx
.align	16, 0x90
LBB1_3:
# Spill the running value to 16(%esp) before the calls.
movss	%xmm0, 16(%esp)
# First call: argument is (float)%ebx * %xmm1, stored at (%esp).
xorps	%xmm0, %xmm0
cvtsi2ssl	%ebx, %xmm0
mulss	%xmm1, %xmm0
movss	%xmm0, (%esp)
calll	_floorf
# Second call: argument is the value saved at 12(%esp); the first result is
# popped from the x87 stack into 24(%esp) before the call.
movss	12(%esp), %xmm0
movss	%xmm0, (%esp)
fstps	24(%esp)
calll	_floorf
# Reload the factor after the call and pop the second result into 20(%esp).
movss	LCPI1_0, %xmm1
fstps	20(%esp)
movss	24(%esp), %xmm0
movss	16(%esp), %xmm2
movss	%xmm2, 16(%esp)
movss	16(%esp), %xmm0
incl	%ebx
cmpl	\$256, %ebx
jne	LBB1_3
incl	%edi
cmpl	\$256, %edi
jne	LBB1_2
incl	%esi
cmpl	\$1000, %esi
jne	LBB1_1
# Widen %xmm0 to double and pass it, with the address _.str, on the stack
# to ___mingw_printf.
cvtss2sd	%xmm0, %xmm0
movsd	%xmm0, 4(%esp)
movl	\$_.str, (%esp)
calll	___mingw_printf
# Return 0 and restore the saved registers.
xorl	%eax, %eax
popl	%esi
popl	%edi
popl	%ebx
ret

---------------------

test3.d asm:

__Dmain:
pushl   %ebx
pushl   %edi
pushl   %esi
subl    \$52, %esp
xorps   %xmm1, %xmm1
xorl    %esi, %esi
movss   LCPI1_0, %xmm2
.align  16, 0x90
LBB1_1:
xorl    %edi, %edi
.align  16, 0x90
LBB1_2:
xorps   %xmm0, %xmm0
cvtsi2ssl   %edi, %xmm0
mulss   %xmm2, %xmm0
movss   %xmm0, 48(%esp)
xorl    %ebx, %ebx
flds    48(%esp)
fstpt   12(%esp)
movaps  %xmm1, %xmm0
.align  16, 0x90
LBB1_3:
movss   %xmm0, 36(%esp)
xorps   %xmm0, %xmm0
cvtsi2ssl   %ebx, %xmm0
mulss   %xmm2, %xmm0
movss   %xmm0, 44(%esp)
flds    44(%esp)
fstpt   (%esp)
calll   __D3std4math5floorFNbNeeZe
subl    \$12, %esp
fstpt   24(%esp)
fldt    12(%esp)
fstpt   (%esp)
calll   __D3std4math5floorFNbNeeZe
subl    \$12, %esp
movss   36(%esp), %xmm0
movss   LCPI1_0, %xmm2
fldt    24(%esp)
fstps   40(%esp)
incl    %ebx
cmpl    \$256, %ebx
jne LBB1_3
movaps  %xmm0, %xmm1
incl    %edi
cmpl    \$256, %edi
jne LBB1_2
incl    %esi
cmpl    \$1000, %esi
jne LBB1_1
xorps   %xmm0, %xmm0
cvtss2sd    %xmm1, %xmm0
movsd   %xmm0, 4(%esp)
movl    \$_.str, (%esp)
calll   ___mingw_printf
xorl    %eax, %eax