floor operation problem
Marco Leise
Marco.Leise at gmx.de
Thu Dec 19 02:09:12 PST 2013
Am Thu, 19 Dec 2013 01:15:27 +0100
schrieb "bearophile" <bearophileHUGS at lycos.com>:
> While debugging a performance problem, I found that the cause
> is the floor operation. Below is a small benchmark that
> demonstrates it.
>
> I have compiled the code with:
>
> gcc -Ofast -std=c99 -s -flto -mfpmath=sse -ffast-math -msse3 test1.c -o test1
> ldmd2 -O -release -inline -noboundscheck test2.d
> ldmd2 -O -release -inline -noboundscheck test3.d
>
> 32-bit system
>
> gcc version 4.8.0
> LDC 0.12.1 (based on DMD v2.063.2 and LLVM 3.3.1)
>
> Run-time, seconds:
> test1.c: 1.01
> test2.d: 4.14
> test3.d: 6.62
>
> ---------------------
>
> // test1.c
> #include <stdio.h>
> #include <math.h>
> #include <stdlib.h>
>
> static inline float foo(const float x, const float y) {
> return floorf(x) + floorf(y);
> }
>
> int main() {
> float total = 0.0f;
>
> for (int i = 0; i < 1000; i++)
> for (int y = 0; y < 256; y++)
> for (int x = 0; x < 256; x++)
> total += foo(x * 0.1f, y * 0.1f);
>
> printf("%f\n", total);
> return 0;
> }
>
> ---------------------
>
> // test2.d
> import core.stdc.stdio, core.stdc.math;
>
> float foo(const float x, const float y) nothrow {
> return floorf(x) + floorf(y);
> }
>
> int main() {
> float total = 0.0f;
>
> for (int i = 0; i < 1000; i++)
> for (int y = 0; y < 256; y++)
> for (int x = 0; x < 256; x++)
> total += foo(x * 0.1f, y * 0.1f);
>
> printf("%f\n", total);
> return 0;
> }
>
> ---------------------
>
> // test3.d
> import core.stdc.stdio, std.math;
>
> float foo(const float x, const float y) nothrow {
> return floor(x) + floor(y);
> }
>
> int main() {
> float total = 0.0f;
>
> for (int i = 0; i < 1000; i++)
> for (int y = 0; y < 256; y++)
> for (int x = 0; x < 256; x++)
> total += foo(x * 0.1f, y * 0.1f);
>
> printf("%f\n", total);
> return 0;
> }
>
> ---------------------
>
> test1.c asm:
>
> _main:
> pushl %ebp
> movl %esp, %ebp
> pushl %ebx
> movl $1000, %ebx
> andl $-16, %esp
> subl $16, %esp
> call ___main
> xorps %xmm1, %xmm1
> movss LC3, %xmm5
> L2:
> movss LC1, %xmm6
> xorps %xmm3, %xmm3
> xorl %ecx, %ecx
> .p2align 4,,7
> L9:
> movss LC2, %xmm4
> xorps %xmm0, %xmm0
> xorl %eax, %eax
> .p2align 4,,7
> L7:
> addss %xmm0, %xmm1
> addl $1, %eax
> cmpl $256, %eax
> addss %xmm3, %xmm1
> je L12
> cvtsi2ss %eax, %xmm0
> mulss %xmm6, %xmm0
> movaps %xmm0, %xmm2
> andps %xmm5, %xmm2
> ucomiss %xmm2, %xmm4
> jbe L7
> cvttss2si %xmm0, %edx
> cvtsi2ss %edx, %xmm2
> movaps %xmm2, %xmm7
> cmpnless %xmm0, %xmm7
> movaps %xmm7, %xmm0
> movss LC4, %xmm7
> andps %xmm7, %xmm0
> subss %xmm0, %xmm2
> movaps %xmm2, %xmm0
> jmp L7
> .p2align 4,,7
> L12:
> addl $1, %ecx
> cmpl $256, %ecx
> je L5
> cvtsi2ss %ecx, %xmm3
> movss LC6, %xmm0
> movss LC2, %xmm2
> mulss LC1, %xmm3
> andps %xmm3, %xmm0
> ucomiss %xmm0, %xmm2
> jbe L9
> cvttss2si %xmm3, %eax
> cvtsi2ss %eax, %xmm0
> movaps %xmm0, %xmm2
> cmpnless %xmm3, %xmm2
> movss LC4, %xmm3
> andps %xmm3, %xmm2
> movaps %xmm0, %xmm3
> subss %xmm2, %xmm3
> jmp L9
> L5:
> subl $1, %ebx
> jne L2
> unpcklps %xmm1, %xmm1
> movl $LC5, (%esp)
> cvtps2pd %xmm1, %xmm5
> movsd %xmm5, 4(%esp)
> call _printf
> xorl %eax, %eax
> movl -4(%ebp), %ebx
> leave
> ret
>
> ---------------------
>
> test2.d asm:
>
> __Dmain:
> pushl %ebx
> pushl %edi
> pushl %esi
> subl $28, %esp
> xorps %xmm0, %xmm0
> xorl %esi, %esi
> movss LCPI1_0, %xmm1
> .align 16, 0x90
> LBB1_1:
> xorl %edi, %edi
> .align 16, 0x90
> LBB1_2:
> xorps %xmm2, %xmm2
> cvtsi2ssl %edi, %xmm2
> mulss %xmm1, %xmm2
> movss %xmm2, 12(%esp)
> xorl %ebx, %ebx
> .align 16, 0x90
> LBB1_3:
> movss %xmm0, 16(%esp)
> xorps %xmm0, %xmm0
> cvtsi2ssl %ebx, %xmm0
> mulss %xmm1, %xmm0
> movss %xmm0, (%esp)
> calll _floorf
> movss 12(%esp), %xmm0
> movss %xmm0, (%esp)
> fstps 24(%esp)
> calll _floorf
> movss LCPI1_0, %xmm1
> fstps 20(%esp)
> movss 24(%esp), %xmm0
> addss 20(%esp), %xmm0
> movss 16(%esp), %xmm2
> addss %xmm0, %xmm2
> movss %xmm2, 16(%esp)
> movss 16(%esp), %xmm0
> incl %ebx
> cmpl $256, %ebx
> jne LBB1_3
> incl %edi
> cmpl $256, %edi
> jne LBB1_2
> incl %esi
> cmpl $1000, %esi
> jne LBB1_1
> cvtss2sd %xmm0, %xmm0
> movsd %xmm0, 4(%esp)
> movl $_.str, (%esp)
> calll ___mingw_printf
> xorl %eax, %eax
> addl $28, %esp
> popl %esi
> popl %edi
> popl %ebx
> ret
>
> ---------------------
>
> test3.d asm:
>
> __Dmain:
> pushl %ebx
> pushl %edi
> pushl %esi
> subl $52, %esp
> xorps %xmm1, %xmm1
> xorl %esi, %esi
> movss LCPI1_0, %xmm2
> .align 16, 0x90
> LBB1_1:
> xorl %edi, %edi
> .align 16, 0x90
> LBB1_2:
> xorps %xmm0, %xmm0
> cvtsi2ssl %edi, %xmm0
> mulss %xmm2, %xmm0
> movss %xmm0, 48(%esp)
> xorl %ebx, %ebx
> flds 48(%esp)
> fstpt 12(%esp)
> movaps %xmm1, %xmm0
> .align 16, 0x90
> LBB1_3:
> movss %xmm0, 36(%esp)
> xorps %xmm0, %xmm0
> cvtsi2ssl %ebx, %xmm0
> mulss %xmm2, %xmm0
> movss %xmm0, 44(%esp)
> flds 44(%esp)
> fstpt (%esp)
> calll __D3std4math5floorFNbNeeZe
> subl $12, %esp
> fstpt 24(%esp)
> fldt 12(%esp)
> fstpt (%esp)
> calll __D3std4math5floorFNbNeeZe
> subl $12, %esp
> movss 36(%esp), %xmm0
> movss LCPI1_0, %xmm2
> fldt 24(%esp)
> faddp %st(1)
> fstps 40(%esp)
> addss 40(%esp), %xmm0
> incl %ebx
> cmpl $256, %ebx
> jne LBB1_3
> movaps %xmm0, %xmm1
> incl %edi
> cmpl $256, %edi
> jne LBB1_2
> incl %esi
> cmpl $1000, %esi
> jne LBB1_1
> xorps %xmm0, %xmm0
> cvtss2sd %xmm1, %xmm0
> movsd %xmm0, 4(%esp)
> movl $_.str, (%esp)
> calll ___mingw_printf
> xorl %eax, %eax
> addl $52, %esp
> popl %esi
> popl %edi
> popl %ebx
> ret
>
> ---------------------
>
> Bye,
> bearophile
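But... -ffast-math isn't kosher: the C build is compiled with -Ofast -ffast-math, which lets GCC relax strict IEEE floating-point semantics, while the two D builds are given no equivalent flag, so the timings are not a like-for-like comparison.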
--
Marco
More information about the digitalmars-d-ldc mailing list