floor operation problem

Thu Dec 19 02:09:12 PST 2013

Am Thu, 19 Dec 2013 01:15:27 +0100
schrieb "bearophile" <bearophileHUGS at lycos.com>:

> While debugging a performance problem, I found that the cause
> is the floor operation. Below is a small benchmark that
> demonstrates it.
> 
> I have compiled the code with:
> 
> gcc -Ofast -std=c99 -s -flto -mfpmath=sse -ffast-math -msse3 
> test1.c -o test1
> ldmd2 -O -release -inline -noboundscheck test2.d
> ldmd2 -O -release -inline -noboundscheck test3.d
> 
> 32 bit system
> 
> gcc version 4.8.0
> LDC 0.12.1 (based on DMD v2.063.2 and LLVM 3.3.1)
> 
> Run-time, seconds:
> test1.c: 1.01
> test2.d: 4.14
> test3.d: 6.62
> 
> ---------------------
> 
> // test1.c
> #include <stdio.h>
> #include <math.h>
> #include <stdlib.h>
> 
> static inline float foo(const float x, const float y) {
>      return floorf(x) + floorf(y);
> }
> 
> int main() {
>      float total = 0.0f;
> 
>      for (int i = 0; i < 1000; i++)
>          for (int y = 0; y < 256; y++)
>              for (int x = 0; x < 256; x++)
>                  total += foo(x * 0.1f, y * 0.1f);
> 
>      printf("%f\n", total);
>      return 0;
> }
> 
> ---------------------
> 
> // test2.d
> import core.stdc.stdio, core.stdc.math;
> 
> float foo(const float x, const float y) nothrow {
>      return floorf(x) + floorf(y);
> }
> 
> int main() {
>      float total = 0.0f;
> 
>      for (int i = 0; i < 1000; i++)
>          for (int y = 0; y < 256; y++)
>              for (int x = 0; x < 256; x++)
>                  total += foo(x * 0.1f, y * 0.1f);
> 
>      printf("%f\n", total);
>      return 0;
> }
> 
> ---------------------
> 
> // test3.d
> import core.stdc.stdio, std.math;
> 
> float foo(const float x, const float y) nothrow {
>      return floor(x) + floor(y);
> }
> 
> int main() {
>      float total = 0.0f;
> 
>      for (int i = 0; i < 1000; i++)
>          for (int y = 0; y < 256; y++)
>              for (int x = 0; x < 256; x++)
>                  total += foo(x * 0.1f, y * 0.1f);
> 
>      printf("%f\n", total);
>      return 0;
> }
> 
> ---------------------
> 
> test1.c asm:
> 
> _main:
>      pushl   %ebp
>      movl    %esp, %ebp
>      pushl   %ebx
>      movl    $1000, %ebx
>      andl    $-16, %esp
>      subl    $16, %esp
>      call    ___main
>      xorps   %xmm1, %xmm1
>      movss   LC3, %xmm5
> L2:
>      movss   LC1, %xmm6
>      xorps   %xmm3, %xmm3
>      xorl    %ecx, %ecx
>      .p2align 4,,7
> L9:
>      movss   LC2, %xmm4
>      xorps   %xmm0, %xmm0
>      xorl    %eax, %eax
>      .p2align 4,,7
> L7:
>      addss   %xmm0, %xmm1
>      addl    $1, %eax
>      cmpl    $256, %eax
>      addss   %xmm3, %xmm1
>      je  L12
>      cvtsi2ss    %eax, %xmm0
>      mulss   %xmm6, %xmm0
>      movaps  %xmm0, %xmm2
>      andps   %xmm5, %xmm2
>      ucomiss %xmm2, %xmm4
>      jbe L7
>      cvttss2si   %xmm0, %edx
>      cvtsi2ss    %edx, %xmm2
>      movaps  %xmm2, %xmm7
>      cmpnless    %xmm0, %xmm7
>      movaps  %xmm7, %xmm0
>      movss   LC4, %xmm7
>      andps   %xmm7, %xmm0
>      subss   %xmm0, %xmm2
>      movaps  %xmm2, %xmm0
>      jmp L7
>      .p2align 4,,7
> L12:
>      addl    $1, %ecx
>      cmpl    $256, %ecx
>      je  L5
>      cvtsi2ss    %ecx, %xmm3
>      movss   LC6, %xmm0
>      movss   LC2, %xmm2
>      mulss   LC1, %xmm3
>      andps   %xmm3, %xmm0
>      ucomiss %xmm0, %xmm2
>      jbe L9
>      cvttss2si   %xmm3, %eax
>      cvtsi2ss    %eax, %xmm0
>      movaps  %xmm0, %xmm2
>      cmpnless    %xmm3, %xmm2
>      movss   LC4, %xmm3
>      andps   %xmm3, %xmm2
>      movaps  %xmm0, %xmm3
>      subss   %xmm2, %xmm3
>      jmp L9
> L5:
>      subl    $1, %ebx
>      jne L2
>      unpcklps    %xmm1, %xmm1
>      movl    $LC5, (%esp)
>      cvtps2pd    %xmm1, %xmm5
>      movsd   %xmm5, 4(%esp)
>      call    _printf
>      xorl    %eax, %eax
>      movl    -4(%ebp), %ebx
>      leave
>      ret
> 
> ---------------------
> 
> test2.d asm:
> 
> __Dmain:
> 	pushl	%ebx
> 	pushl	%edi
> 	pushl	%esi
> 	subl	$28, %esp
> 	xorps	%xmm0, %xmm0
> 	xorl	%esi, %esi
> 	movss	LCPI1_0, %xmm1
> 	.align	16, 0x90
> LBB1_1:
> 	xorl	%edi, %edi
> 	.align	16, 0x90
> LBB1_2:
> 	xorps	%xmm2, %xmm2
> 	cvtsi2ssl	%edi, %xmm2
> 	mulss	%xmm1, %xmm2
> 	movss	%xmm2, 12(%esp)
> 	xorl	%ebx, %ebx
> 	.align	16, 0x90
> LBB1_3:
> 	movss	%xmm0, 16(%esp)
> 	xorps	%xmm0, %xmm0
> 	cvtsi2ssl	%ebx, %xmm0
> 	mulss	%xmm1, %xmm0
> 	movss	%xmm0, (%esp)
> 	calll	_floorf
> 	movss	12(%esp), %xmm0
> 	movss	%xmm0, (%esp)
> 	fstps	24(%esp)
> 	calll	_floorf
> 	movss	LCPI1_0, %xmm1
> 	fstps	20(%esp)
> 	movss	24(%esp), %xmm0
> 	addss	20(%esp), %xmm0
> 	movss	16(%esp), %xmm2
> 	addss	%xmm0, %xmm2
> 	movss	%xmm2, 16(%esp)
> 	movss	16(%esp), %xmm0
> 	incl	%ebx
> 	cmpl	$256, %ebx
> 	jne	LBB1_3
> 	incl	%edi
> 	cmpl	$256, %edi
> 	jne	LBB1_2
> 	incl	%esi
> 	cmpl	$1000, %esi
> 	jne	LBB1_1
> 	cvtss2sd	%xmm0, %xmm0
> 	movsd	%xmm0, 4(%esp)
> 	movl	$_.str, (%esp)
> 	calll	___mingw_printf
> 	xorl	%eax, %eax
> 	addl	$28, %esp
> 	popl	%esi
> 	popl	%edi
> 	popl	%ebx
> 	ret
> 
> ---------------------
> 
> test3.d asm:
> 
> __Dmain:
>      pushl   %ebx
>      pushl   %edi
>      pushl   %esi
>      subl    $52, %esp
>      xorps   %xmm1, %xmm1
>      xorl    %esi, %esi
>      movss   LCPI1_0, %xmm2
>      .align  16, 0x90
> LBB1_1:
>      xorl    %edi, %edi
>      .align  16, 0x90
> LBB1_2:
>      xorps   %xmm0, %xmm0
>      cvtsi2ssl   %edi, %xmm0
>      mulss   %xmm2, %xmm0
>      movss   %xmm0, 48(%esp)
>      xorl    %ebx, %ebx
>      flds    48(%esp)
>      fstpt   12(%esp)
>      movaps  %xmm1, %xmm0
>      .align  16, 0x90
> LBB1_3:
>      movss   %xmm0, 36(%esp)
>      xorps   %xmm0, %xmm0
>      cvtsi2ssl   %ebx, %xmm0
>      mulss   %xmm2, %xmm0
>      movss   %xmm0, 44(%esp)
>      flds    44(%esp)
>      fstpt   (%esp)
>      calll   __D3std4math5floorFNbNeeZe
>      subl    $12, %esp
>      fstpt   24(%esp)
>      fldt    12(%esp)
>      fstpt   (%esp)
>      calll   __D3std4math5floorFNbNeeZe
>      subl    $12, %esp
>      movss   36(%esp), %xmm0
>      movss   LCPI1_0, %xmm2
>      fldt    24(%esp)
>      faddp   %st(1)
>      fstps   40(%esp)
>      addss   40(%esp), %xmm0
>      incl    %ebx
>      cmpl    $256, %ebx
>      jne LBB1_3
>      movaps  %xmm0, %xmm1
>      incl    %edi
>      cmpl    $256, %edi
>      jne LBB1_2
>      incl    %esi
>      cmpl    $1000, %esi
>      jne LBB1_1
>      xorps   %xmm0, %xmm0
>      cvtss2sd    %xmm1, %xmm0
>      movsd   %xmm0, 4(%esp)
>      movl    $_.str, (%esp)
>      calll   ___mingw_printf
>      xorl    %eax, %eax
>      addl    $52, %esp
>      popl    %esi
>      popl    %edi
>      popl    %ebx
>      ret
> 
> ---------------------
> 
> Bye,
> bearophile

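But... -ffast-math isn't kosher: it relaxes strict IEEE 754
floating-point semantics, and only the GCC build was compiled with it,
so the test1.c timing is not an apples-to-apples comparison with the
two LDC builds.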

-- 
Marco