inline functions
bearophile
bearophileHUGS at lycos.com
Sat Mar 26 12:47:03 PDT 2011
Caligo:
> There shouldn't be a performance difference between the two, but there is.
It seems the compiler isn't removing some useless code. Each group of six movsd instructions below is a 24-byte block copy; the first version has 3 such groups, the second has 4:
------------
v = v * 1.00000012;
main:
L45: mov ESI,offset FLAT:_D4test6Vector6__initZ
lea EDI,068h[ESP]
movsd
movsd
movsd
movsd
movsd
movsd
fld qword ptr 010h[ESP]
fld qword ptr 018h[ESP]
fxch ST1
fmul qword ptr FLAT:_DATA[018h]
lea ESI,068h[ESP]
lea EDI,048h[ESP]
fxch ST1
fmul qword ptr FLAT:_DATA[018h]
fld qword ptr 8[ESP]
fmul qword ptr FLAT:_DATA[018h]
fxch ST2
fstp qword ptr 080h[ESP]
fxch ST1
fld qword ptr 080h[ESP]
fxch ST2
fstp qword ptr 088h[ESP]
fxch ST1
fld qword ptr 088h[ESP]
fxch ST2
fstp qword ptr 068h[ESP]
fstp qword ptr 070h[ESP]
fstp qword ptr 078h[ESP]
movsd
movsd
movsd
movsd
movsd
movsd
lea ESI,048h[ESP]
lea EDI,8[ESP]
movsd
movsd
movsd
movsd
movsd
movsd
inc EAX
cmp EAX,03938700h
jb L45
-----------------------------
v = 1.00000012 * v;
main:
L45: mov ESI,offset FLAT:_D4test6Vector6__initZ
lea EDI,088h[ESP]
movsd
movsd
movsd
movsd
movsd
movsd
fld qword ptr 010h[ESP]
fld qword ptr 018h[ESP]
fxch ST1
fmul qword ptr FLAT:_DATA[018h]
lea ESI,088h[ESP]
fxch ST1
fmul qword ptr FLAT:_DATA[018h]
fld qword ptr 8[ESP]
fxch ST2
lea EDI,068h[ESP]
fxch ST2
fmul qword ptr FLAT:_DATA[018h]
fxch ST2
fstp qword ptr 0A0h[ESP]
fxch ST1
fld qword ptr 0A0h[ESP]
fxch ST2
fstp qword ptr 0A8h[ESP]
fxch ST1
fld qword ptr 0A8h[ESP]
fxch ST2
fstp qword ptr 088h[ESP]
fstp qword ptr 090h[ESP]
fstp qword ptr 098h[ESP]
movsd
movsd
movsd
movsd
movsd
movsd
lea ESI,068h[ESP]
lea EDI,048h[ESP]
movsd
movsd
movsd
movsd
movsd
movsd
lea ESI,048h[ESP]
lea EDI,8[ESP]
movsd
movsd
movsd
movsd
movsd
movsd
inc EAX
cmp EAX,03938700h
jb L45
-----------------
v.x *= 1.00000012; v.y *= 1.00000012; v.z *= 1.00000012;
L42: fld qword ptr FLAT:_DATA[018h]
inc EAX
cmp EAX,03938700h
fmul qword ptr 8[ESP]
fstp qword ptr 8[ESP]
fld qword ptr FLAT:_DATA[018h]
fmul qword ptr 010h[ESP]
fstp qword ptr 010h[ESP]
fld qword ptr FLAT:_DATA[018h]
fmul qword ptr 018h[ESP]
fstp qword ptr 018h[ESP]
jb L42
-----------------
C GCC uses only 5 instructions per loop iteration, which shows how much this can be improved:
v.x *= 1.00000012; v.y *= 1.00000012; v.z *= 1.00000012;
L2:
fmul %st, %st(3)
subl $1, %eax
fmul %st, %st(2)
fmul %st, %st(1)
jne L2
-----------------
C GCC, -mfpmath=sse -msse3
v.x *= 1.00000012; v.y *= 1.00000012; v.z *= 1.00000012;
L2:
subl $1, %eax
mulsd %xmm0, %xmm1
mulsd %xmm0, %xmm2
mulsd %xmm0, %xmm3
jne L2
-----------------
C GCC, -mfpmath=sse -msse3 -funroll-loops
L2:
subl $8, %eax
mulsd %xmm0, %xmm1
mulsd %xmm0, %xmm2
mulsd %xmm0, %xmm3
mulsd %xmm0, %xmm1
mulsd %xmm0, %xmm2
mulsd %xmm0, %xmm3
mulsd %xmm0, %xmm1
mulsd %xmm0, %xmm2
mulsd %xmm0, %xmm3
mulsd %xmm0, %xmm1
mulsd %xmm0, %xmm2
mulsd %xmm0, %xmm3
mulsd %xmm0, %xmm1
mulsd %xmm0, %xmm2
mulsd %xmm0, %xmm3
mulsd %xmm0, %xmm1
mulsd %xmm0, %xmm2
mulsd %xmm0, %xmm3
mulsd %xmm0, %xmm1
mulsd %xmm0, %xmm2
mulsd %xmm0, %xmm3
mulsd %xmm0, %xmm1
mulsd %xmm0, %xmm2
mulsd %xmm0, %xmm3
jne L2
I have not found a quick way to let GCC vectorize this code, that is, to perform two multiplications with one SSE instruction. I am not sure GCC is able to do this automatically.
Bye,
bearophile
More information about the Digitalmars-d-learn
mailing list