D-specific optimisation: opCmp
John Colvin via digitalmars-d-ldc
digitalmars-d-ldc at puremagic.com
Wed Jan 13 09:55:24 PST 2016
opCmp is a pretty roundabout way of dealing with comparisons from
a computational point of view and optimisers seem quite bad at
dealing with it. For example, based on
https://github.com/D-Programming-Language/phobos/pull/3927 :
% cat comparisons.d
/// Three-way comparison of two floats, returned as a float so that an
/// "unordered" outcome can be represented.
/// Returns: -1 if a < b, +1 if a > b, 0 if a == b, and float.nan when
/// none of those three comparisons is true.
float opCmp(float a, float b)
{
// The chained ternaries test <, then >, then ==; float.nan is the
// fall-through when every test fails.
return a < b ? -1 : a > b ? +1 : a == b ? 0 : float.nan;
}
/// "Greater than" expressed through opCmp: the bool result of
/// `opCmp(a, b) > 0` is returned as an int (1 or 0).
int gt(float a, float b) {
return opCmp(a, b) > 0;
}
/// "Greater than or equal" expressed through opCmp: the bool result of
/// `opCmp(a, b) >= 0` is returned as an int (1 or 0).
int gte(float a, float b) {
return opCmp(a, b) >= 0;
}
/// "Less than" expressed through opCmp: the bool result of
/// `opCmp(a, b) < 0` is returned as an int (1 or 0).
int lt(float a, float b) {
return opCmp(a, b) < 0;
}
/// "Less than or equal" expressed through opCmp: the bool result of
/// `opCmp(a, b) <= 0` is returned as an int (1 or 0).
int lte(float a, float b) {
return opCmp(a, b) <= 0;
}
/// Reference implementation: uses the built-in `>` on the floats directly
/// (no opCmp) and returns the bool result as an int (1 or 0).
int gt_direct(float a, float b) {
return a > b;
}
/// Reference implementation: uses the built-in `>=` on the floats directly
/// (no opCmp) and returns the bool result as an int (1 or 0).
int gte_direct(float a, float b) {
return a >= b;
}
/// Reference implementation: uses the built-in `<` on the floats directly
/// (no opCmp) and returns the bool result as an int (1 or 0).
int lt_direct(float a, float b) {
return a < b;
}
/// Reference implementation: uses the built-in `<=` on the floats directly
/// (no opCmp) and returns the bool result as an int (1 or 0).
int lte_direct(float a, float b) {
return a <= b;
}
% ldmd2 -O -inline -release -output-s comparisons.d
% cat comparisons.s
## Compiler output for opCmp(float a, float b), AT&T syntax.
## Returns a float in %xmm0: -1.0, +1.0, 0.0 or NaN.
## NOTE(review): matching the branches below to the D source implies
## a is in %xmm1 and b is in %xmm0 -- confirm against LDC's D ABI.
.section __TEXT,__text,regular,pure_instructions
## 4-byte float literals used by opCmp, stored as raw bit patterns.
.section __TEXT,__literal4,4byte_literals
.align 2
LCPI0_0:
## 0xBF800000 = -1.0f
.long 3212836864
LCPI0_1:
## 0x3F800000 = +1.0f
.long 1065353216
LCPI0_2:
## 0x7FC00000 = quiet NaN
.long 2143289344
.section __TEXT,__text,regular,pure_instructions
.globl __D11comparisons5opCmpFffZf
.align 4, 0x90
__D11comparisons5opCmpFffZf:
.cfi_startproc
## Flags from comparing %xmm0 against %xmm1; jbe is taken unless
## %xmm0 > %xmm1 (it is also taken when the operands are unordered).
ucomiss %xmm1, %xmm0
jbe LBB0_2
## %xmm0 > %xmm1: return -1.0.
movss LCPI0_0(%rip), %xmm1
movaps %xmm1, %xmm0
retq
LBB0_2:
## Same test with the operands swapped: fall through only if
## %xmm1 > %xmm0.
ucomiss %xmm0, %xmm1
jbe LBB0_5
## %xmm1 > %xmm0: return +1.0.
movss LCPI0_1(%rip), %xmm1
movaps %xmm1, %xmm0
retq
LBB0_5:
## Neither operand is strictly greater: equal or unordered.
## cmpeqss leaves an all-ones mask in the low dword of %xmm1 when the
## operands are equal, and zero otherwise.
cmpeqss %xmm0, %xmm1
## %xmm0 = NaN literal.
movss LCPI0_2(%rip), %xmm0
## %xmm1 = ~mask & NaN: 0.0 when equal, NaN when unordered.
andnps %xmm0, %xmm1
movaps %xmm1, %xmm0
retq
.cfi_endproc
## Compiler output for gt(a, b) = opCmp(a, b) > 0. No call to opCmp
## remains; the result (0 or 1) is returned in %eax.
## Needs two compares and two branches, versus one compare and a setcc
## in gt_direct.
.globl __D11comparisons2gtFffZi
.align 4, 0x90
__D11comparisons2gtFffZi:
.cfi_startproc
## Preset result 0, returned as-is if %xmm0 > %xmm1.
xorl %eax, %eax
ucomiss %xmm1, %xmm0
ja LBB1_3
## Preset result 1, returned as-is if %xmm1 > %xmm0.
movl $1, %eax
ucomiss %xmm0, %xmm1
ja LBB1_3
## Neither is strictly greater (equal or unordered): result 0.
xorl %eax, %eax
LBB1_3:
retq
.cfi_endproc
## Compiler output for gte(a, b) = opCmp(a, b) >= 0. No call to opCmp
## remains; the result (0 or 1) is returned in %eax.
## Needs two compares and a branch, versus one compare and a setcc in
## gte_direct.
.globl __D11comparisons3gteFffZi
.align 4, 0x90
__D11comparisons3gteFffZi:
.cfi_startproc
## jbe is taken unless %xmm0 > %xmm1.
ucomiss %xmm1, %xmm0
jbe LBB2_2
## %xmm0 > %xmm1: return 0. The movzbl after the xorl is redundant,
## since %eax is already zero.
xorl %eax, %eax
movzbl %al, %eax
retq
LBB2_2:
## %al = 1 if %xmm1 >= %xmm0 and the operands are ordered, else 0;
## then zero-extend into %eax.
ucomiss %xmm0, %xmm1
setae %al
movzbl %al, %eax
retq
.cfi_endproc
## Compiler output for lt(a, b) = opCmp(a, b) < 0. This is the one
## opCmp-based wrapper that was reduced to the same instruction sequence
## as its *_direct counterpart (lt_direct).
.globl __D11comparisons2ltFffZi
.align 4, 0x90
__D11comparisons2ltFffZi:
.cfi_startproc
## %al = 1 if %xmm0 > %xmm1 (ordered), else 0; zero-extend into %eax.
ucomiss %xmm1, %xmm0
seta %al
movzbl %al, %eax
retq
.cfi_endproc
## Compiler output for lte(a, b) = opCmp(a, b) <= 0. No call to opCmp
## remains; the result (0 or 1) is returned in %eax.
## Needs a compare, a branch and a separate equality test, versus one
## compare and a setcc in lte_direct.
.globl __D11comparisons3lteFffZi
.align 4, 0x90
__D11comparisons3lteFffZi:
.cfi_startproc
## Preset result 1 in %al, kept if %xmm0 > %xmm1.
movb $1, %al
ucomiss %xmm1, %xmm0
ja LBB4_2
## Otherwise the result is the equality test: cmpeqss leaves an
## all-ones mask in the low dword of %xmm1 when the operands are equal
## (zero otherwise); move it to %eax and keep only bit 0.
cmpeqss %xmm0, %xmm1
movd %xmm1, %eax
andl $1, %eax
LBB4_2:
## Zero-extend %al so the upper bits of %eax are defined on both paths.
movzbl %al, %eax
retq
.cfi_endproc
## Compiler output for gt_direct(a, b) = a > b: one compare plus a setcc,
## no branches. The result (0 or 1) is returned in %eax.
.globl __D11comparisons9gt_directFffZi
.align 4, 0x90
__D11comparisons9gt_directFffZi:
.cfi_startproc
## %al = 1 if %xmm1 > %xmm0 (ordered), else 0; zero-extend into %eax.
## This matches `a > b` if a is in %xmm1 and b is in %xmm0.
ucomiss %xmm0, %xmm1
seta %al
movzbl %al, %eax
retq
.cfi_endproc
## Compiler output for gte_direct(a, b) = a >= b: one compare plus a
## setcc, no branches. The result (0 or 1) is returned in %eax.
.globl __D11comparisons10gte_directFffZi
.align 4, 0x90
__D11comparisons10gte_directFffZi:
.cfi_startproc
## %al = 1 if %xmm1 >= %xmm0 (ordered), else 0; zero-extend into %eax.
ucomiss %xmm0, %xmm1
setae %al
movzbl %al, %eax
retq
.cfi_endproc
## Compiler output for lt_direct(a, b) = a < b: one compare plus a setcc,
## no branches. The result (0 or 1) is returned in %eax.
.globl __D11comparisons9lt_directFffZi
.align 4, 0x90
__D11comparisons9lt_directFffZi:
.cfi_startproc
## %al = 1 if %xmm0 > %xmm1 (ordered), else 0; zero-extend into %eax.
## The operands are swapped relative to gt_direct, so the same seta
## implements "less than".
ucomiss %xmm1, %xmm0
seta %al
movzbl %al, %eax
retq
.cfi_endproc
## Compiler output for lte_direct(a, b) = a <= b: one compare plus a
## setcc, no branches. The result (0 or 1) is returned in %eax.
.globl __D11comparisons10lte_directFffZi
.align 4, 0x90
__D11comparisons10lte_directFffZi:
.cfi_startproc
## %al = 1 if %xmm0 >= %xmm1 (ordered), else 0; zero-extend into %eax.
## The operands are swapped relative to gte_direct, so the same setae
## implements "less than or equal".
ucomiss %xmm1, %xmm0
setae %al
movzbl %al, %eax
retq
.cfi_endproc
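See how much better the code-gen is for the direct
implementations? Only lt compiles to the same code as its direct
counterpart; gt, gte and lte each need a second comparison and a
branch. It would be great if LDC were somehow able to get this
right.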
More information about the digitalmars-d-ldc
mailing list