LDC 1.5-1.6 huge degradation of optimization
Igor Shirkalin
mathsoft at inbox.ru
Mon Nov 27 10:41:44 UTC 2017
Hello!
I have found that LDC1.5-1.6 generate unoptimized code in
contrast to LDC1.3-1.4 in some cases. I tried to extract the
example and make it as short as possible. The goal is to get the
compiled code with avx(2) instructions.
Here is the source of tst.d with comments to demonstrate the
problem.
// tst.d
//
// Minimal reproduction of a vectorization regression:
//   - LDC 1.3-1.4 generate highly optimized code with AVX2 instructions;
//   - LDC 1.5-1.6 generate the code without any vector instructions.
// Command line: ldc2 tst.d -m32 -O3 -release -output-s
import ldc.attributes;

alias Arr = ubyte[16][20]; // 20 vectors of 16 ubytes each

/**
 * Computes, for each of the 20 rows, the sum of squared element-wise
 * differences between the corresponding 16-ubyte rows of `t1` and `t2`,
 * storing the per-row results in a local array.
 *
 * Params:
 *   t1 = first table of 20 rows x 16 ubytes
 *   t2 = second table of 20 rows x 16 ubytes
 *
 * Returns: `sum + res[10]`. While the `sum += sv;` line below stays
 *          commented out, `sum` is never modified, so the result is the
 *          squared distance of row 10 only (a dummy value that keeps the
 *          computation observable).
 */
@target("avx2") @nogc pure
auto distance(ref const Arr t1, ref const Arr t2)
{
    int[20] res = void; // left uninitialized: every element is assigned in the loop below
    int sum;            // default-initialized to 0
    foreach (t, ref r; res) {
        int sv = 0;
        // The main loop to be optimized with AVX2 instructions.
        foreach (i; 0 .. 16)
            sv += (t1[t][i] - t2[t][i]) ^^ 2;
        r = sv;
        // By uncommenting the following assignment the AVX2
        // optimization is turned on in LDC 1.6.
        // sum += sv;
    }
    return sum + res[10]; // return some dummy sum
}
/* ldc1.3 (avx2 instructions are used)
LBB0_1:
vpmovzxbd -8(%ecx), %ymm0
vpmovzxbd -8(%eax), %ymm1
vpmovzxbd (%eax), %ymm2
addl $16, %eax
vpsubd %ymm1, %ymm0, %ymm0
vpmovzxbd (%ecx), %ymm1
addl $16, %ecx
vpmulld %ymm0, %ymm0, %ymm0
vpsubd %ymm2, %ymm1, %ymm1
vpmulld %ymm1, %ymm1, %ymm1
vpaddd %ymm0, %ymm1, %ymm0
vextracti128 $1, %ymm0, %xmm1
vpaddd %ymm1, %ymm0, %ymm0
vpshufd $78, %xmm0, %xmm1
vpaddd %ymm1, %ymm0, %ymm0
vphaddd %ymm0, %ymm0, %ymm0
vmovd %xmm0, (%esp,%edx,4)
incl %edx
cmpl $20, %edx
jb LBB0_1
*/
/* ldc1.6 (avx2 instructions aren't used)
LBB0_1:
movl %edx, (%esp)
movzbl -15(%ecx), %esi
movzbl -15(%eax), %edx
movzbl -14(%ecx), %edi
subl %edx, %esi
movzbl -14(%eax), %edx
imull %esi, %esi
... ; skipped
imull %ebp, %ebp
addl %ebp, %esi
movl 36(%esp), %ebp
imull %ebp, %ebp
addl %ebp, %esi
movl 32(%esp), %ebp
... ; skipped
imull %edx, %edx
addl %esi, %edx
movl (%esp), %esi
movl %edx, 48(%esp,%esi,4)
movl (%esp), %edx
incl %edx
cmpl $20, %edx
jb LBB0_1
*/
More information about the digitalmars-d-ldc
mailing list