LDC 1.5-1.6 huge degradation of optimization

Mon Nov 27 10:41:44 UTC 2017

Hello!

I have found that, in some cases, LDC 1.5-1.6 generates 
unoptimized code where LDC 1.3-1.4 generated optimized code. I 
have tried to reduce the example and make it as short as possible. 
The goal is to get compiled code that uses AVX(2) instructions.

Here is the source of tst.d with comments to demonstrate the 
problem.

// tst.d

import ldc.attributes;

// ldc1.3-1.4 generates highly optimized code with avx2 instructions
// ldc1.5-1.6 generates the code without any vector instructions
// the command line: ldc2 tst.d -m32 -O3 -release -output-s

alias Arr = ubyte[16][20]; // 20 of 16-ubyte vectors

import ldc.attributes;

/**
 * Reduced test case for the LDC 1.5-1.6 vectorization regression.
 *
 * For each of the 20 rows of `t1` and `t2`, accumulates the squared
 * differences of the 16 ubyte elements into `res`. The only update of
 * `sum` is deliberately left commented out, so `sum` keeps its default
 * initial value and the function returns `sum + res[10]`.
 *
 * Params:
 *   t1 = first array of 20 rows of 16 ubytes
 *   t2 = second array of 20 rows of 16 ubytes
 *
 * Returns: `sum + res[10]`, a dummy value that keeps the result of the
 *          loop in use.
 */
@target("avx2") @nogc pure
auto distance(ref const Arr t1, ref const Arr t2)
{
		int[20] res = void; // every element is assigned in the loop below
		int sum;
		foreach(t, ref r; res) {
			int sv=0;
			foreach(i; 0 .. 16) // the main cycle to be optimized with avx2 instructions
				sv += (t1[t][i]-t2[t][i])^^2;
			r = sv;
			// by uncommenting the following assignment the avx2 optimization is turned on in ldc 1.6
			// sum += sv;
		}
		return sum + res[10]; // return some dummy sum
}

/* ldc1.3 (avx2 instructions are used)
LBB0_1:
	vpmovzxbd	-8(%ecx), %ymm0
	vpmovzxbd	-8(%eax), %ymm1
	vpmovzxbd	(%eax), %ymm2
	addl	$16, %eax
	vpsubd	%ymm1, %ymm0, %ymm0
	vpmovzxbd	(%ecx), %ymm1
	addl	$16, %ecx
	vpmulld	%ymm0, %ymm0, %ymm0
	vpsubd	%ymm2, %ymm1, %ymm1
	vpmulld	%ymm1, %ymm1, %ymm1
	vpaddd	%ymm0, %ymm1, %ymm0
	vextracti128	$1, %ymm0, %xmm1
	vpaddd	%ymm1, %ymm0, %ymm0
	vpshufd	$78, %xmm0, %xmm1
	vpaddd	%ymm1, %ymm0, %ymm0
	vphaddd	%ymm0, %ymm0, %ymm0
	vmovd	%xmm0, (%esp,%edx,4)
	incl	%edx
	cmpl	$20, %edx
	jb	LBB0_1
*/

/* ldc1.6 (avx2 instructions aren't used)
LBB0_1:
	movl	%edx, (%esp)
	movzbl	-15(%ecx), %esi
	movzbl	-15(%eax), %edx
	movzbl	-14(%ecx), %edi
	subl	%edx, %esi
	movzbl	-14(%eax), %edx
	imull	%esi, %esi

	... ; skipped

	imull	%ebp, %ebp
	addl	%ebp, %esi
	movl	36(%esp), %ebp
	imull	%ebp, %ebp
	addl	%ebp, %esi
	movl	32(%esp), %ebp

	... ; skipped

	imull	%edx, %edx
	addl	%esi, %edx
	movl	(%esp), %esi
	movl	%edx, 48(%esp,%esi,4)
	movl	(%esp), %edx
	incl	%edx
	cmpl	$20, %edx
	jb	LBB0_1
*/