Optimizing for SIMD: best practices? (i.e. what features are allowed?)

z z at z.com
Sun Mar 7 13:26:37 UTC 2021


On Thursday, 25 February 2021 at 11:28:14 UTC, z wrote:
> ...
It seems that using static foreach with pointer parameters exclusively is the best way to "guide" LDC into optimizing the code (using the arr1[] += arr2[] slice syntax resulted in worse performance for me; see the sketch below).
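For reference, the slice-syntax version I mean would look roughly like this (a sketch for comparison, assuming the simdf/simdlength/sqrt declarations from the listing further down; not the exact code I benchmarked):

>//Slice-operation variant (sketch): shorter, but optimized worse for me.
>void vEUCLIDps_slice(simdf* a0, simdf* a1, simdf* a2, simdf* a3, simdf* b1, simdf* b2, simdf* b3) {
>	simdf amm1 = *a1;
>	simdf amm2 = *a2;
>	simdf amm3 = *a3;
>	amm1[] -= (*b1)[];//array ops do compile to SIMD, just not as tightly here
>	amm1[] *= amm1[];
>	amm2[] -= (*b2)[];
>	amm2[] *= amm2[];
>	amm3[] -= (*b3)[];
>	amm3[] *= amm3[];
>	simdf amm0;
>	amm0[] = amm1[] + amm2[];
>	amm0[] += amm3[];
>	foreach(i; 0..simdlength)
>		amm0[i] = sqrt(amm0[i]);
>	*a0 = amm0;
>}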
However, AVX512 support seems limited to being able to use the 16 additional YMM registers (YMM16-YMM31), rather than reusing the same code template with ZMM registers and doubled offsets to take advantage of the new width.
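What I was hoping for is the same template at ZMM width, something like this hypothetical float[16] variant (a sketch only: simdf16 is an illustrative name, it reuses the sqrt import from the listing below, and it would also need something like -mcpu=skylake-avx512):

>//Hypothetical ZMM-width variant of the same template (sketch, not actual AVX512 output).
>alias simdf16 = float[16];
>
>extern(C)
>void vEUCLIDpsptr_zmm(simdf16* a0, simdf16* a1, simdf16* a2, simdf16* a3, simdf16* b1, simdf16* b2, simdf16* b3) {
>	simdf16 amm0 = 0f;//block-initializes every lane to zero
>	simdf16 amm1 = *a1;
>	simdf16 amm2 = *a2;
>	simdf16 amm3 = *a3;
>	static foreach(size_t i; 0..simdf16.length) {
>		amm1[i] -= (*b1)[i];
>		amm1[i] *= amm1[i];
>		amm2[i] -= (*b2)[i];
>		amm2[i] *= amm2[i];
>		amm3[i] -= (*b3)[i];
>		amm3[i] *= amm3[i];
>		amm0[i] += amm1[i];
>		amm0[i] += amm2[i];
>		amm0[i] += amm3[i];
>		amm0[i] = sqrt(amm0[i]);
>	}
>	*a0 = amm0;
>}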
Compiled with «-g -enable-unsafe-fp-math -enable-no-infs-fp-math -ffast-math -O -release -mcpu=skylake»:
>import core.math : sqrt;//sqrt intrinsic; std.math.sqrt works too
>
>alias simdf = float[8];
>enum simdlength = simdf.length;
>__gshared simdf init = [0f,0f,0f,0f,0f,0f,0f,0f];
>
>extern(C)//with extern(D) (the default), the assembly output uses one register for two pointers.
>void vEUCLIDpsptr_void(simdf* a0, simdf* a1, simdf* a2, simdf* a3, simdf* b1, simdf* b2, simdf* b3) {
>	simdf amm0 = init;//returned simdf
>	simdf amm1 = *a1;
>	simdf amm2 = *a2;
>	simdf amm3 = *a3;
>	static foreach(size_t i; 0..simdlength) {
>		//Needs to be interleaved like this, otherwise LDC generates worse code.
>		amm1[i] -= (*b1)[i];
>		amm1[i] *= amm1[i];
>		amm2[i] -= (*b2)[i];
>		amm2[i] *= amm2[i];
>		amm3[i] -= (*b3)[i];
>		amm3[i] *= amm3[i];
>		amm0[i] += amm1[i];
>		amm0[i] += amm2[i];
>		amm0[i] += amm3[i];
>		amm0[i] = sqrt(amm0[i]);
>	}
>	*a0 = amm0;
>	return;
>}

>mov r10,qword ptr ss:[rsp+38]
>mov r11,qword ptr ss:[rsp+30]
>mov rax,qword ptr ss:[rsp+28]
>vmovups ymm0,yword ptr ds:[rdx]
>vmovups ymm1,yword ptr ds:[r8]
>vsubps ymm0,ymm0,yword ptr ds:[rax]
>vmovups ymm2,yword ptr ds:[r9]
>vfmadd213ps ymm0,ymm0,yword ptr ds:[<_D12euclideandst4initG8f>]
>vsubps ymm1,ymm1,yword ptr ds:[r11]
>vfmadd213ps ymm1,ymm1,ymm0
>vsubps ymm0,ymm2,yword ptr ds:[r10]
>vfmadd213ps ymm0,ymm0,ymm1
>vsqrtps ymm0,ymm0
>vmovups yword ptr ds:[rcx],ymm0
>vzeroupper
>ret

The speed difference is nearly 400% for the same number of distances compared with the single-distance function example.
However, the generated assembly isn't the fastest possible: for example, removing the trailing vzeroupper and using the unused, known-zeroed YMM15 register as the zero operand of the first vfmadd213ps (i.e. vfmadd213ps ymm0,ymm0,ymm15 instead of loading init from memory) improves performance by 10% (70 vs 78 ms for 256 million distances).
The function can then be improved further to use pointer offsets and more registers; this is more efficient and results in roughly a 500% improvement:
>extern(C)
>void vEUCLIDpsptr_void_40(simdf* a0, simdf* a1, simdf* a2, simdf* a3, simdf* b1, simdf* b2, simdf* b3) {
>	simdf amm0 = init;
>	simdf amm1 = *a1;
>	simdf amm2 = *a2;
>	simdf amm3 = *a3;
>	simdf emm0 = init;
>	simdf emm1 = amm1;
>	simdf emm2 = amm2;//mirror AMM for positions
>	simdf emm3 = amm3;
>	simdf imm0 = init;
>	simdf imm1 = emm1;
>	simdf imm2 = emm2;
>	simdf imm3 = emm3;
>	simdf omm0 = init;
>	simdf omm1 = emm1;
>	simdf omm2 = emm2;
>	simdf omm3 = emm3;
>	simdf umm0 = init;
>	simdf umm1 = omm1;
>	simdf umm2 = omm2;
>	simdf umm3 = omm3;
>	//cascading assignment may not be the fastest way, especially compared to just loading from the pointer!
>
>	static foreach(size_t i; 0..simdlength) {
>		amm1[i] -= (b1[0])[i];
>		amm1[i] *= amm1[i];
>		amm0[i] += amm1[i];
>
>		amm2[i] -= (b2[0])[i];
>		amm2[i] *= amm2[i];
>		amm0[i] += amm2[i];
>
>		amm3[i] -= (b3[0])[i];
>		amm3[i] *= amm3[i];
>		amm0[i] += amm3[i];
>
>		amm0[i] = sqrt(amm0[i]);
>		//template
>		emm1[i] -= (b1[1])[i];
>		emm1[i] *= emm1[i];
>		emm0[i] += emm1[i];
>
>		emm2[i] -= (b2[1])[i];
>		emm2[i] *= emm2[i];
>		emm0[i] += emm2[i];
>
>		emm3[i] -= (b3[1])[i];
>		emm3[i] *= emm3[i];
>		emm0[i] += emm3[i];
>
>		emm0[i] = sqrt(emm0[i]);
>		//
>		imm1[i] -= (b1[2])[i];
>		imm1[i] *= imm1[i];
>		imm0[i] += imm1[i];
>
>		imm2[i] -= (b2[2])[i];
>		imm2[i] *= imm2[i];
>		imm0[i] += imm2[i];
>
>		imm3[i] -= (b3[2])[i];
>		imm3[i] *= imm3[i];
>		imm0[i] += imm3[i];
>
>		imm0[i] = sqrt(imm0[i]);
>		//
>		omm1[i] -= (b1[3])[i];
>		omm1[i] *= omm1[i];
>		omm0[i] += omm1[i];
>
>		omm2[i] -= (b2[3])[i];
>		omm2[i] *= omm2[i];
>		omm0[i] += omm2[i];
>
>		omm3[i] -= (b3[3])[i];
>		omm3[i] *= omm3[i];
>		omm0[i] += omm3[i];
>
>		omm0[i] = sqrt(omm0[i]);
>		//
>		umm1[i] -= (b1[4])[i];
>		umm1[i] *= umm1[i];
>		umm0[i] += umm1[i];
>
>		umm2[i] -= (b2[4])[i];
>		umm2[i] *= umm2[i];
>		umm0[i] += umm2[i];
>
>		umm3[i] -= (b3[4])[i];
>		umm3[i] *= umm3[i];
>		umm0[i] += umm3[i];
>
>		umm0[i] = sqrt(umm0[i]);
>
>	}
>
>	//Sadly, writing to the arrays from within the static foreach causes LDC to not use SIMD at all
>	*a0 = amm0;
>	*(a0+1) = emm0;
>	*(a0+2) = imm0;
>	*(a0+3) = omm0;
>	*(a0+4) = umm0;
>	return;
>
>}

>sub rsp,38
>vmovaps xmmword ptr ss:[rsp+20],xmm8
>vmovaps xmmword ptr ss:[rsp+10],xmm7
>vmovaps xmmword ptr ss:[rsp],xmm6
>mov r10,qword ptr ss:[rsp+70]
>mov r11,qword ptr ss:[rsp+68]
>mov rax,qword ptr ss:[rsp+60]
>vmovaps ymm1,yword ptr ds:[<_D12euclideandst4initG8f>]
>vmovups ymm3,yword ptr ds:[rdx]
>vmovups ymm2,yword ptr ds:[r8]
>vmovups ymm0,yword ptr ds:[r9]
>vsubps ymm4,ymm3,yword ptr ds:[rax]
>vfmadd213ps ymm4,ymm4,ymm1
>vsubps ymm5,ymm2,yword ptr ds:[r11]
>vfmadd213ps ymm5,ymm5,ymm4
>vsubps ymm4,ymm0,yword ptr ds:[r10]
>vfmadd213ps ymm4,ymm4,ymm5
>vsqrtps ymm4,ymm4
>vsubps ymm5,ymm3,yword ptr ds:[rax+20]
>vfmadd213ps ymm5,ymm5,ymm1
>vsubps ymm6,ymm2,yword ptr ds:[r11+20]
>vfmadd213ps ymm6,ymm6,ymm5
>vsubps ymm5,ymm0,yword ptr ds:[r10+20]
>vfmadd213ps ymm5,ymm5,ymm6
>vsqrtps ymm5,ymm5
>vsubps ymm6,ymm3,yword ptr ds:[rax+40]
>vsubps ymm7,ymm2,yword ptr ds:[r11+40]
>vfmadd213ps ymm6,ymm6,ymm1
>vfmadd213ps ymm7,ymm7,ymm6
>vsubps ymm6,ymm0,yword ptr ds:[r10+40]
>vfmadd213ps ymm6,ymm6,ymm7
>vsqrtps ymm6,ymm6
>vsubps ymm7,ymm3,yword ptr ds:[rax+60]
>vfmadd213ps ymm7,ymm7,ymm1
>vsubps ymm8,ymm2,yword ptr ds:[r11+60]
>vfmadd213ps ymm8,ymm8,ymm7
>vsubps ymm7,ymm0,yword ptr ds:[r10+60]
>vfmadd213ps ymm7,ymm7,ymm8
>vsqrtps ymm7,ymm7
>vsubps ymm3,ymm3,yword ptr ds:[rax+80]
>vfmadd213ps ymm3,ymm3,ymm1
>vsubps ymm1,ymm2,yword ptr ds:[r11+80]
>vfmadd213ps ymm1,ymm1,ymm3
>vsubps ymm0,ymm0,yword ptr ds:[r10+80]
>vfmadd213ps ymm0,ymm0,ymm1
>vsqrtps ymm0,ymm0
>vmovups yword ptr ds:[rcx],ymm4
>vmovups yword ptr ds:[rcx+20],ymm5
>vmovups yword ptr ds:[rcx+40],ymm6
>vmovups yword ptr ds:[rcx+60],ymm7
>vmovups yword ptr ds:[rcx+80],ymm0
>vmovaps xmm6,xmmword ptr ss:[rsp]
>vmovaps xmm7,xmmword ptr ss:[rsp+10]
>vmovaps xmm8,xmmword ptr ss:[rsp+20]
>add rsp,38
>vzeroupper
>ret

This is more efficient (54 vs 78 ms for the same number of distances, 256 million) but still far from perfect. Also, I do not know what it is doing with the XMM registers: I can safely remove those save/restore instructions with a debugger and nothing crashes or gets corrupted (sometimes it was doing this for 0x100 bytes).
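For anyone wanting to reproduce the timings, a minimal harness along these lines should do (a sketch; the buffer setup and values are illustrative, and it assumes the declarations above are in the same module):

>//Minimal timing sketch (illustrative setup, not the original benchmark).
>import std.datetime.stopwatch : AutoStart, StopWatch;
>import std.stdio : writeln;
>
>void main() {
>	enum size_t distances = 256_000_000;
>	enum size_t perCall = 5 * simdlength;//vEUCLIDpsptr_void_40 computes 40 distances per call
>	simdf[5] a0buf;
>	simdf a1v = 1f, a2v = 2f, a3v = 3f;//block-initializes every lane
>	simdf[5] b1v = 4f, b2v = 5f, b3v = 6f;
>	auto sw = StopWatch(AutoStart.yes);
>	foreach(_; 0 .. distances / perCall)
>		vEUCLIDpsptr_void_40(a0buf.ptr, &a1v, &a2v, &a3v, b1v.ptr, b2v.ptr, b3v.ptr);
>	sw.stop();
>	writeln(a0buf[0][0], " -> ", sw.peek.total!"msecs", " ms");//use a result so the calls aren't elided
>}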

