Optimizing for SIMD: best practices? (i.e., what features are allowed?)
z
z at z.com
Sun Mar 7 13:26:37 UTC 2021
On Thursday, 25 February 2021 at 11:28:14 UTC, z wrote:
> ...
It seems that using static foreach with pointer parameters
exclusively is the best way to "guide" LDC into optimizing
code. (Using arr1[] += arr2[] syntax resulted in worse
performance for me.)
However, AVX512 support seems limited to being able to use the 16
other YMM registers, rather than using the same code template but
changed to use ZMM registers and double the offsets to take
advantage of the new size.
Compiled with «-g -enable-unsafe-fp-math -enable-no-infs-fp-math
-ffast-math -O -release -mcpu=skylake» :
>__gshared simdf init = [0f,0f,0f,0f,0f,0f,0f,0f];
>alias simdf = float[8]
>extern(C)//with extern(D)(the default), the assembly output uses
>one register for two pointers.
>void vEUCLIDpsptr_void(simdf* a0, simdf* a1, simdf* a2, simdf*
>a3, simdf* b1, simdf* b2, simdf* b3) {
> simdf amm0 = init;//returned simdf
> simdf amm1 = *a1;
> simdf amm2 = *a2;
> simdf amm3 = *a3;
> static foreach(size_t i; 0..simdlength) {
> //Needs to be interleaved like this, otherwise LDC generates
>worse code.
> amm1[i] -= (*b1)[i];
> amm1[i] *= amm1[i];
> amm2[i] -= (*b2)[i];
> amm2[i] *= amm2[i];
> amm3[i] -= (*b3)[i];
> amm3[i] *= amm3[i];
> amm0[i] += amm1[i];
> amm0[i] += amm2[i];
> amm0[i] += amm3[i];
> amm0[i] = sqrt(amm0[i]);
> }
> *a0 = amm0;
> return;
>}
>mov r10,qword ptr ss:[rsp+38]
>mov r11,qword ptr ss:[rsp+30]
>mov rax,qword ptr ss:[rsp+28]
>vmovups ymm0,yword ptr ds:[rdx]
>vmovups ymm1,yword ptr ds:[r8]
>vsubps ymm0,ymm0,yword ptr ds:[rax]
>vmovups ymm2,yword ptr ds:[r9]
>vfmadd213ps ymm0,ymm0,yword ptr ds:[<_D12euclideandst4initG8f>]
>vsubps ymm1,ymm1,yword ptr ds:[r11]
>vfmadd213ps ymm1,ymm1,ymm0
>vsubps ymm0,ymm2,yword ptr ds:[r10]
>vfmadd213ps ymm0,ymm0,ymm1
>vsqrtps ymm0,ymm0
>vmovups yword ptr ds:[rcx],ymm0
>vzeroupper
>ret
The speed difference is near 400% for the same amount of
distances compared with the single-distance function example.
However, the assembly generated isn't the fastest; for example,
removing vzeroupper and using the unused and known-zeroed YMM15
register as a zero-filled register operand for the first
vfmadd213ps instruction improves performance by 10% (70 vs 78 ms
for 256 million distances).
The function can then be improved further to use pointer offsets
and more registers; this is more efficient and results in a ~500%
improvement:
>extern(C)
>void vEUCLIDpsptr_void_40(simdf* a0, simdf* a1, simdf* a2,
>simdf* a3, simdf* b1, simdf* b2, simdf* b3) {
> simdf amm0 = init;
> simdf amm1 = *a1;
> simdf amm2 = *a2;
> simdf amm3 = *a3;
> simdf emm0 = init;
> simdf emm1 = amm1;
> simdf emm2 = amm2;//mirror AMM for positions
> simdf emm3 = amm3;
> simdf imm0 = init;
> simdf imm1 = emm1;
> simdf imm2 = emm2;
> simdf imm3 = emm3;
> simdf omm0 = init;
> simdf omm1 = emm1;
> simdf omm2 = emm2;
> simdf omm3 = emm3;
> simdf umm0 = init;
> simdf umm1 = omm1;
> simdf umm2 = omm2;
> simdf umm3 = omm3;
> //cascading assignment may not be the fastest way, especially
>compared to just loading from the pointer!
>
> static foreach(size_t i; 0..simdlength) {
> amm1[i] -= (b1[0])[i];
> amm1[i] *= amm1[i];
> amm0[i] += amm1[i];
>
> amm2[i] -= (b2[0])[i];
> amm2[i] *= amm2[i];
> amm0[i] += amm2[i];
>
> amm3[i] -= (b3[0])[i];
> amm3[i] *= amm3[i];
> amm0[i] += amm3[i];
>
> amm0[i] = sqrt(amm0[i]);
> //template
> emm1[i] -= (b1[1])[i];
> emm1[i] *= emm1[i];
> emm0[i] += emm1[i];
>
> emm2[i] -= (b2[1])[i];
> emm2[i] *= emm2[i];
> emm0[i] += emm2[i];
>
> emm3[i] -= (b3[1])[i];
> emm3[i] *= emm3[i];
> emm0[i] += emm3[i];
>
> emm0[i] = sqrt(emm0[i]);
> //
> imm1[i] -= (b1[2])[i];
> imm1[i] *= imm1[i];
> imm0[i] += imm1[i];
>
> imm2[i] -= (b2[2])[i];
> imm2[i] *= imm2[i];
> imm0[i] += imm2[i];
>
> imm3[i] -= (b3[2])[i];
> imm3[i] *= imm3[i];
> imm0[i] += imm3[i];
>
> imm0[i] = sqrt(imm0[i]);
> //
> omm1[i] -= (b1[3])[i];
> omm1[i] *= omm1[i];
> omm0[i] += omm1[i];
>
> omm2[i] -= (b2[3])[i];
> omm2[i] *= omm2[i];
> omm0[i] += omm2[i];
>
> omm3[i] -= (b3[3])[i];
> omm3[i] *= omm3[i];
> omm0[i] += omm3[i];
>
> omm0[i] = sqrt(omm0[i]);
> //
> umm1[i] -= (b1[4])[i];
> umm1[i] *= umm1[i];
> umm0[i] += umm1[i];
>
> umm2[i] -= (b2[4])[i];
> umm2[i] *= umm2[i];
> umm0[i] += umm2[i];
>
> umm3[i] -= (b3[4])[i];
> umm3[i] *= umm3[i];
> umm0[i] += umm3[i];
>
> umm0[i] = sqrt(umm0[i]);
>
> }
>
> //Sadly, writing to the arrays from within the static foreach
>causes LDC to not use SIMD at all
> *a0 = amm0;
> *(a0+1) = emm0;
> *(a0+2) = imm0;
> *(a0+3) = omm0;
> *(a0+4) = umm0;
> return;
>
>}
>sub rsp,38
>vmovaps xmmword ptr ss:[rsp+20],xmm8
>vmovaps xmmword ptr ss:[rsp+10],xmm7
>vmovaps xmmword ptr ss:[rsp],xmm6
>mov r10,qword ptr ss:[rsp+70]
>mov r11,qword ptr ss:[rsp+68]
>mov rax,qword ptr ss:[rsp+60]
>vmovaps ymm1,yword ptr ds:[<_D12euclideandst4initG8f>]
>vmovups ymm3,yword ptr ds:[rdx]
>vmovups ymm2,yword ptr ds:[r8]
>vmovups ymm0,yword ptr ds:[r9]
>vsubps ymm4,ymm3,yword ptr ds:[rax]
>vfmadd213ps ymm4,ymm4,ymm1
>vsubps ymm5,ymm2,yword ptr ds:[r11]
>vfmadd213ps ymm5,ymm5,ymm4
>vsubps ymm4,ymm0,yword ptr ds:[r10]
>vfmadd213ps ymm4,ymm4,ymm5
>vsqrtps ymm4,ymm4
>vsubps ymm5,ymm3,yword ptr ds:[rax+20]
>vfmadd213ps ymm5,ymm5,ymm1
>vsubps ymm6,ymm2,yword ptr ds:[r11+20]
>vfmadd213ps ymm6,ymm6,ymm5
>vsubps ymm5,ymm0,yword ptr ds:[r10+20]
>vfmadd213ps ymm5,ymm5,ymm6
>vsqrtps ymm5,ymm5
>vsubps ymm6,ymm3,yword ptr ds:[rax+40]
>vsubps ymm7,ymm2,yword ptr ds:[r11+40]
>vfmadd213ps ymm6,ymm6,ymm1
>vfmadd213ps ymm7,ymm7,ymm6
>vsubps ymm6,ymm0,yword ptr ds:[r10+40]
>vfmadd213ps ymm6,ymm6,ymm7
>vsqrtps ymm6,ymm6
>vsubps ymm7,ymm3,yword ptr ds:[rax+60]
>vfmadd213ps ymm7,ymm7,ymm1
>vsubps ymm8,ymm2,yword ptr ds:[r11+60]
>vfmadd213ps ymm8,ymm8,ymm7
>vsubps ymm7,ymm0,yword ptr ds:[r10+60]
>vfmadd213ps ymm7,ymm7,ymm8
>vsqrtps ymm7,ymm7
>vsubps ymm3,ymm3,yword ptr ds:[rax+80]
>vfmadd213ps ymm3,ymm3,ymm1
>vsubps ymm1,ymm2,yword ptr ds:[r11+80]
>vfmadd213ps ymm1,ymm1,ymm3
>vsubps ymm0,ymm0,yword ptr ds:[r10+80]
>vfmadd213ps ymm0,ymm0,ymm1
>vsqrtps ymm0,ymm0
>vmovups yword ptr ds:[rcx],ymm4
>vmovups yword ptr ds:[rcx+20],ymm5
>vmovups yword ptr ds:[rcx+40],ymm6
>vmovups yword ptr ds:[rcx+60],ymm7
>vmovups yword ptr ds:[rcx+80],ymm0
>vmovaps xmm6,xmmword ptr ss:[rsp]
>vmovaps xmm7,xmmword ptr ss:[rsp+10]
>vmovaps xmm8,xmmword ptr ss:[rsp+20]
>add rsp,38
>vzeroupper
>ret
This is slightly more efficient (54 vs 78 ms for the same amount
of distances, 256 million), but still far from perfect. Also, I do
not know what it is doing with the XMM registers, knowing I can
safely remove the instructions with a debugger and it doesn't
crash or corrupt data. (Sometimes it was doing this for 0x100
bytes.)
More information about the Digitalmars-d-learn
mailing list