DConf 2013 Day 3 Talk 5: Effective SIMD for modern architectures by Manu Evans
bearophile
bearophileHUGS at lycos.com
Fri Jul 12 13:28:10 PDT 2013
Manu:
> This is interesting. I didn't know about this.
I have taken a look at this page:
https://github.com/ispc/ispc
There is a free compiler binary for various operating systems:
http://ispc.github.io/downloads.html
I have tried the Windows compiler on some of the given code
examples, and it works! And the resulting asm is excellent. Normal
compilers for the usual languages aren't able to produce asm that
comes anywhere close.
To try the code of the examples I compile like this:
ispc.exe --emit-asm stencil.ispc -o stencil.s
Or even like this to see the AVX2 asm instructions:
ispc.exe --target=avx2 --emit-asm stencil.ispc -o stencil.s
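(ispc can also emit an object file plus a C/C++ header, so the
kernels can be linked into a normal program; if I read the docs
right, the invocation is something like:
ispc.exe --target=sse4 stencil.ispc -o stencil.o -h stencil.h
)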
As an example, it compiles a function like this:
static void
stencil_step(uniform int x0, uniform int x1,
             uniform int y0, uniform int y1,
             uniform int z0, uniform int z1,
             uniform int Nx, uniform int Ny, uniform int Nz,
             uniform const float coef[4], uniform const float vsq[],
             uniform const float Ain[], uniform float Aout[]) {
    const uniform int Nxy = Nx * Ny;
    foreach (z = z0 ... z1, y = y0 ... y1, x = x0 ... x1) {
        int index = (z * Nxy) + (y * Nx) + x;
#define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)]
#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)]
        float div = coef[0] * A_cur(0, 0, 0) +
                    coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) +
                               A_cur(0, +1, 0) + A_cur(0, -1, 0) +
                               A_cur(0, 0, +1) + A_cur(0, 0, -1)) +
                    coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) +
                               A_cur(0, +2, 0) + A_cur(0, -2, 0) +
                               A_cur(0, 0, +2) + A_cur(0, 0, -2)) +
                    coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) +
                               A_cur(0, +3, 0) + A_cur(0, -3, 0) +
                               A_cur(0, 0, +3) + A_cur(0, 0, -3));
        A_next(0, 0, 0) = 2 * A_cur(0, 0, 0) - A_next(0, 0, 0) +
                          vsq[index] * div;
    }
}
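For readers who don't know ispc syntax: "uniform" marks a scalar
value shared by all SIMD lanes, unqualified values are "varying"
(one per lane), and foreach spreads the iterations across the
lanes. A minimal sketch of mine (not from the ispc examples) that
shows just the model:

// scale.ispc -- my own minimal example, not part of the ispc
// distribution. "uniform" values are scalar, shared by all SIMD
// lanes; values without the qualifier are varying, one per lane.
export void scale(uniform float data[], uniform int count,
                  uniform float factor) {
    // foreach maps the iterations onto the SIMD lanes; the compiler
    // emits the vector loads/stores and handles the ragged tail.
    foreach (i = 0 ... count) {
        data[i] *= factor;   // i and data[i] are varying
    }
}

An exported function like this gets a C-callable entry point, so it
can be called from ordinary C/C++ code like a scalar function.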
Into asm like this (using SSE4):
pshufd $0, %xmm7, %xmm9 # xmm9 = xmm7[0,0,0,0]
cmpl 156(%rsp), %r8d # 4-byte Folded Reload
movups (%r14,%rdi), %xmm0
mulps %xmm6, %xmm5
pshufd $0, %xmm2, %xmm2 # xmm2 = xmm2[0,0,0,0]
movq 424(%rsp), %rdx
movups (%rdx,%r15), %xmm6
mulps %xmm3, %xmm2
pshufd $0, %xmm10, %xmm3 # xmm3 = xmm10[0,0,0,0]
mulps %xmm11, %xmm3
addps %xmm2, %xmm3
addps %xmm5, %xmm3
addps %xmm4, %xmm0
movslq %eax, %rax
movups (%r14,%rax), %xmm2
addps %xmm0, %xmm2
movslq %ebp, %rax
movups (%r14,%rax), %xmm0
addps %xmm2, %xmm0
mulps %xmm9, %xmm0
addps %xmm3, %xmm0
mulps %xmm6, %xmm0
addps %xmm1, %xmm0
movups %xmm0, (%r11,%r15)
jl .LBB0_5
.LBB0_6: # %partial_inner_all_outer
# in Loop: Header=BB0_4 Depth=2
movq %r13, %rbp
movl 156(%rsp), %r13d # 4-byte Reload
movq 424(%rsp), %r9
cmpl 64(%rsp), %r8d # 4-byte Folded Reload
movq %r11, %rbx
jge .LBB0_257
# BB#7: # %partial_inner_only
# in Loop: Header=BB0_4 Depth=2
movd %r8d, %xmm0
movl 84(%rsp), %r15d # 4-byte Reload
imull 400(%rsp), %r15d
pshufd $0, %xmm0, %xmm0 # xmm0 = xmm0[0,0,0,0]
paddd .LCPI0_1(%rip), %xmm0
movdqa %xmm8, %xmm1
pcmpgtd %xmm0, %xmm1
movmskps %xmm1, %edi
addl 68(%rsp), %r15d # 4-byte Folded Reload
leal (%r8,%r15), %r10d
Or using AVX2 (this is not exactly the equivalent piece of code;
the asm generated with AVX2 is usually significantly shorter):
movslq %edx, %rdx
vaddps (%rax,%rdx), %ymm5, %ymm5
movl 52(%rsp), %edx # 4-byte Reload
leal (%rdx,%r15), %edx
movslq %edx, %rdx
vaddps (%rax,%rdx), %ymm5, %ymm5
vaddps %ymm11, %ymm6, %ymm9
vaddps %ymm10, %ymm8, %ymm10
vmovups (%rax,%rdi), %ymm12
addl $32, %r15d
vmovups (%r11,%rcx), %ymm6
movq 400(%rsp), %rdx
vbroadcastss 12(%rdx), %ymm7
vbroadcastss 8(%rdx), %ymm8
vbroadcastss (%rdx), %ymm11
vaddps %ymm12, %ymm10, %ymm10
cmpl 48(%rsp), %r14d # 4-byte Folded Reload
vbroadcastss 4(%rdx), %ymm12
vmulps %ymm9, %ymm12, %ymm9
vfmadd213ps %ymm9, %ymm3, %ymm11
vfmadd213ps %ymm11, %ymm8, %ymm10
vfmadd213ps %ymm10, %ymm5, %ymm7
vfmadd213ps %ymm4, %ymm6, %ymm7
vmovups %ymm7, (%r8,%rcx)
jl .LBB0_8
# BB#4: # %partial_inner_all_outer.us
# in Loop: Header=BB0_7 Depth=2
cmpl 40(%rsp), %r14d # 4-byte Folded Reload
movq 400(%rsp), %r10
jge .LBB0_6
# BB#5: # %partial_inner_only.us
# in Loop: Header=BB0_7 Depth=2
vmovd %r14d, %xmm3
vbroadcastss %xmm3, %ymm3
movl 100(%rsp), %ecx # 4-byte Reload
leal (%rcx,%r15), %ecx
vpaddd %ymm2, %ymm3, %ymm3
Sometimes it gives performance warnings:
rt.ispc:257:9: Performance Warning: Scatter required to store value.
image[offset] = ray.maxt;
^^^^^^^^^^^^^
rt.ispc:258:9: Performance Warning: Scatter required to store value.
id[offset] = ray.hitId;
^^^^^^^^^^
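The warning appears because offset there is a varying value: each
SIMD lane may store to an arbitrary address, so (as far as I know)
the compiler has to emit a scatter, which pre-AVX-512 hardware can
only emulate with scalar stores. A contrived sketch of mine showing
the difference between the two kinds of store:

// stores.ispc -- my own contrived example of why the warning appears.
export void stores(uniform float in[], uniform float out[],
                   uniform int perm[], uniform int n) {
    foreach (i = 0 ... n) {
        // Contiguous: consecutive lanes hit consecutive addresses,
        // so this compiles to a plain vector store.
        out[i] = in[i] * 2.0f;
    }
    foreach (i = 0 ... n) {
        // The index is varying, each lane may store to an arbitrary
        // address, so ispc must emit a (slow) scatter.
        out[perm[i]] = in[i] * 2.0f;
    }
}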
A somewhat larger example, with this function from a little
ray-tracer:
static bool TriIntersect(const uniform Triangle &tri, Ray &ray) {
    uniform float3 p0 = { tri.p[0][0], tri.p[0][1], tri.p[0][2] };
    uniform float3 p1 = { tri.p[1][0], tri.p[1][1], tri.p[1][2] };
    uniform float3 p2 = { tri.p[2][0], tri.p[2][1], tri.p[2][2] };
    uniform float3 e1 = p1 - p0;
    uniform float3 e2 = p2 - p0;

    float3 s1 = Cross(ray.dir, e2);
    float divisor = Dot(s1, e1);
    bool hit = true;
    if (divisor == 0.)
        hit = false;
    float invDivisor = 1.f / divisor;

    // Compute first barycentric coordinate
    float3 d = ray.origin - p0;
    float b1 = Dot(d, s1) * invDivisor;
    if (b1 < 0. || b1 > 1.)
        hit = false;

    // Compute second barycentric coordinate
    float3 s2 = Cross(d, e1);
    float b2 = Dot(ray.dir, s2) * invDivisor;
    if (b2 < 0. || b1 + b2 > 1.)
        hit = false;

    // Compute _t_ to intersection point
    float t = Dot(e2, s2) * invDivisor;
    if (t < ray.mint || t > ray.maxt)
        hit = false;

    if (hit) {
        ray.maxt = t;
        ray.hitId = tri.id;
    }
    return hit;
}
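What I find nice is that the function is written in plain scalar
style while it actually traces one ray per SIMD lane: a per-lane
branch like "if (hit)" becomes a mask, not a jump, which is why the
asm below ends with masked stores (vmaskmovps). A tiny sketch of
mine of the same pattern:

// my own example, not from the ispc distribution
export void sqrt_positive(uniform float values[],
                          uniform float results[], uniform int n) {
    foreach (i = 0 ... n) {
        float x = values[i];
        // Divergent branch: some lanes take it, others don't, so
        // ispc compiles the store as a masked/blended vector store.
        if (x > 0.0f)
            results[i] = sqrt(x);
    }
}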
The (more or less) complete asm with AVX2 for that function:
"TriIntersect___REFs[_c_unTriangle]REFs[vyRay]": #
@"TriIntersect___REFs[_c_unTriangle]REFs[vyRay]"
# BB#0: # %allocas
subq $248, %rsp
vmovaps %xmm15, 80(%rsp) # 16-byte Spill
vmovaps %xmm14, 96(%rsp) # 16-byte Spill
vmovaps %xmm13, 112(%rsp) # 16-byte Spill
vmovaps %xmm12, 128(%rsp) # 16-byte Spill
vmovaps %xmm11, 144(%rsp) # 16-byte Spill
vmovaps %xmm10, 160(%rsp) # 16-byte Spill
vmovaps %xmm9, 176(%rsp) # 16-byte Spill
vmovaps %xmm8, 192(%rsp) # 16-byte Spill
vmovaps %xmm7, 208(%rsp) # 16-byte Spill
vmovaps %xmm6, 224(%rsp) # 16-byte Spill
vmovss (%rcx), %xmm0
vmovss 16(%rcx), %xmm2
vinsertps $16, 4(%rcx), %xmm0, %xmm0
vinsertps $32, 8(%rcx), %xmm0, %xmm0
vinsertf128 $1, %xmm0, %ymm0, %ymm10
vmovaps (%rdx), %ymm0
vmovaps 32(%rdx), %ymm3
vmovaps 64(%rdx), %ymm1
vmovaps 96(%rdx), %ymm9
vpbroadcastd .LCPI1_0(%rip), %ymm12
vxorps %ymm4, %ymm4, %ymm4
vpermps %ymm10, %ymm4, %ymm4
vsubps %ymm4, %ymm0, %ymm0
vpermps %ymm10, %ymm12, %ymm4
vsubps %ymm4, %ymm3, %ymm13
vmovups %ymm13, (%rsp) # 32-byte Folded Spill
vpbroadcastd .LCPI1_1(%rip), %ymm8
vpermps %ymm10, %ymm8, %ymm3
vsubps %ymm3, %ymm1, %ymm6
vmovss 32(%rcx), %xmm1
vinsertps $16, 36(%rcx), %xmm1, %xmm1
vinsertps $32, 40(%rcx), %xmm1, %xmm1
vinsertf128 $1, %xmm0, %ymm1, %ymm1
vsubps %ymm10, %ymm1, %ymm15
vbroadcastss %xmm15, %ymm3
vmovups %ymm3, 32(%rsp) # 32-byte Folded Spill
vmovaps 128(%rdx), %ymm11
vpermps %ymm15, %ymm12, %ymm5
vmulps %ymm3, %ymm11, %ymm1
vmovaps %ymm3, %ymm4
vmovaps %ymm5, %ymm7
vfmsub213ps %ymm1, %ymm9, %ymm7
vinsertps $16, 20(%rcx), %xmm2, %xmm1
vinsertps $32, 24(%rcx), %xmm1, %xmm1
vinsertf128 $1, %xmm0, %ymm1, %ymm1
vsubps %ymm10, %ymm1, %ymm1
vpermps %ymm1, %ymm12, %ymm14
vpermps %ymm1, %ymm8, %ymm12
vmulps %ymm6, %ymm14, %ymm3
vmovaps %ymm13, %ymm2
vfmsub213ps %ymm3, %ymm12, %ymm2
vbroadcastss %xmm1, %ymm1
vmulps %ymm0, %ymm12, %ymm3
vmovaps %ymm6, %ymm13
vfmsub213ps %ymm3, %ymm1, %ymm13
vmulps %ymm13, %ymm11, %ymm3
vmovaps %ymm2, %ymm10
vfmadd213ps %ymm3, %ymm9, %ymm10
vpermps %ymm15, %ymm8, %ymm8
vmulps %ymm8, %ymm9, %ymm3
vmovaps 160(%rdx), %ymm9
vmulps %ymm5, %ymm9, %ymm15
vfmsub213ps %ymm15, %ymm8, %ymm11
vmovaps %ymm4, %ymm15
vfmsub213ps %ymm3, %ymm9, %ymm15
vmulps %ymm15, %ymm14, %ymm4
vmovaps %ymm11, %ymm3
vfmadd213ps %ymm4, %ymm1, %ymm3
vfmadd213ps %ymm3, %ymm7, %ymm12
vmovups (%rsp), %ymm4 # 32-byte Folded Reload
vmulps %ymm4, %ymm15, %ymm3
vfmadd213ps %ymm3, %ymm0, %ymm11
vfmadd213ps %ymm11, %ymm6, %ymm7
vmulps %ymm4, %ymm1, %ymm1
vfmsub213ps %ymm1, %ymm14, %ymm0
vbroadcastss .LCPI1_2(%rip), %ymm1
vrcpps %ymm12, %ymm3
vmovaps %ymm12, %ymm4
vfnmadd213ps %ymm1, %ymm3, %ymm4
vmulps %ymm4, %ymm3, %ymm4
vxorps %ymm14, %ymm14, %ymm14
vcmpeqps %ymm14, %ymm12, %ymm1
vcmpunordps %ymm14, %ymm12, %ymm3
vorps %ymm1, %ymm3, %ymm11
vmulps %ymm13, %ymm5, %ymm1
vmulps %ymm4, %ymm7, %ymm5
vmovups 32(%rsp), %ymm3 # 32-byte Folded Reload
vfmadd213ps %ymm1, %ymm3, %ymm2
vbroadcastss .LCPI1_3(%rip), %ymm1
vcmpnleps %ymm1, %ymm5, %ymm3
vcmpnleps %ymm5, %ymm14, %ymm6
vorps %ymm3, %ymm6, %ymm6
vbroadcastss .LCPI1_0(%rip), %ymm3
vblendvps %ymm11, %ymm14, %ymm3, %ymm7
vfmadd213ps %ymm10, %ymm0, %ymm9
vmovaps (%r8), %ymm3
vmovmskps %ymm3, %eax
leaq 352(%rdx), %r8
cmpl $255, %eax
vmulps %ymm9, %ymm4, %ymm9
vaddps %ymm9, %ymm5, %ymm10
vmovups 320(%rdx), %ymm5
vfmadd213ps %ymm2, %ymm8, %ymm0
vblendvps %ymm6, %ymm14, %ymm7, %ymm6
vpcmpeqd %ymm2, %ymm2, %ymm2
vcmpnleps %ymm1, %ymm10, %ymm1
vcmpnleps %ymm9, %ymm14, %ymm7
vorps %ymm1, %ymm7, %ymm1
vblendvps %ymm1, %ymm14, %ymm6, %ymm6
vmulps %ymm0, %ymm4, %ymm1
vcmpnleps 352(%rdx), %ymm1, %ymm0
vcmpnleps %ymm1, %ymm5, %ymm4
vorps %ymm0, %ymm4, %ymm0
vblendvps %ymm0, %ymm14, %ymm6, %ymm0
vpcmpeqd %ymm14, %ymm0, %ymm4
vpxor %ymm2, %ymm4, %ymm2
je .LBB1_1
# BB#4: # %some_on
vpand %ymm3, %ymm2, %ymm2
.LBB1_1: # %all_on
vmovmskps %ymm2, %eax
testl %eax, %eax
je .LBB1_3
# BB#2: # %safe_if_run_true356
vmaskmovps %ymm1, %ymm2, (%r8)
vpbroadcastd 48(%rcx), %ymm1
vmaskmovps %ymm1, %ymm2, 384(%rdx)
.LBB1_3: # %safe_if_after_true
vmovaps 224(%rsp), %xmm6 # 16-byte Reload
vmovaps 208(%rsp), %xmm7 # 16-byte Reload
vmovaps 192(%rsp), %xmm8 # 16-byte Reload
vmovaps 176(%rsp), %xmm9 # 16-byte Reload
vmovaps 160(%rsp), %xmm10 # 16-byte Reload
vmovaps 144(%rsp), %xmm11 # 16-byte Reload
vmovaps 128(%rsp), %xmm12 # 16-byte Reload
vmovaps 112(%rsp), %xmm13 # 16-byte Reload
vmovaps 96(%rsp), %xmm14 # 16-byte Reload
vmovaps 80(%rsp), %xmm15 # 16-byte Reload
addq $248, %rsp
ret
Using SSE4, the asm for that function starts like this:
"TriIntersect___REFs[_c_unTriangle]REFs[vyRay]": #
@"TriIntersect___REFs[_c_unTriangle]REFs[vyRay]"
# BB#0: # %allocas
subq $248, %rsp
movaps %xmm15, 80(%rsp) # 16-byte Spill
movaps %xmm14, 96(%rsp) # 16-byte Spill
movaps %xmm13, 112(%rsp) # 16-byte Spill
movaps %xmm12, 128(%rsp) # 16-byte Spill
movaps %xmm11, 144(%rsp) # 16-byte Spill
movaps %xmm10, 160(%rsp) # 16-byte Spill
movaps %xmm9, 176(%rsp) # 16-byte Spill
movaps %xmm8, 192(%rsp) # 16-byte Spill
movaps %xmm7, 208(%rsp) # 16-byte Spill
movaps %xmm6, 224(%rsp) # 16-byte Spill
movss (%rcx), %xmm1
movss 16(%rcx), %xmm0
insertps $16, 4(%rcx), %xmm1
insertps $32, 8(%rcx), %xmm1
movss 32(%rcx), %xmm7
insertps $16, 36(%rcx), %xmm7
insertps $32, 40(%rcx), %xmm7
subps %xmm1, %xmm7
pshufd $-86, %xmm1, %xmm2 # xmm2 = xmm1[2,2,2,2]
pshufd $85, %xmm1, %xmm3 # xmm3 = xmm1[1,1,1,1]
movaps (%rdx), %xmm4
movaps 16(%rdx), %xmm12
movaps 32(%rdx), %xmm5
movaps 48(%rdx), %xmm11
subps %xmm3, %xmm12
subps %xmm2, %xmm5
movaps %xmm5, 32(%rsp) # 16-byte Spill
pshufd $0, %xmm1, %xmm2 # xmm2 = xmm1[0,0,0,0]
subps %xmm2, %xmm4
pshufd $85, %xmm7, %xmm3 # xmm3 = xmm7[1,1,1,1]
movdqa %xmm3, 48(%rsp) # 16-byte Spill
movaps %xmm11, %xmm2
mulps %xmm3, %xmm2
movaps %xmm3, %xmm6
pshufd $-86, %xmm7, %xmm3 # xmm3 = xmm7[2,2,2,2]
movdqa %xmm3, 64(%rsp) # 16-byte Spill
movaps %xmm11, %xmm9
mulps %xmm3, %xmm9
movaps %xmm3, %xmm10
insertps $16, 20(%rcx), %xmm0
insertps $32, 24(%rcx), %xmm0
subps %xmm1, %xmm0
pshufd $85, %xmm0, %xmm3 # xmm3 = xmm0[1,1,1,1]
movdqa %xmm3, (%rsp) # 16-byte Spill
movdqa %xmm3, %xmm1
movdqa %xmm3, %xmm14
mulps %xmm5, %xmm1
pshufd $-86, %xmm0, %xmm8 # xmm8 = xmm0[2,2,2,2]
...
Even the LDC2 compiler doesn't get anywhere close to such
good/efficient usage of SIMD instructions. And ispc is also able to
spread the work across multiple cores (see the sketch below).
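The multi-core part uses ispc's task/launch keywords together with
the built-in taskIndex/taskCount values; the application supplies
the task system (e.g. the tasksys.cpp shipped with the examples).
A hypothetical sketch of mine of how that looks:

// tasks.ispc -- my own sketch of ispc multi-core support.
task void scale_chunk(uniform float data[], uniform int count,
                      uniform float factor) {
    // taskIndex/taskCount are built-in; each task takes one slice.
    uniform int chunk = count / taskCount;
    uniform int begin = taskIndex * chunk;
    uniform int end = (taskIndex == taskCount - 1) ? count
                                                   : begin + chunk;
    foreach (i = begin ... end)
        data[i] *= factor;
}

export void scale_parallel(uniform float data[], uniform int count,
                           uniform float factor) {
    launch[8] scale_chunk(data, count, factor);  // spawn 8 tasks
    // ispc syncs outstanding tasks when the function returns
}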
I think D is meant to be used for similar numerical code too, so
perhaps the small set of ideas contained in this very C-like
language is worth stealing and adding to D.
Bye,
bearophile