__restrict, architecture intrinsics vs asm, consoles, and other
a
a at a.com
Wed Sep 21 20:24:49 PDT 2011
How would one do something like this without intrinsics? (The code is C++
using GCC vector extensions.)
template <class V>
struct Fft
{
    typedef typename V::T T;
    typedef typename V::vec vec;
    static const int VecSize = V::Size;
    ...

    template <int Interleaved>
    static NOINLINE void fft_pass_interleaved(
        vec * __restrict pr,
        vec * __restrict pi,
        vec * __restrict pend,
        T * __restrict table)
    {
        for(; pr < pend; pr += 2, pi += 2, table += 2*Interleaved)
        {
            vec tmpr, ti, ur, ui, wr, wi;
            V::template expandComplexArrayToRealImagVec<Interleaved>(table, wr, wi);
            V::template deinterleave<Interleaved>(pr[0], pr[1], ur, tmpr);
            V::template deinterleave<Interleaved>(pi[0], pi[1], ui, ti);
            vec tr = tmpr*wr - ti*wi;
            ti = tmpr*wi + ti*wr;
            V::template interleave<Interleaved>(ur + tr, ur - tr, pr[0], pr[1]);
            V::template interleave<Interleaved>(ui + ti, ui - ti, pi[0], pi[1]);
        }
    }
    ...
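For context, the arithmetic in the loop body (tr = tmpr*wr - ti*wi and so on)
needs no intrinsics at all: GCC's vector extensions define elementwise
operators on any vector type you declare. A minimal sketch (vec4f is a name
made up for illustration):

```cpp
// GCC vector extension: a 16-byte vector of four floats, with
// elementwise +, -, * defined by the compiler on any SIMD target.
typedef float vec4f __attribute__((vector_size(16)));

// Compiles to plain vector multiply/add instructions, no intrinsics.
vec4f madd(vec4f a, vec4f b, vec4f c)
{
    return a * b + c;
}
```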
Here the vector elements need to be shuffled around as they are loaded and
stored. That shuffling is platform-dependent and cannot be expressed through
generic vector operations (or GCC vector extensions), so I abstracted the
platform-dependent functionality into member functions of V, which are
implemented using intrinsics. The assembly generated for SSE single precision
with Interleaved=4 is:
0000000000000000 <_ZN3FftI6SSEVecIfEE20fft_pass_interleavedILi4EEEvPDv4_fS5_S5_Pf>:
0: 48 39 d7 cmp %rdx,%rdi
3: 0f 83 9c 00 00 00 jae a5 <_ZN3FftI6SSEVecIfEE20fft_pass_interleavedILi4EEEvPDv4_fS5_S5_Pf+0xa5>
9: 0f 1f 80 00 00 00 00 nopl 0x0(%rax)
10: 0f 28 19 movaps (%rcx),%xmm3
13: 0f 28 41 10 movaps 0x10(%rcx),%xmm0
17: 48 83 c1 20 add $0x20,%rcx
1b: 0f 28 f3 movaps %xmm3,%xmm6
1e: 0f 28 2f movaps (%rdi),%xmm5
21: 0f c6 d8 dd shufps $0xdd,%xmm0,%xmm3
25: 0f c6 f0 88 shufps $0x88,%xmm0,%xmm6
29: 0f 28 e5 movaps %xmm5,%xmm4
2c: 0f 28 47 10 movaps 0x10(%rdi),%xmm0
30: 0f 28 4e 10 movaps 0x10(%rsi),%xmm1
34: 0f c6 e0 88 shufps $0x88,%xmm0,%xmm4
38: 0f c6 e8 dd shufps $0xdd,%xmm0,%xmm5
3c: 0f 28 06 movaps (%rsi),%xmm0
3f: 0f 28 d0 movaps %xmm0,%xmm2
42: 0f c6 c1 dd shufps $0xdd,%xmm1,%xmm0
46: 0f c6 d1 88 shufps $0x88,%xmm1,%xmm2
4a: 0f 28 cd movaps %xmm5,%xmm1
4d: 0f 28 f8 movaps %xmm0,%xmm7
50: 0f 59 ce mulps %xmm6,%xmm1
53: 0f 59 fb mulps %xmm3,%xmm7
56: 0f 59 c6 mulps %xmm6,%xmm0
59: 0f 59 dd mulps %xmm5,%xmm3
5c: 0f 5c cf subps %xmm7,%xmm1
5f: 0f 58 c3 addps %xmm3,%xmm0
62: 0f 28 dc movaps %xmm4,%xmm3
65: 0f 5c d9 subps %xmm1,%xmm3
68: 0f 58 cc addps %xmm4,%xmm1
6b: 0f 28 e1 movaps %xmm1,%xmm4
6e: 0f 15 cb unpckhps %xmm3,%xmm1
71: 0f 14 e3 unpcklps %xmm3,%xmm4
74: 0f 29 4f 10 movaps %xmm1,0x10(%rdi)
78: 0f 28 ca movaps %xmm2,%xmm1
7b: 0f 29 27 movaps %xmm4,(%rdi)
7e: 0f 5c c8 subps %xmm0,%xmm1
81: 48 83 c7 20 add $0x20,%rdi
85: 0f 58 c2 addps %xmm2,%xmm0
88: 0f 28 d0 movaps %xmm0,%xmm2
8b: 0f 15 c1 unpckhps %xmm1,%xmm0
8e: 0f 14 d1 unpcklps %xmm1,%xmm2
91: 0f 29 46 10 movaps %xmm0,0x10(%rsi)
95: 0f 29 16 movaps %xmm2,(%rsi)
98: 48 83 c6 20 add $0x20,%rsi
9c: 48 39 fa cmp %rdi,%rdx
9f: 0f 87 6b ff ff ff ja 10 <_ZN3FftI6SSEVecIfEE20fft_pass_interleavedILi4EEEvPDv4_fS5_S5_Pf+0x10>
a5: f3 c3 repz retq
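In the listing above, the shufps $0x88/$0xdd pairs are the deinterleave steps
(even/odd element split) and the unpcklps/unpckhps pairs are the interleave
steps. A hypothetical SSE single-precision specialization of V for
Interleaved=4 (names and signatures assumed here, since the actual
implementation is not shown) could look like:

```cpp
#include <xmmintrin.h>

struct SSEVecF
{
    typedef float T;
    typedef __m128 vec;
    static const int Size = 4;

    // Split two vectors of interleaved values into the even-indexed
    // and odd-indexed elements (shufps $0x88 / shufps $0xdd).
    static void deinterleave4(vec a, vec b, vec &even, vec &odd)
    {
        even = _mm_shuffle_ps(a, b, 0x88);  // a0 a2 b0 b2
        odd  = _mm_shuffle_ps(a, b, 0xdd);  // a1 a3 b1 b3
    }

    // Inverse operation (unpcklps / unpckhps).
    static void interleave4(vec even, vec odd, vec &a, vec &b)
    {
        a = _mm_unpacklo_ps(even, odd);     // e0 o0 e1 o1
        b = _mm_unpackhi_ps(even, odd);     // e2 o2 e3 o3
    }
};
```

Because these are intrinsics rather than opaque asm blocks, the compiler
inlines them and register-allocates across them, which is why the loop above
ends up with no spills.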
Would something like that be possible with D inline assembly, or would there
be additional loads and stores for each call of V::interleave,
V::deinterleave and V::expandComplexArrayToRealImagVec?
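To illustrate the concern in C++ terms (using GCC's extended asm rather than
D's inline assembler, purely as a sketch): whether extra loads and stores
appear depends on whether the operands can be bound to registers and whether
the compiler can still inline around the block. With register constraints
there need not be any memory traffic, but the block stays opaque to the
optimizer:

```cpp
#include <xmmintrin.h>

// Intrinsic version: one shufps, fully visible to the optimizer,
// so it can be combined and rescheduled with surrounding shuffles.
static inline __m128 evens_intrin(__m128 a, __m128 b)
{
    return _mm_shuffle_ps(a, b, 0x88);
}

// Extended-asm version with "x" (xmm register) constraints: the
// operands stay in registers, avoiding extra loads/stores, but the
// instruction itself is a black box to the optimizer.
static inline __m128 evens_asm(__m128 a, __m128 b)
{
    __m128 r = a;
    asm("shufps $0x88, %1, %0" : "+x"(r) : "x"(b));
    return r;
}
```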
More information about the Digitalmars-d
mailing list