Inline Assembler rox

Sun Jan 22 02:38:34 PST 2012

Porting some code from C to D, I found the inline assembler very
convenient. This is the C code (using an external NASM file):

	// dot_product returns the dot product t*w of n elements.  n is rounded
	// up to a multiple of 8, so both arrays must hold at least that many
	// elements.  Each pair of adjacent products is added and the pair sum
	// is scaled down by 8 bits before it is accumulated.
	#ifdef NOASM  // no assembly language
	int dot_product(short *t, short *w, int n) {
	  int sum=0;
	  n=(n+7)&-8;  // round up to a multiple of 8
	  for (int i=0; i<n; i+=2) {
	    // add two products, then scale the pair sum down by 8 bits
	    sum+=(t[i]*w[i]+t[i+1]*w[i+1]) >> 8;
	  }
	  return sum;
	}
	#else  // The NASM version uses MMX and is about 8 times faster.
	extern "C" int dot_product(short *t, short *w, int n);  // in NASM
	#endif

In D, I can move the ASM inside the function, so there is no need for two  
declarations:

	/**
	 * Dot product of two arrays of 16-bit integers, scaled down by 8 bits.
	 *
	 * n is rounded up to a multiple of 8 (as in the C version), so both
	 * arrays must hold at least that many elements.  Every pair of adjacent
	 * products is added, shifted right arithmetically by 8 bits and
	 * accumulated.  A rounded count <= 0 yields 0.
	 *
	 * The inline-asm path accesses memory with MOVDQA / PMADDWD and therefore
	 * needs t and w to be 16-byte aligned.  It reads its arguments from
	 * RDI, RSI and EDX (System V AMD64 register assignment), so it is not
	 * used on Win64, where extern(C) arguments arrive in other registers.
	 *
	 * Params:
	 *   t = first array  (asm path: RDI)
	 *   w = second array (asm path: RSI)
	 *   n = element count, rounded up to a multiple of 8 (asm path: EDX)
	 * Returns: the scaled sum (asm path: EAX).
	 */
	extern (C) int dot_product(short *t, short *w, const int n) {
	    // D has no `version (A && !B)`, so fold the decision into a constant.
	    version (D_InlineAsm_X86_64) {
	        version (Win64) enum bool useSysVAsm = false;
	        else            enum bool useSysVAsm = true;
	    } else {
	        enum bool useSysVAsm = false;
	    }

	    static if (useSysVAsm) asm {
	        naked;
	        xor EAX, EAX;                // result for the early exit is 0
	        mov ECX, EDX;                // n is 32-bit: upper half of RDX is unspecified,
	                                     // writing ECX zero-extends into RCX
	        add ECX, 7;
	        and ECX, -8;                 // round n up to a multiple of 8 (sets ZF/SF, OF=0)
	        jle done;                    // rounded n <= 0: nothing to sum
	        sub RDI, 16;                 // bias both pointers by one 16-byte block so
	        sub RSI, 16;                 // that index RCX*2 addresses the last block
	        pxor XMM0, XMM0;             // four 32-bit partial sums = 0
	    next_block:                      // each pass handles 8 elements, back to front;
	                                     // RCX = elements still to do, multiple of 8, > 0
	        movdqa XMM1, [RDI+RCX*2];    // load 8 shorts of t
	        pmaddwd XMM1, [RSI+RCX*2];   // 4 x (t[i]*w[i] + t[i+1]*w[i+1])
	        psrad XMM1, 8;               // scale each pair sum down by 8 bits
	        paddd XMM0, XMM1;
	        sub RCX, 8;
	        ja next_block;               // continue while RCX is still above 0
	        movdqa XMM1, XMM0;           // add the 4 lanes of XMM0 together
	        psrldq XMM1, 8;
	        paddd XMM0, XMM1;
	        movdqa XMM1, XMM0;
	        psrldq XMM1, 4;
	        paddd XMM0, XMM1;
	        movd EAX, XMM0;              // low lane holds the total
	    done:
	        ret;
	    } else {
	        // Portable path: same rounding and same per-pair scaling as the asm.
	        immutable int count = (n + 7) & -8;
	        int sum = 0;
	        for (int i = 0; i < count; i += 2) {
	            sum += (t[i]*w[i] + t[i+1]*w[i+1]) >> 8;
	        }
	        return sum;
	    }
	}

This example also shows why 'naked' should probably not be applied to the
function declaration: the function contains non-asm code as well. (It could
be "naked asm", though.) For compatibility with GDC (and, in fact, with the
original NASM code), I used extern(C) here as the parameter-passing
strategy.
This may also serve as a practical use case for vector operations.