Does dmd have SSE intrinsics?

Tue Sep 22 11:17:37 PDT 2009

Jeremie Pelletier:

> The D memory manager already aligns data on 16 bytes boundaries. The 
> only case I can think of right now is when data is in a struct or class:

LDC doesn't align ordinary fixed-size arrays inside functions to 16 bytes:
A small test program:

// Minimal test case: three fixed-size float[4] locals combined with a
// vector (array-wise) operation, used to inspect how the compiler places
// and aligns such arrays on the stack.
void main() {
    float[4] a = [1.0f, 2.0, 3.0, 4.0];  // initialized from an array literal
    float[4] b, c;        // no explicit initializer: left to the default initialization
    b[] = 10.0f;          // slice assignment: every element of b becomes 10
    c[] = a[] + b[];      // element-wise array operation: c[i] = a[i] + b[i]
}

The LL code (LLVM's assembly language) that LDC produces; this is just the head:
ldc -O3 -inline -release -output-ll vect1.d

define x86_stdcallcc i32 @_Dmain(%"char[][]" %unnamed) {
entry:
  %a = alloca [4 x float], align 4                ; <[4 x float]*> [#uses=5]
  %b = alloca [4 x float], align 4                ; <[4 x float]*> [#uses=4]
  %c = alloca [4 x float], align 4                ; <[4 x float]*> [#uses=4]
  %.gc_mem = call noalias i8* @_d_newarrayvT(%object.TypeInfo* @_D11TypeInfo_Af6__initZ, i32 4) ; <i8*> [#uses=5]
[...]

The asm it produces for the whole main (the call to the array op is inlined, while _d_array_init_float is not inlined, I don't know why):
ldc -O3 -inline -release -output-s vect1.d

# _Dmain -- LDC -O3 output for the test program's main (32-bit x86, AT&T syntax).
# Frame is 64 bytes below the saved %esi. From the offsets used below:
#    0..15(%esp)  outgoing call arguments
#   16..31(%esp)  c
#   32..47(%esp)  b
#   48..63(%esp)  a
# %esp is never realigned in this function, so the three arrays are only as
# aligned as the incoming stack pointer happens to be.
_Dmain:
	# Prologue: %esi is preserved because it is used below to keep &b live
	# across calls.
	pushl	%esi
	subl	$64, %esp
	# _d_newarrayvT(TypeInfo for float[], 4): runtime allocation backing the
	# array literal; the result pointer comes back in %eax.
	movl	$4, 4(%esp)
	movl	$_D11TypeInfo_Af6__initZ, (%esp)
	call	_d_newarrayvT
	# Fill the allocated literal with the IEEE-754 single bit patterns:
	# 1065353216 = 0x3F800000 = 1.0f, 1073741824 = 0x40000000 = 2.0f,
	# 1077936128 = 0x40400000 = 3.0f, 1082130432 = 0x40800000 = 4.0f.
	movl	$1065353216, (%eax)
	movl	$1073741824, 4(%eax)
	movl	$1077936128, 8(%eax)
	movl	$1082130432, 12(%eax)
	# Copy the literal into a on the stack: a[2], a[1], a[0] are reloaded from
	# the allocation, while a[3] is stored directly as the immediate 4.0f.
	movl	8(%eax), %ecx
	movl	%ecx, 56(%esp)
	movl	4(%eax), %ecx
	movl	%ecx, 52(%esp)
	movl	(%eax), %eax
	movl	%eax, 48(%esp)
	movl	$1082130432, 60(%esp)
	# _d_array_init_float(&b, 4, NaN): 2143289344 = 0x7FC00000 is a quiet NaN.
	# %esi = &b is kept for the second fill of b further down.
	leal	32(%esp), %esi
	movl	%esi, (%esp)
	movl	$2143289344, 8(%esp)
	movl	$4, 4(%esp)
	call	_d_array_init_float
	# _d_array_init_float(&c, 4, NaN)
	leal	16(%esp), %eax
	movl	%eax, (%esp)
	movl	$2143289344, 8(%esp)
	movl	$4, 4(%esp)
	call	_d_array_init_float
	# b[] = 10.0f: _d_array_init_float(&b, 4, 10.0f); 1092616192 = 0x41200000.
	# This overwrites the NaN fill of b done two calls earlier.
	movl	%esi, (%esp)
	movl	$1092616192, 8(%esp)
	movl	$4, 4(%esp)
	call	_d_array_init_float
	# c[] = a[] + b[], fully unrolled: four scalar single-precision adds
	# (movss/addss), one element at a time, rather than one packed addps.
	movss	48(%esp), %xmm0
	addss	32(%esp), %xmm0
	movss	%xmm0, 16(%esp)
	movss	52(%esp), %xmm0
	addss	36(%esp), %xmm0
	movss	%xmm0, 20(%esp)
	movss	56(%esp), %xmm0
	addss	40(%esp), %xmm0
	movss	%xmm0, 24(%esp)
	movss	60(%esp), %xmm0
	addss	44(%esp), %xmm0
	movss	%xmm0, 28(%esp)
	# Return 0; "ret $8" also pops 8 bytes of caller-pushed arguments
	# (callee cleanup).
	xorl	%eax, %eax
	addl	$64, %esp
	popl	%esi
	ret	$8

By the way, using Link-Time Optimization and internalization, LDC produces this LL (whole main):

; _Dmain after link-time optimization. Compared with the D source, no alloca
; for a remains and there are no floating-point adds: the element-wise sum
; c[] = a[] + b[] does not appear anywhere in this function. The runtime
; allocation for the array literal and the three array-fill calls are still
; present.
define x86_stdcallcc i32 @_Dmain(%"char[][]" %unnamed) {
entry:
  ; b and c are still stack arrays with only 4-byte alignment.
  %b = alloca [4 x float], align 4                ; <[4 x float]*> [#uses=1]
  %c = alloca [4 x float], align 4                ; <[4 x float]*> [#uses=1]
  ; Runtime allocation for the 4-element float array literal.
  %.gc_mem = call noalias i8* @_d_newarrayvT(%object.TypeInfo* @_D11TypeInfo_Af6__initZ, i32 4) ; <i8*> [#uses=4]
  ; Store 1.0, 2.0, 3.0, 4.0 at byte offsets 0, 4, 8 and 12 of the allocation.
  %.gc_mem1 = bitcast i8* %.gc_mem to float*      ; <float*> [#uses=1]
  store float 1.000000e+00, float* %.gc_mem1
  %tmp3 = getelementptr i8* %.gc_mem, i32 4       ; <i8*> [#uses=1]
  %0 = bitcast i8* %tmp3 to float*                ; <float*> [#uses=1]
  store float 2.000000e+00, float* %0
  %tmp4 = getelementptr i8* %.gc_mem, i32 8       ; <i8*> [#uses=1]
  %1 = bitcast i8* %tmp4 to float*                ; <float*> [#uses=1]
  store float 3.000000e+00, float* %1
  %tmp5 = getelementptr i8* %.gc_mem, i32 12      ; <i8*> [#uses=1]
  %2 = bitcast i8* %tmp5 to float*                ; <float*> [#uses=1]
  store float 4.000000e+00, float* %2
  ; Fill b with NaN (0x7FF8000000000000 is LLVM's hexadecimal spelling of a
  ; quiet NaN), fill c with NaN, then fill b again with 10.0 for b[] = 10.0f.
  %tmp8 = getelementptr [4 x float]* %b, i32 0, i32 0 ; <float*> [#uses=2]
  call void @_d_array_init_float(float* nocapture %tmp8, i32 4, float 0x7FF8000000000000)
  %tmp9 = getelementptr [4 x float]* %c, i32 0, i32 0 ; <float*> [#uses=1]
  call void @_d_array_init_float(float* nocapture %tmp9, i32 4, float 0x7FF8000000000000)
  call void @_d_array_init_float(float* nocapture %tmp8, i32 4, float 1.000000e+01)
  ret i32 0
}

Bye,
bearophile