Does dmd have SSE intrinsics?
bearophile
bearophileHUGS at lycos.com
Tue Sep 22 11:17:37 PDT 2009
Jeremie Pelletier:
> The D memory manager already aligns data on 16 bytes boundaries. The
> only case I can think of right now is when data is in a struct or class:
LDC doesn't align ordinary fixed-size arrays declared inside functions to 16 bytes:
A small test program:
// Minimal test case: element-wise addition of two fixed-size float arrays
// that are locals of a function.
void main() {
// a is initialised from an array literal.
float[4] a = [1.0f, 2.0, 3.0, 4.0];
// b and c are declared without an explicit initialiser.
float[4] b, c;
// Slice assignment: set every element of b to 10.0f.
b[] = 10.0f;
// Array operation: c[i] = a[i] + b[i] for each element.
c[] = a[] + b[];
}
The LL code (LLVM's assembly-like intermediate representation) that LDC produces; this is just the head:
ldc -O3 -inline -release -output-ll vect1.d
define x86_stdcallcc i32 @_Dmain(%"char[][]" %unnamed) {
entry:
%a = alloca [4 x float], align 4 ; <[4 x float]*> [#uses=5]
%b = alloca [4 x float], align 4 ; <[4 x float]*> [#uses=4]
%c = alloca [4 x float], align 4 ; <[4 x float]*> [#uses=4]
%.gc_mem = call noalias i8* @_d_newarrayvT(%object.TypeInfo* @_D11TypeInfo_Af6__initZ, i32 4) ; <i8*> [#uses=5]
[...]
The asm it produces for the whole main (the array operation is inlined, while _d_array_init_float is not; I don't know why):
ldc -O3 -inline -release -output-s vect1.d
# _Dmain -- 32-bit x86, AT&T syntax (op src, dst).
# Stack frame after the prologue, as used by the code below:
#    0..8(%esp)   outgoing call arguments
#   16..28(%esp)  results of the four adds (c)
#   32..44(%esp)  array filled with 10.0f (b)
#   48..60(%esp)  copy of the literal 1,2,3,4 (a)
# %esi holds the address of b across the calls.
# Returns 0 in %eax; `ret $8` also releases 8 bytes of caller-pushed arguments.
_Dmain:
# Prologue: save %esi (used below to keep &b) and reserve 64 bytes.
pushl %esi
subl $64, %esp
# _d_newarrayvT(TypeInfo, 4): allocate storage for the array literal.
# (_D11TypeInfo_Af6__initZ mangles TypeInfo_Af, i.e. float[] -- per D name mangling.)
movl $4, 4(%esp)
movl $_D11TypeInfo_Af6__initZ, (%esp)
call _d_newarrayvT
# %eax = pointer to the new block; store the literal as raw IEEE-754 bit patterns.
# 1065353216 = 0x3F800000 = 1.0f
movl $1065353216, (%eax)
# 1073741824 = 0x40000000 = 2.0f
movl $1073741824, 4(%eax)
# 1077936128 = 0x40400000 = 3.0f
movl $1077936128, 8(%eax)
# 1082130432 = 0x40800000 = 4.0f
movl $1082130432, 12(%eax)
# Copy elements 2, 1, 0 from the allocated block into a on the stack.
movl 8(%eax), %ecx
movl %ecx, 56(%esp)
movl 4(%eax), %ecx
movl %ecx, 52(%esp)
movl (%eax), %eax
movl %eax, 48(%esp)
# a[3] is written as an immediate (4.0f) instead of being reloaded.
movl $1082130432, 60(%esp)
# _d_array_init_float(&b, 4, NaN): 2143289344 = 0x7FC00000, a float quiet NaN.
leal 32(%esp), %esi
movl %esi, (%esp)
movl $2143289344, 8(%esp)
movl $4, 4(%esp)
call _d_array_init_float
# _d_array_init_float(&c, 4, NaN).
leal 16(%esp), %eax
movl %eax, (%esp)
movl $2143289344, 8(%esp)
movl $4, 4(%esp)
call _d_array_init_float
# _d_array_init_float(&b, 4, 10.0f): 1092616192 = 0x41200000 = 10.0f.
# This is `b[] = 10.0f`; it overwrites the NaN fill done just above.
movl %esi, (%esp)
movl $1092616192, 8(%esp)
movl $4, 4(%esp)
call _d_array_init_float
# Inlined `c[] = a[] + b[]`: four scalar single-precision adds (movss/addss),
# one per element, rather than one packed addps over all four floats.
# c[0] = a[0] + b[0]
movss 48(%esp), %xmm0
addss 32(%esp), %xmm0
movss %xmm0, 16(%esp)
# c[1] = a[1] + b[1]
movss 52(%esp), %xmm0
addss 36(%esp), %xmm0
movss %xmm0, 20(%esp)
# c[2] = a[2] + b[2]
movss 56(%esp), %xmm0
addss 40(%esp), %xmm0
movss %xmm0, 24(%esp)
# c[3] = a[3] + b[3]
movss 60(%esp), %xmm0
addss 44(%esp), %xmm0
movss %xmm0, 28(%esp)
# Epilogue: return value 0, release the frame, restore %esi.
xorl %eax, %eax
addl $64, %esp
popl %esi
# Return and pop 8 bytes of arguments pushed by the caller.
ret $8
By the way, using Link-Time Optimization and interning, LDC produces this LL (whole main):
; @_Dmain as emitted with link-time optimisation.
; In this listing there is no alloca for %a and no fadd instruction:
; the element-wise addition into c does not appear at all
; (presumably removed because its result is never read -- confirm).
; What remains is the allocation for the array literal, the four stores
; into it, and the three array-initialisation calls.
define x86_stdcallcc i32 @_Dmain(%"char[][]" %unnamed) {
entry:
; b and c are still stack slots with only 4-byte alignment.
%b = alloca [4 x float], align 4 ; <[4 x float]*> [#uses=1]
%c = alloca [4 x float], align 4 ; <[4 x float]*> [#uses=1]
; Allocation for the array literal [1.0, 2.0, 3.0, 4.0] via _d_newarrayvT.
%.gc_mem = call noalias i8* @_d_newarrayvT(%object.TypeInfo* @_D11TypeInfo_Af6__initZ, i32 4) ; <i8*> [#uses=4]
; Element 0 (byte offset 0) = 1.0
%.gc_mem1 = bitcast i8* %.gc_mem to float* ; <float*> [#uses=1]
store float 1.000000e+00, float* %.gc_mem1
; Element 1 (byte offset 4) = 2.0
%tmp3 = getelementptr i8* %.gc_mem, i32 4 ; <i8*> [#uses=1]
%0 = bitcast i8* %tmp3 to float* ; <float*> [#uses=1]
store float 2.000000e+00, float* %0
; Element 2 (byte offset 8) = 3.0
%tmp4 = getelementptr i8* %.gc_mem, i32 8 ; <i8*> [#uses=1]
%1 = bitcast i8* %tmp4 to float* ; <float*> [#uses=1]
store float 3.000000e+00, float* %1
; Element 3 (byte offset 12) = 4.0
%tmp5 = getelementptr i8* %.gc_mem, i32 12 ; <i8*> [#uses=1]
%2 = bitcast i8* %tmp5 to float* ; <float*> [#uses=1]
store float 4.000000e+00, float* %2
; Fill b with NaN. LLVM prints float constants in hexadecimal using the
; 64-bit double bit pattern; 0x7FF8000000000000 is a quiet NaN.
%tmp8 = getelementptr [4 x float]* %b, i32 0, i32 0 ; <float*> [#uses=2]
call void @_d_array_init_float(float* nocapture %tmp8, i32 4, float 0x7FF8000000000000)
; Fill c with NaN.
%tmp9 = getelementptr [4 x float]* %c, i32 0, i32 0 ; <float*> [#uses=1]
call void @_d_array_init_float(float* nocapture %tmp9, i32 4, float 0x7FF8000000000000)
; Fill b again, this time with 10.0 (the `b[] = 10.0f` statement).
call void @_d_array_init_float(float* nocapture %tmp8, i32 4, float 1.000000e+01)
ret i32 0
}
Bye,
bearophile
More information about the Digitalmars-d
mailing list