Struct copies

Sun Jan 26 05:02:48 PST 2014

The following code is compiled with the ldc2 compiler based on 
LLVM 3.3.1.

This swaps two values in-place:

// Swaps the values of x and y in place, going through a temporary copy.
// Template over the element type T; both arguments are taken by reference.
void swap(T)(ref T x, ref T y) pure nothrow {
     immutable aux = x;  // keep the old x before it is overwritten
     x = y;
     y = aux;
}

If I swap uint values I get the asm and IR:

__D5test611__T4swapTkZ4swapFNaNbNfKkKkZv:
	pushl	%esi
	movl	8(%esp), %ecx
	movl	(%ecx), %edx
	movl	(%eax), %esi
	movl	%esi, (%ecx)
	movl	%edx, (%eax)
	popl	%esi
	ret	$4

; Function Attrs: nounwind
define x86_stdcallcc void @"\01__D5test65swap1FNaNbKkKkZv"(i32* 
inreg nocapture %y_arg, i32* nocapture %x_arg) #0 {
entry:
   %tmp = load i32* %x_arg, align 4
   %tmp2 = load i32* %y_arg, align 4
   store i32 %tmp2, i32* %x_arg, align 4
   store i32 %tmp, i32* %y_arg, align 4
   ret void
}

Often I have a simple struct like this, with a sizeof equal to 
one or two size_t (a size_t is a 32 bit unsigned integer on this 
system):

// Small aggregate: one ushort followed by two chars.
struct Foo {
     ushort a;
     char b, c;
}

If I instantiate the swap function template on values of type Foo 
I get the asm and IR:

__D5test621__T4swapTS5test63FooZ4swapFNaNbNfKS5test63FooKS5test63FooZv:
	pushl	%edi
	pushl	%esi
	movl	12(%esp), %ecx
	movw	(%ecx), %dx
	movw	2(%ecx), %si
	movl	(%eax), %edi
	movl	%edi, (%ecx)
	movw	%dx, (%eax)
	movw	%si, 2(%eax)
	popl	%esi
	popl	%edi
	ret	$4

; Function Attrs: nounwind
define x86_stdcallcc void 
@"\01__D5test65swap2FNaNbKS5test63FooKS5test63FooZv"(%test6.Foo* 
inreg nocapture %y_arg, %test6.Foo* nocapture %x_arg) #0 {
entry:
   %0 = getelementptr inbounds %test6.Foo* %x_arg, i32 0, i32 0
   %1 = load i16* %0, align 1
   %2 = getelementptr inbounds %test6.Foo* %x_arg, i32 0, i32 1
   %3 = load i8* %2, align 1
   %4 = getelementptr inbounds %test6.Foo* %x_arg, i32 0, i32 2
   %5 = load i8* %4, align 1
   %6 = bitcast %test6.Foo* %y_arg to i32*
   %7 = bitcast %test6.Foo* %x_arg to i32*
   %8 = load i32* %6, align 1
   store i32 %8, i32* %7, align 1
   %9 = getelementptr inbounds %test6.Foo* %y_arg, i32 0, i32 0
   store i16 %1, i16* %9, align 1
   %10 = getelementptr inbounds %test6.Foo* %y_arg, i32 0, i32 1
   store i8 %3, i8* %10, align 1
   %11 = getelementptr inbounds %test6.Foo* %y_arg, i32 0, i32 2
   store i8 %5, i8* %11, align 1
   ret void
}

I can create a new union Bar that overlays a 32 bit integer on 
all three Foo fields:

// Overlays a single uint on the same three fields that Foo has.
union Bar {
     uint all;   // whole-value view
     struct {    // anonymous struct: field-by-field view
         ushort a;
         char b, c;
     }
}

Now I can define a new swap function that works on values of type 
Bar:

// Swaps two Bar values by assigning only the uint member `all`.
void swap2(ref Bar x, ref Bar y) pure nothrow {
     immutable Bar aux = x;  // copy of the old x
     x.all = y.all;
     y.all = aux.all;
}

Its asm and IR are shorter:

__D5test65swap2FNaNbKS5test63BarKS5test63BarZv:
     pushl   %esi
     movl    8(%esp), %ecx
     movl    (%ecx), %edx
     movl    (%eax), %esi
     movl    %esi, (%ecx)
     movl    %edx, (%eax)
     popl    %esi
     ret $4

; Function Attrs: nounwind
define x86_stdcallcc void 
@"\01__D5test65swap3FNaNbKS5test63BarKS5test63BarZv"(%test6.Bar* 
inreg nocapture %y_arg, %test6.Bar* nocapture %x_arg) #0 {
entry:
   %0 = getelementptr inbounds %test6.Bar* %x_arg, i32 0, i32 0
   %1 = load i32* %0, align 1
   %tmp4 = getelementptr %test6.Bar* %y_arg, i32 0, i32 0
   %tmp5 = load i32* %tmp4, align 4
   store i32 %tmp5, i32* %0, align 4
   store i32 %1, i32* %tmp4, align 4
   ret void
}

In the case of swapping Foos, why isn't LLVM optimizing the swap 
function to shorter asm like that of swap2? I have asked this on the 
LLVM IRC channel, and aKor has told me that for similar C code Clang 
swaps two Foos using a memcpy, and so uses a single 32 bit copy. So 
perhaps ldc2 can do the same for this common case.

Bye,
bearophile