Disappointing math performance compared to GDC

Wed Oct 8 09:23:18 PDT 2014

On Wednesday, 8 October 2014 at 16:02:17 UTC, Trass3r wrote:
> Just check it with '-output-ll' or '-output-s 
> -x86-asm-syntax=intel' ;)

I'm not an ASM expert, but as far as I can see it does indeed use some 
SIMD registers and instructions. For example:

.LBB0_16:
	mov	rcx, qword ptr [rax]
	mov	rdi, rax
	call	qword ptr [rcx + 56]
	test	rax, rax
	jne	.LBB0_18
	movss	xmm1, dword ptr [rsp + 116]
	jmp	.LBB0_20
	.align	16, 0x90
.LBB0_18:
	mov	rcx, rbx
	imul	rcx, rax
	add	r12, rcx
	movss	xmm1, dword ptr [rsp + 116]
	.align	16, 0x90
.LBB0_19:
	movss	xmm0, dword ptr [rdx]
	mulss	xmm0, dword ptr [r12]
	addss	xmm1, xmm0
	add	rdx, 4
	add	r12, 4
	dec	rax
	jne	.LBB0_19
.LBB0_20:
	movss	dword ptr [rsp + 116], xmm1
	inc	r14
	cmp	r14, r15
	jne	.LBB0_12
.LBB0_21:
	mov	rax, qword ptr [rsp + 80]
	mov	rdi, qword ptr [rax]
	mov	rax, qword ptr [rdi]
	call	qword ptr [rax + 40]
	test	eax, eax
	mov	rbp, qword ptr [rsp + 104]
	jne	.LBB0_24
	movss	xmm0, dword ptr [rsp + 92]
	movss	xmm1, dword ptr [rsp + 116]
	call	_D8nhelpers7sigmoidFNbffZf
	mov	rax, qword ptr [rsp + 64]
	movss	dword ptr [rax + 4*rbp], xmm0
	xor	edx, edx
	xor	ecx, ecx
	mov	r8d, _D11TypeInfo_Af6__initZ
	mov	rdi, qword ptr [rsp + 48]
	mov	rsi, qword ptr [rsp + 96]
	call	_adEq2
	test	eax, eax
	jne	.LBB0_27
	movss	xmm0, dword ptr [rsp + 92]
	movss	xmm1, dword ptr [rsp + 116]
	call	_D8nhelpers12sigmoidDerivFNbffZf
	mov	rax, qword ptr [rsp + 96]
	jmp	.LBB0_26
	.align	16, 0x90
.LBB0_24:
	movss	xmm0, dword ptr [rsp + 92]
	movss	xmm1, dword ptr [rsp + 116]
	call	_D8nhelpers6linearFNbffZf
	mov	rax, qword ptr [rsp + 64]
	movss	dword ptr [rax + 4*rbp], xmm0
	xor	edx, edx
	xor	ecx, ecx
	mov	r8d, _D11TypeInfo_Af6__initZ
	mov	rdi, qword ptr [rsp + 48]
	mov	rsi, qword ptr [rsp + 96]
	call	_adEq2
	test	eax, eax
	jne	.LBB0_27
	mov	rax, qword ptr [rsp + 96]
	movss	xmm0, dword ptr [rsp + 92]
.LBB0_26:
	movss	dword ptr [rax + 4*rbp], xmm0
.LBB0_27:
	inc	rbp
	add	rbx, 4
	cmp	rbp, qword ptr [rsp + 72]
	jne	.LBB0_9
.LBB0_28:
	mov	rax, qword ptr [rsp + 24]
	inc	rax
	cmp	rax, qword ptr [rsp + 8]
	mov	rbp, qword ptr [rsp + 16]
	jne	.LBB0_1
.LBB0_29:
	add	rsp, 120
	pop	rbx
	pop	r12
	pop	r13
	pop	r14
	pop	r15
	pop	rbp
	ret