Optimisation question
Iain Buclaw via D.gnu
d.gnu at puremagic.com
Fri Apr 10 14:17:41 PDT 2015
On 10 April 2015 at 20:18, John Colvin via D.gnu <d.gnu at puremagic.com> wrote:
> void mul(float[] a, float v)
> {
> if ((cast(size_t)a.ptr) % 32 == 0
> && a.length == 16)
> {
> foreach (ref el; a)
> el *= v;
> }
> }
>
> with
> -Ofast -march=broadwell -frelease
> becomes
>
> void example.mul(float[], float):
> movq %rsi, %rax
> andl $31, %eax
> jne .L44
> cmpq $16, %rdi
> jne .L44
> shrq $2, %rax
> negq %rax
> andl $7, %eax
> je .L10
> vmulss (%rsi), %xmm0, %xmm1
> vmovss %xmm1, (%rsi)
> cmpq $1, %rax
> je .L11
> vmulss 4(%rsi), %xmm0, %xmm1
> vmovss %xmm1, 4(%rsi)
> cmpq $2, %rax
> je .L12
> vmulss 8(%rsi), %xmm0, %xmm1
> vmovss %xmm1, 8(%rsi)
> cmpq $3, %rax
> je .L13
> vmulss 12(%rsi), %xmm0, %xmm1
> vmovss %xmm1, 12(%rsi)
> cmpq $4, %rax
> je .L14
> vmulss 16(%rsi), %xmm0, %xmm1
> vmovss %xmm1, 16(%rsi)
> cmpq $5, %rax
> je .L15
> vmulss 20(%rsi), %xmm0, %xmm1
> vmovss %xmm1, 20(%rsi)
> cmpq $6, %rax
> je .L16
> vmulss 24(%rsi), %xmm0, %xmm1
> movl $9, %edx
> movl $7, %r9d
> vmovss %xmm1, 24(%rsi)
> .L5:
> movl $16, %edi
> movl $8, %r8d
> movl $1, %r10d
> subq %rax, %rdi
> .L4:
> leaq (%rsi,%rax,4), %rcx
> vbroadcastss %xmm0, %ymm1
> vmulps (%rcx), %ymm1, %ymm2
> vmovaps %ymm2, (%rcx)
> cmpq $1, %r10
> je .L6
> vmulps 32(%rcx), %ymm1, %ymm1
> vmovaps %ymm1, 32(%rcx)
> .L6:
> leaq (%r9,%r8), %rax
> subq %r8, %rdx
> cmpq %r8, %rdi
> je .L43
> leaq (%rsi,%rax,4), %rcx
> vmulss (%rcx), %xmm0, %xmm1
> vmovss %xmm1, (%rcx)
> leaq 1(%rax), %rcx
> cmpq $1, %rdx
> je .L43
> leaq (%rsi,%rcx,4), %rcx
> vmulss (%rcx), %xmm0, %xmm1
> vmovss %xmm1, (%rcx)
> leaq 2(%rax), %rcx
> cmpq $2, %rdx
> je .L43
> leaq (%rsi,%rcx,4), %rcx
> vmulss (%rcx), %xmm0, %xmm1
> vmovss %xmm1, (%rcx)
> leaq 3(%rax), %rcx
> cmpq $3, %rdx
> je .L43
> leaq (%rsi,%rcx,4), %rcx
> vmulss (%rcx), %xmm0, %xmm1
> vmovss %xmm1, (%rcx)
> leaq 4(%rax), %rcx
> cmpq $4, %rdx
> je .L43
> leaq (%rsi,%rcx,4), %rcx
> vmulss (%rcx), %xmm0, %xmm1
> vmovss %xmm1, (%rcx)
> leaq 5(%rax), %rcx
> cmpq $5, %rdx
> je .L43
> leaq (%rsi,%rcx,4), %rcx
> addq $6, %rax
> vmulss (%rcx), %xmm0, %xmm1
> vmovss %xmm1, (%rcx)
> cmpq $6, %rdx
> je .L43
> leaq (%rsi,%rax,4), %rax
> vmulss (%rax), %xmm0, %xmm0
> vmovss %xmm0, (%rax)
> vzeroupper
> ret
> .L43:
> vzeroupper
> .L44:
> ret
> .L10:
> movl $16, %r8d
> movl $2, %r10d
> movl $16, %edi
> movl $16, %edx
> xorl %r9d, %r9d
> jmp .L4
> .L11:
> movl $15, %edx
> movl $1, %r9d
> jmp .L5
> .L16:
> movl $10, %edx
> movl $6, %r9d
> jmp .L5
> .L15:
> movl $11, %edx
> movl $5, %r9d
> jmp .L5
> .L14:
> movl $12, %edx
> movl $4, %r9d
> jmp .L5
> .L13:
> movl $13, %edx
> movl $3, %r9d
> jmp .L5
> .L12:
> movl $14, %edx
> movl $2, %r9d
> jmp .L5
>
> Which seems like an awful lot of code, wouldn't you say?
>
> I was expecting something along the lines of this (untested):
>
> void example.mul(float[], float):
> testb $31, %sil
> jne .L44
> cmpq $16, %rdi
> jne .L44
> vbroadcastss %xmm0, %ymm2
> vmulps (%rsi), %ymm2, %ymm0
> vmulps 32(%rsi), %ymm2, %ymm1
> vmovaps %ymm0, (%rsi)
> vmovaps %ymm1, 32(%rsi)
> .L44:
> ret
>
> Am I being stupid, or is the optimiser making a complete hash of things?
I fear that I cannot reproduce this on gcc-5; maybe it is a problem specific
to your gcc version?
_D6nested3mulFAffZv:
testb $31, %sil
jne .L8
cmpq $16, %rdi
jne .L8
vbroadcastss %xmm0, %ymm0
vmulps (%rsi), %ymm0, %ymm1
vmulps 32(%rsi), %ymm0, %ymm0
vmovaps %ymm1, (%rsi)
vmovaps %ymm0, 32(%rsi)
vzeroupper
.L8:
ret
Iain.
More information about the D.gnu
mailing list