LDC 0.16.0 alpha3 is out! Get it, test it, give feedback!

Mon Sep 21 02:16:37 PDT 2015

Here is the comparison. I omitted the parts of the function
that are the same in both versions: the prolog, the epilog, and
the loading of the current position into %rax.

 1st: hand-optimized ASM

  m_sse = __asm!(const(ubyte16)*)("
    1:
    vpcmpistri $3, ($1), $4
    add        $2, $1
    cmp        $2, %ecx
    je         1b
    sub        $2, $1
    add        %rcx, $1
    ", "=r,0,I,K,x,~{ecx}", m_sse, 16, mode, SIMDFromString!cs);

    c5 f8 28 05 01 74 04 00	vmovaps xmm0,XMMWORD PTR [rip+0x47401]
    c4 e3 79 63 00 08	     L: vpcmpistri xmm0,XMMWORD PTR [rax],0x8
    48 83 c0 10			add    rax,0x10
    83 f9 10			cmp    ecx,0x10
    74 f1			je     L
    48 83 e8 10			sub    rax,0x10
    48 01 c8			add    rax,rcx
    48 89 07			mov    QWORD PTR [rdi],rax
    (33 bytes)

  Time: destructuring a ~200 MiB JSON file takes 348 ms with this version.

 2nd: "naive" approach using intrinsics

  int pos;
  do
  {
    ubyte16 sse = __builtin_ia32_lddqu(m_json);
    pos = __builtin_ia32_pcmpistri128(SIMDFromString!cs, sse, mode);
    m_json += pos;
  }
  while (pos == 16);

    48 89 7d f8			mov    QWORD PTR [rbp-0x8],rdi
    48 89 45 f0			mov    QWORD PTR [rbp-0x10],rax
    48 8b 45 f0		     L: mov    rax,QWORD PTR [rbp-0x10]
    c5 fb f0 00			vlddqu xmm0,[rax]
    c5 f8 28 0d 81 74 04 00	vmovaps xmm1,XMMWORD PTR [rip+0x47481]
    c4 e3 79 63 c8 08		vpcmpistri xmm1,xmm0,0x8
    48 63 d1			movsxd rdx,ecx
    48 01 d0			add    rax,rdx
    48 8b 55 f8			mov    rdx,QWORD PTR [rbp-0x8]
    48 89 02			mov    QWORD PTR [rdx],rax
    81 f9 10 00 00 00		cmp    ecx,0x10
    48 89 45 f0			mov    QWORD PTR [rbp-0x10],rax
    74 d1			je     L
    (55 bytes)

  Time: 502 ms with this version (same benchmark as above).

Both versions were compiled with LDC 0.16.0 alpha3 (LLVM 3.5.2), using the flags:
 -O4 -mcpu=native -release -boundscheck=off -singleobj
 -disable-inlining

-- 
Marco