std.range.zip performance

Andrei Alexandrescu SeeWebsiteForEmail at erdani.org
Tue Feb 15 18:36:29 PST 2011


Initial: 58 seconds.

Eliminated the switch in popFront: 53s.

Replaced emplace with assignment: 23s.

Specialized emplace for non-struct types, reinserted: 23s.

Eliminated the loop in empty (replaced with return ranges[0].empty;): 17s.

I'm sure there are ways to further improve this, but there are a few 
difficulties. Each pass through the loop the code must transport values 
from the two arrays into a specific format and then distribute them for 
further calculation. Then, upon each popFront two words must be touched 
because arrays hold pointer+length, not pointer+pointer (as probably 
would be better since ranges are around).


Nice analysis!

Andrei

On 2/15/11 6:24 PM, bearophile wrote:
> While doing some benchmarks on the Clay language I've found something interesting.
>
> I have timed a zip() done on two short arrays, and I have found this D version too much slow compared to normal array code. Higher order functions like zip, map, filter, etc are going to stay, they aren't toys, so I think the D compiler has to digest them better.
>
> (There is lot of of asm at the tail of this post, I don't know if some of t will get cut away.)
>
> Bye,
> bearophile
>
>
> Timings, seconds, best of 3:
>    #1:  1.95
>    #2: 61.50
>    #3:  2.25
>    #4:  1.52
>
> ----------------------
>
> Binary sizes, bytes:
>    #1:  33_016
>    #2: 284_188
>    #3: 103_964
>    #4: 103_964
>
> ----------------------
>
> Compilers and command line used:
>
> DMD 2.051
> clay compiler (hg id dec41ba07d58 tip, llvm r122866, Jan  5 2011)
>
> clay -inline test1.clay -o test1.exe
> dmd -O -release -inline test2.d
> dmd -O -release -inline test3.d
>
> To produce asm code with Clay:
> clay -inline -asm test1.clay -o test1.s
>
> Output: -1149672960 for all versions.
>
> ----------------------
>
> Code used:
>
>
> // program #1 (Clay)
> main() {
>      var a = Vector([18,10,17,14,19,23,11,0,6,0,17,25,5,4,19,21,17,13,5,7,11,22,23,17,24,7,11,11,1,28]);
>      var b = Vector([9,12,1,4,1,18,11,6,5,18,24,15,26,14,24,8,17,26,23,17,3,28,27,0,9,27,0,19,13,25]);
>      var tot = 0; // 32 bit signed int
>      for (i in range(0, 50*1000*1000))
>          for (x, y in zipped(a, b))
>              tot += x + y;
>      println(tot);
> }
>
> ----------------------
>
> // program #2 (D2)
> import std.c.stdio: printf;
> import std.range: zip;
> void main() {
>      auto a = [18,10,17,14,19,23,11,0,6,0,17,25,5,4,19,21,17,13,5,7,11,22,23,17,24,7,11,11,1,28];
>      auto b = [9,12,1,4,1,18,11,6,5,18,24,15,26,14,24,8,17,26,23,17,3,28,27,0,9,27,0,19,13,25];
>      int tot = 0;
>      foreach (i; 0 .. 50_000_000)
>          foreach (xy; zip(a, b))
>              tot += xy[0] + xy[1];
>      printf("%d\n", tot);
> }
>
> ----------------------
>
> // program #3 (D2)
> import std.c.stdio: printf;
> void main() {
>      auto a = [18,10,17,14,19,23,11,0,6,0,17,25,5,4,19,21,17,13,5,7,11,22,23,17,24,7,11,11,1,28];
>      auto b = [9,12,1,4,1,18,11,6,5,18,24,15,26,14,24,8,17,26,23,17,3,28,27,0,9,27,0,19,13,25];
>      int tot = 0;
>      foreach (i; 0 .. 50_000_000)
>          foreach (j; 0 .. a.length)
>              tot += a[j] + b[j];
>      printf("%d\n", tot);
> }
>
> ----------------------
>
> // program #4 (D2)
> import std.c.stdio: printf;
> void main() {
>      auto a = [18,10,17,14,19,23,11,0,6,0,17,25,5,4,19,21,17,13,5,7,11,22,23,17,24,7,11,11,1,28];
>      auto b = [9,12,1,4,1,18,11,6,5,18,24,15,26,14,24,8,17,26,23,17,3,28,27,0,9,27,0,19,13,25];
>      int tot = 0;
>      foreach (i; 0 .. 50_000_000)
>          foreach (j; 0 .. 30)
>              tot += a[j] + b[j];
>      printf("%d\n", tot);
> }
>
> ----------------------
>
> Just loops program #1:
>
>      movl    $50000000, %eax         # imm = 0x2FAF080
>      .align    16, 0x90
> LBB2_4:                                 # %bb.nph.i.i
>                                          # =>This Loop Header: Depth=1
>                                          #     Child Loop BB2_5 Depth 2
>      xorl    %edx, %edx
>      .align    16, 0x90
> LBB2_5:                                 # %return410.i.i
>                                          #   Parent Loop BB2_4 Depth=1
>                                          # =>   This Inner Loop Header: Depth=2
>      addl    (%esi,%edx,4), %ecx
>      addl    (%edi,%edx,4), %ecx
>      incl    %edx
>      cmpl    $30, %edx
>      jne    LBB2_5
> # BB#3:                                 # %return272.loopexit.i.i
>                                          #   in Loop: Header=BB2_4 Depth=1
>      decl    %eax
>      jne    LBB2_4
>
> ----------------------
>
> Just loops program #2:
>
> LCE:        push    dword ptr 018h[ESP]
>          mov    ESI,offset FLAT:_D3std5range14__T3ZipTAiTAiZ3Zip6__initZ
>          push    dword ptr 018h[ESP]
>          push    dword ptr 028h[ESP]
>          push    dword ptr 028h[ESP]
>          push    0
>          lea    EDI,078h[ESP]
>          movsd
>          movsd
>          movsd
>          movsd
>          movsd
>          lea    EAX,078h[ESP]
>          call    near ptr _D3std5range14__T3ZipTAiTAiZ3Zip6__ctorMFNcAiAiE3std5range14StoppingPolicyZS3std5range14__T3ZipTAiTAiZ3Zip
>          mov    ESI,EAX
>          lea    EDI,044h[ESP]
>          movsd
>          movsd
>          movsd
>          movsd
>          movsd
>          lea    ESI,044h[ESP]
>          lea    EDI,024h[ESP]
>          movsd
>          movsd
>          movsd
>          movsd
>          movsd
>          lea    EAX,024h[ESP]
>          call    near ptr _D3std5range14__T3ZipTAiTAiZ3Zip5emptyMFZb
>          xor    AL,1
>          je    L153
> L11C:        lea    EAX,024h[ESP]
>          call    near ptr _D3std5range14__T3ZipTAiTAiZ3Zip5frontMFNdZS3std8typecons14__T5TupleTiTiZ5Tuple
>          mov    07Ch[ESP],EAX
>          mov    ECX,07Ch[ESP]
>          lea    EAX,024h[ESP]
>          mov    080h[ESP],EDX
>          add    ECX,080h[ESP]
>          add    EBX,ECX
>          call    near ptr _D3std5range14__T3ZipTAiTAiZ3Zip8popFrontMFZv
>          lea    EAX,024h[ESP]
>          call    near ptr _D3std5range14__T3ZipTAiTAiZ3Zip5emptyMFZb
>          xor    AL,1
>          jne    L11C
> L153:        inc    EBP
>          cmp    EBP,02FAF080h
>          jb    LCE
>
>
> _D3std5range14__T3ZipTAiTAiZ3Zip6__ctorMFNcAiAiE3std5range14StoppingPolicyZS3std5range14__T3ZipTAiTAiZ3Zip    comdat
>          push    EBX
>          mov    ECX,8[ESP]
>          mov    EBX,014h[ESP]
>          push    ESI
>          mov    ESI,EAX
>          mov    EDX,01Ch[ESP]
>          mov    4[ESI],EDX
>          mov    EDX,014h[ESP]
>          mov    [ESI],EBX
>          mov    EBX,010h[ESP]
>          mov    010h[ESI],ECX
>          mov    8[ESI],EBX
>          mov    0Ch[ESI],EDX
>          pop    ESI
>          pop    EBX
>          ret    014h
>
>
> _D3std5range14__T3ZipTAiTAiZ3Zip5emptyMFZb    comdat
> L0:        sub    ESP,05Ch
>          mov    ECX,010h[EAX]
>          test    ECX,ECX
>          push    EBX
>          push    ESI
>          mov    ESI,EAX
>          je    L21
>          cmp    ECX,1
>          je    LC3
>          cmp    ECX,2
>          je    L55
>          jmp    near ptr LC3
> L21:        mov    EBX,[ESI]
>          mov    EDX,4[ESI]
>          mov    0Ch[ESP],EBX
>          mov    010h[ESP],EDX
>          cmp    dword ptr 0Ch[ESP],0
>          je    L4A
>          mov    EBX,8[ESI]
>          mov    EDX,0Ch[ESI]
>          mov    014h[ESP],EBX
>          mov    018h[ESP],EDX
>          cmp    dword ptr 014h[ESP],0
>          jne    LC3
> L4A:        pop    ESI
>          mov    EAX,1
>          pop    EBX
>          add    ESP,05Ch
>          ret
> L55:        mov    EBX,[ESI]
>          mov    EDX,4[ESI]
>          mov    ECX,offset FLAT:_D3std5range14__T3ZipTAiTAiZ3Zip5emptyMFZb14__dgliteral828MFZAxa
>          mov    01Ch[ESP],EBX
>          mov    020h[ESP],EDX
>          mov    02Ch[ESP],EAX
>          mov    030h[ESP],ECX
>          cmp    dword ptr 01Ch[ESP],0
>          setz    DL
>          mov    EBX,8[ESI]
>          mov    024h[ESP],EBX
>          mov    ECX,0Ch[ESI]
>          mov    028h[ESP],ECX
>          cmp    dword ptr 024h[ESP],0
>          setz    CL
>          cmp    DL,CL
>          mov    EDX,1
>          je    L98
>          xor    EDX,EDX
> L98:        xor    DL,1
>          je    LC3
>          push    dword ptr FLAT:_DATA[054h]
>          push    dword ptr FLAT:_DATA[050h]
>          push    0ADEh
>          mov    EAX,038h[ESP]
>          mov    EDX,03Ch[ESP]
>          mov    EBX,038h[ESP]
>          call    EDX
>          push    EDX
>          push    EAX
>          call    near ptr _D3std9exception7bailOutFAyaixAaZv
> LC3:        pop    ESI
>          xor    EAX,EAX
>          pop    EBX
>          add    ESP,05Ch
>          ret
>
>
> _D3std5range14__T3ZipTAiTAiZ3Zip5frontMFNdZS3std8typecons14__T5TupleTiTiZ5Tuple    comdat
>      assume    CS:_D3std5range14__T3ZipTAiTAiZ3Zip5frontMFNdZS3std8typecons14__T5TupleTiTiZ5Tuple
> L0:        sub    ESP,028h
>          mov    EDX,4[EAX]
>          push    EBX
>          push    ESI
>          mov    ESI,EAX
>          mov    EBX,[ESI]
>          push    EDI
>          mov    014h[ESP],EBX
>          mov    018h[ESP],EDX
>          cmp    dword ptr 014h[ESP],0
>          je    L31
>          lea    EDX,0Ch[ESP]
>          mov    EBX,[ESI]
>          push    EDX
>          mov    EDX,4[ESI]
>          mov    EAX,[EDX]
>          push    4
>          call    near ptr _D3std4conv16__T7emplaceTiTiZ7emplaceFAviZPi
>          jmp short    L41
> L31:        lea    ECX,0Ch[ESP]
>          mov    EDI,4
>          push    ECX
>          push    EDI
>          call    near ptr _D3std4conv14__T7emplaceTiZ7emplaceFAvZPi
> L41:        mov    EBX,8[ESI]
>          mov    ECX,0Ch[ESI]
>          test    EBX,EBX
>          je    L61
>          lea    EDI,010h[ESP]
>          mov    EDX,0Ch[ESI]
>          mov    EAX,8[ESI]
>          push    EDI
>          mov    EAX,[EDX]
>          push    4
>          call    near ptr _D3std4conv16__T7emplaceTiTiZ7emplaceFAviZPi
>          jmp short    L71
> L61:        lea    ECX,010h[ESP]
>          mov    ESI,4
>          push    ECX
>          push    ESI
>          call    near ptr _D3std4conv14__T7emplaceTiZ7emplaceFAvZPi
> L71:        mov    EDX,010h[ESP]
>          mov    EAX,0Ch[ESP]
>          pop    EDI
>          pop    ESI
>          pop    EBX
>          add    ESP,028h
>          ret
>
>
> _D3std5range14__T3ZipTAiTAiZ3Zip8popFrontMFZv    comdat
> L0:        sub    ESP,04Ch
>          mov    ECX,010h[EAX]
>          test    ECX,ECX
>          push    EBX
>          push    EBP
>          push    ESI
>          push    EDI
>          mov    EDI,EAX
>          je    L1F
>          cmp    ECX,1
>          je    L4A
>          cmp    ECX,2
>          je    L8C
>          jmp    near ptr L142
> L1F:        mov    ESI,[EDI]
>          mov    EDX,4[EDI]
>          dec    ESI
>          mov    EBX,[EDI]
>          add    EDX,4
>          lea    EBP,8[EDI]
>          mov    [EDI],ESI
>          mov    4[EDI],EDX
>          mov    ESI,0[EBP]
>          mov    EDX,4[EBP]
>          dec    ESI
>          mov    EBX,0[EBP]
>          add    EDX,4
>          mov    0[EBP],ESI
>          mov    4[EBP],EDX
>          jmp    near ptr L142
> L4A:        mov    ESI,[EDI]
>          mov    ECX,4[EDI]
>          test    ESI,ESI
>          je    L63
>          mov    ESI,[EDI]
>          mov    EDX,4[EDI]
>          dec    ESI
>          mov    EBX,[EDI]
>          add    EDX,4
>          mov    [EDI],ESI
>          mov    4[EDI],EDX
> L63:        mov    ESI,8[EDI]
>          mov    ECX,0Ch[EDI]
>          test    ESI,ESI
>          je    L142
>          lea    EBP,8[EDI]
>          mov    ESI,0[EBP]
>          mov    EDX,4[EBP]
>          dec    ESI
>          mov    EBX,0[EBP]
>          add    EDX,4
>          mov    0[EBP],ESI
>          mov    4[EBP],EDX
>          jmp    near ptr L142
> L8C:        mov    EBX,[EDI]
>          mov    EDX,4[EDI]
>          mov    ECX,offset FLAT:_D3std5range14__T3ZipTAiTAiZ3Zip8popFrontMFZv14__dgliteral831MFZAxa
>          mov    014h[ESP],EBX
>          mov    018h[ESP],EDX
>          mov    01Ch[ESP],EAX
>          mov    020h[ESP],ECX
>          cmp    dword ptr 014h[ESP],0
>          setnz    DL
>          xor    DL,1
>          je    LD7
>          mov    EDX,ECX
>          push    dword ptr FLAT:_DATA[054h]
>          push    dword ptr FLAT:_DATA[050h]
>          push    0B86h
>          mov    EAX,028h[ESP]
>          mov    EBX,028h[ESP]
>          call    EDX
>          push    EDX
>          push    EAX
>          call    near ptr _D3std9exception7bailOutFAyaixAaZv
> LD7:        mov    EAX,[EDI]
>          mov    EDX,4[EDI]
>          dec    EAX
>          mov    EBX,[EDI]
>          add    EDX,4
>          mov    4[EDI],EDX
>          mov    EDX,0Ch[EDI]
>          mov    EBX,EDI
>          mov    [EDI],EAX
>          mov    EAX,8[EDI]
>          mov    ECX,offset FLAT:_D3std5range14__T3ZipTAiTAiZ3Zip8popFrontMFZv14__dgliteral832MFZAxa
>          mov    024h[ESP],EAX
>          mov    028h[ESP],EDX
>          mov    02Ch[ESP],EBX
>          mov    030h[ESP],ECX
>          cmp    dword ptr 024h[ESP],0
>          setnz    DL
>          xor    DL,1
>          je    L12F
>          push    dword ptr FLAT:_DATA[054h]
>          push    dword ptr FLAT:_DATA[050h]
>          push    0B86h
>          mov    EAX,038h[ESP]
>          call    ECX
>          push    EDX
>          push    EAX
>          call    near ptr _D3std9exception7bailOutFAyaixAaZv
> L12F:        lea    ESI,8[EDI]
>          mov    EBX,[ESI]
>          mov    EDX,4[ESI]
>          dec    EBX
>          mov    EAX,[ESI]
>          mov    [ESI],EBX
>          add    EDX,4
>          mov    4[ESI],EDX
> L142:        pop    EDI
>          pop    ESI
>          pop    EBP
>          pop    EBX
>          add    ESP,04Ch
>          ret
>
>
> _D3std5range14__T3ZipTAiTAiZ3Zip5emptyMFZb    comdat
> L0:        sub    ESP,05Ch
>          mov    ECX,010h[EAX]
>          test    ECX,ECX
>          push    EBX
>          push    ESI
>          mov    ESI,EAX
>          je    L21
>          cmp    ECX,1
>          je    LC3
>          cmp    ECX,2
>          je    L55
>          jmp    near ptr LC3
> L21:        mov    EBX,[ESI]
>          mov    EDX,4[ESI]
>          mov    0Ch[ESP],EBX
>          mov    010h[ESP],EDX
>          cmp    dword ptr 0Ch[ESP],0
>          je    L4A
>          mov    EBX,8[ESI]
>          mov    EDX,0Ch[ESI]
>          mov    014h[ESP],EBX
>          mov    018h[ESP],EDX
>          cmp    dword ptr 014h[ESP],0
>          jne    LC3
> L4A:        pop    ESI
>          mov    EAX,1
>          pop    EBX
>          add    ESP,05Ch
>          ret
> L55:        mov    EBX,[ESI]
>          mov    EDX,4[ESI]
>          mov    ECX,offset FLAT:_D3std5range14__T3ZipTAiTAiZ3Zip5emptyMFZb14__dgliteral828MFZAxa
>          mov    01Ch[ESP],EBX
>          mov    020h[ESP],EDX
>          mov    02Ch[ESP],EAX
>          mov    030h[ESP],ECX
>          cmp    dword ptr 01Ch[ESP],0
>          setz    DL
>          mov    EBX,8[ESI]
>          mov    024h[ESP],EBX
>          mov    ECX,0Ch[ESI]
>          mov    028h[ESP],ECX
>          cmp    dword ptr 024h[ESP],0
>          setz    CL
>          cmp    DL,CL
>          mov    EDX,1
>          je    L98
>          xor    EDX,EDX
> L98:        xor    DL,1
>          je    LC3
>          push    dword ptr FLAT:_DATA[054h]
>          push    dword ptr FLAT:_DATA[050h]
>          push    0ADEh
>          mov    EAX,038h[ESP]
>          mov    EDX,03Ch[ESP]
>          mov    EBX,038h[ESP]
>          call    EDX
>          push    EDX
>          push    EAX
>          call    near ptr _D3std9exception7bailOutFAyaixAaZv
> LC3:        pop    ESI
>          xor    EAX,EAX
>          pop    EBX
>          add    ESP,05Ch
>          ret
>
>
> _D3std4conv16__T7emplaceTiTiZ7emplaceFAviZPi    comdat
> L0:        push    EAX
>          mov    EDX,offset FLAT:_D3std4conv16__T7emplaceTiTiZ7emplaceFAviZPi14__dgliteral829MFZC6object9Throwable
>          push    EBX
>          push    ESI
>          cmp    dword ptr 010h[ESP],4
>          sbb    ECX,ECX
>          inc    ECX
>          push    ECX
>          lea    EBX,0Ch[ESP]
>          push    EDX
>          push    EBX
>          call    near ptr _D3std9exception14__T7enforceTbZ7enforceFbLC6object9ThrowableZb
>          mov    ECX,014h[ESP]
>          mov    EAX,014h[ESP]
>          mov    ESI,8[ESP]
>          mov    [ECX],ESI
>          pop    ESI
>          pop    EBX
>          pop    ECX
>          ret    8
>
>
> (I have not included all the code, like _D3std9exception7bailOutFAyaixAaZv).
>
> ----------------------
>
> Just loops program #3:
>
> LDE:    xor    EBX,EBX
>          cmp    010h[ESP],BL
>          je    LF5
> LE6:    mov    ECX,[EBX*4][EAX]
>          add    ECX,[EBX*4][EDI]
>          add    ESI,ECX
>          inc    EBX
>          cmp    EBX,014h[ESP]
>          jb    LE6
> LF5:    inc    EDX
>          cmp    EDX,02FAF080h
>          jb    LDE
>
> ----------------------
>
> Just loops program #4:
>
> LBF:    xor    EBX,EBX
> LC1:    mov    ECX,[EBX*4][EAX]
>          add    ECX,[EBX*4][EDI]
>          add    ESI,ECX
>          inc    EBX
>          cmp    EBX,01Eh
>          jb    LC1
>          inc    EDX
>          cmp    EDX,02FAF080h
>          jb    LBF
>
> ----------------------



More information about the Digitalmars-d mailing list