std.range.zip performance
Andrei Alexandrescu
SeeWebsiteForEmail at erdani.org
Tue Feb 15 18:36:29 PST 2011
Initial: 58 seconds.
Eliminated the switch in popFront: 53s.
Replaced emplace with assignment: 23s.
Specialized emplace for non-struct types, reinserted: 23s.
Eliminated the loop in empty (replaced with return ranges[0].empty;): 17s.
I'm sure there are ways to further improve this, but there are a few
difficulties. Each pass through the loop the code must transport values
from the two arrays into a specific format and then distribute them for
further calculation. Then, upon each popFront two words must be touched
because arrays hold pointer+length, not pointer+pointer (as probably
would be better since ranges are around).
Nice analysis!
Andrei
On 2/15/11 6:24 PM, bearophile wrote:
> While doing some benchmarks on the Clay language I've found something interesting.
>
> I have timed a zip() done on two short arrays, and I have found this D version too much slow compared to normal array code. Higher order functions like zip, map, filter, etc are going to stay, they aren't toys, so I think the D compiler has to digest them better.
>
> (There is lot of of asm at the tail of this post, I don't know if some of t will get cut away.)
>
> Bye,
> bearophile
>
>
> Timings, seconds, best of 3:
> #1: 1.95
> #2: 61.50
> #3: 2.25
> #4: 1.52
>
> ----------------------
>
> Binary sizes, bytes:
> #1: 33_016
> #2: 284_188
> #3: 103_964
> #4: 103_964
>
> ----------------------
>
> Compilers and command line used:
>
> DMD 2.051
> clay compiler (hg id dec41ba07d58 tip, llvm r122866, Jan 5 2011)
>
> clay -inline test1.clay -o test1.exe
> dmd -O -release -inline test2.d
> dmd -O -release -inline test3.d
>
> To produce asm code with Clay:
> clay -inline -asm test1.clay -o test1.s
>
> Output: -1149672960 for all versions.
>
> ----------------------
>
> Code used:
>
>
> // program #1 (Clay)
> main() {
> var a = Vector([18,10,17,14,19,23,11,0,6,0,17,25,5,4,19,21,17,13,5,7,11,22,23,17,24,7,11,11,1,28]);
> var b = Vector([9,12,1,4,1,18,11,6,5,18,24,15,26,14,24,8,17,26,23,17,3,28,27,0,9,27,0,19,13,25]);
> var tot = 0; // 32 bit signed int
> for (i in range(0, 50*1000*1000))
> for (x, y in zipped(a, b))
> tot += x + y;
> println(tot);
> }
>
> ----------------------
>
> // program #2 (D2)
> import std.c.stdio: printf;
> import std.range: zip;
> void main() {
> auto a = [18,10,17,14,19,23,11,0,6,0,17,25,5,4,19,21,17,13,5,7,11,22,23,17,24,7,11,11,1,28];
> auto b = [9,12,1,4,1,18,11,6,5,18,24,15,26,14,24,8,17,26,23,17,3,28,27,0,9,27,0,19,13,25];
> int tot = 0;
> foreach (i; 0 .. 50_000_000)
> foreach (xy; zip(a, b))
> tot += xy[0] + xy[1];
> printf("%d\n", tot);
> }
>
> ----------------------
>
> // program #3 (D2)
> import std.c.stdio: printf;
> void main() {
> auto a = [18,10,17,14,19,23,11,0,6,0,17,25,5,4,19,21,17,13,5,7,11,22,23,17,24,7,11,11,1,28];
> auto b = [9,12,1,4,1,18,11,6,5,18,24,15,26,14,24,8,17,26,23,17,3,28,27,0,9,27,0,19,13,25];
> int tot = 0;
> foreach (i; 0 .. 50_000_000)
> foreach (j; 0 .. a.length)
> tot += a[j] + b[j];
> printf("%d\n", tot);
> }
>
> ----------------------
>
> // program #4 (D2)
> import std.c.stdio: printf;
> void main() {
> auto a = [18,10,17,14,19,23,11,0,6,0,17,25,5,4,19,21,17,13,5,7,11,22,23,17,24,7,11,11,1,28];
> auto b = [9,12,1,4,1,18,11,6,5,18,24,15,26,14,24,8,17,26,23,17,3,28,27,0,9,27,0,19,13,25];
> int tot = 0;
> foreach (i; 0 .. 50_000_000)
> foreach (j; 0 .. 30)
> tot += a[j] + b[j];
> printf("%d\n", tot);
> }
>
> ----------------------
>
> Just loops program #1:
>
> movl $50000000, %eax # imm = 0x2FAF080
> .align 16, 0x90
> LBB2_4: # %bb.nph.i.i
> # =>This Loop Header: Depth=1
> # Child Loop BB2_5 Depth 2
> xorl %edx, %edx
> .align 16, 0x90
> LBB2_5: # %return410.i.i
> # Parent Loop BB2_4 Depth=1
> # => This Inner Loop Header: Depth=2
> addl (%esi,%edx,4), %ecx
> addl (%edi,%edx,4), %ecx
> incl %edx
> cmpl $30, %edx
> jne LBB2_5
> # BB#3: # %return272.loopexit.i.i
> # in Loop: Header=BB2_4 Depth=1
> decl %eax
> jne LBB2_4
>
> ----------------------
>
> Just loops program #2:
>
> LCE: push dword ptr 018h[ESP]
> mov ESI,offset FLAT:_D3std5range14__T3ZipTAiTAiZ3Zip6__initZ
> push dword ptr 018h[ESP]
> push dword ptr 028h[ESP]
> push dword ptr 028h[ESP]
> push 0
> lea EDI,078h[ESP]
> movsd
> movsd
> movsd
> movsd
> movsd
> lea EAX,078h[ESP]
> call near ptr _D3std5range14__T3ZipTAiTAiZ3Zip6__ctorMFNcAiAiE3std5range14StoppingPolicyZS3std5range14__T3ZipTAiTAiZ3Zip
> mov ESI,EAX
> lea EDI,044h[ESP]
> movsd
> movsd
> movsd
> movsd
> movsd
> lea ESI,044h[ESP]
> lea EDI,024h[ESP]
> movsd
> movsd
> movsd
> movsd
> movsd
> lea EAX,024h[ESP]
> call near ptr _D3std5range14__T3ZipTAiTAiZ3Zip5emptyMFZb
> xor AL,1
> je L153
> L11C: lea EAX,024h[ESP]
> call near ptr _D3std5range14__T3ZipTAiTAiZ3Zip5frontMFNdZS3std8typecons14__T5TupleTiTiZ5Tuple
> mov 07Ch[ESP],EAX
> mov ECX,07Ch[ESP]
> lea EAX,024h[ESP]
> mov 080h[ESP],EDX
> add ECX,080h[ESP]
> add EBX,ECX
> call near ptr _D3std5range14__T3ZipTAiTAiZ3Zip8popFrontMFZv
> lea EAX,024h[ESP]
> call near ptr _D3std5range14__T3ZipTAiTAiZ3Zip5emptyMFZb
> xor AL,1
> jne L11C
> L153: inc EBP
> cmp EBP,02FAF080h
> jb LCE
>
>
> _D3std5range14__T3ZipTAiTAiZ3Zip6__ctorMFNcAiAiE3std5range14StoppingPolicyZS3std5range14__T3ZipTAiTAiZ3Zip comdat
> push EBX
> mov ECX,8[ESP]
> mov EBX,014h[ESP]
> push ESI
> mov ESI,EAX
> mov EDX,01Ch[ESP]
> mov 4[ESI],EDX
> mov EDX,014h[ESP]
> mov [ESI],EBX
> mov EBX,010h[ESP]
> mov 010h[ESI],ECX
> mov 8[ESI],EBX
> mov 0Ch[ESI],EDX
> pop ESI
> pop EBX
> ret 014h
>
>
> _D3std5range14__T3ZipTAiTAiZ3Zip5emptyMFZb comdat
> L0: sub ESP,05Ch
> mov ECX,010h[EAX]
> test ECX,ECX
> push EBX
> push ESI
> mov ESI,EAX
> je L21
> cmp ECX,1
> je LC3
> cmp ECX,2
> je L55
> jmp near ptr LC3
> L21: mov EBX,[ESI]
> mov EDX,4[ESI]
> mov 0Ch[ESP],EBX
> mov 010h[ESP],EDX
> cmp dword ptr 0Ch[ESP],0
> je L4A
> mov EBX,8[ESI]
> mov EDX,0Ch[ESI]
> mov 014h[ESP],EBX
> mov 018h[ESP],EDX
> cmp dword ptr 014h[ESP],0
> jne LC3
> L4A: pop ESI
> mov EAX,1
> pop EBX
> add ESP,05Ch
> ret
> L55: mov EBX,[ESI]
> mov EDX,4[ESI]
> mov ECX,offset FLAT:_D3std5range14__T3ZipTAiTAiZ3Zip5emptyMFZb14__dgliteral828MFZAxa
> mov 01Ch[ESP],EBX
> mov 020h[ESP],EDX
> mov 02Ch[ESP],EAX
> mov 030h[ESP],ECX
> cmp dword ptr 01Ch[ESP],0
> setz DL
> mov EBX,8[ESI]
> mov 024h[ESP],EBX
> mov ECX,0Ch[ESI]
> mov 028h[ESP],ECX
> cmp dword ptr 024h[ESP],0
> setz CL
> cmp DL,CL
> mov EDX,1
> je L98
> xor EDX,EDX
> L98: xor DL,1
> je LC3
> push dword ptr FLAT:_DATA[054h]
> push dword ptr FLAT:_DATA[050h]
> push 0ADEh
> mov EAX,038h[ESP]
> mov EDX,03Ch[ESP]
> mov EBX,038h[ESP]
> call EDX
> push EDX
> push EAX
> call near ptr _D3std9exception7bailOutFAyaixAaZv
> LC3: pop ESI
> xor EAX,EAX
> pop EBX
> add ESP,05Ch
> ret
>
>
> _D3std5range14__T3ZipTAiTAiZ3Zip5frontMFNdZS3std8typecons14__T5TupleTiTiZ5Tuple comdat
> assume CS:_D3std5range14__T3ZipTAiTAiZ3Zip5frontMFNdZS3std8typecons14__T5TupleTiTiZ5Tuple
> L0: sub ESP,028h
> mov EDX,4[EAX]
> push EBX
> push ESI
> mov ESI,EAX
> mov EBX,[ESI]
> push EDI
> mov 014h[ESP],EBX
> mov 018h[ESP],EDX
> cmp dword ptr 014h[ESP],0
> je L31
> lea EDX,0Ch[ESP]
> mov EBX,[ESI]
> push EDX
> mov EDX,4[ESI]
> mov EAX,[EDX]
> push 4
> call near ptr _D3std4conv16__T7emplaceTiTiZ7emplaceFAviZPi
> jmp short L41
> L31: lea ECX,0Ch[ESP]
> mov EDI,4
> push ECX
> push EDI
> call near ptr _D3std4conv14__T7emplaceTiZ7emplaceFAvZPi
> L41: mov EBX,8[ESI]
> mov ECX,0Ch[ESI]
> test EBX,EBX
> je L61
> lea EDI,010h[ESP]
> mov EDX,0Ch[ESI]
> mov EAX,8[ESI]
> push EDI
> mov EAX,[EDX]
> push 4
> call near ptr _D3std4conv16__T7emplaceTiTiZ7emplaceFAviZPi
> jmp short L71
> L61: lea ECX,010h[ESP]
> mov ESI,4
> push ECX
> push ESI
> call near ptr _D3std4conv14__T7emplaceTiZ7emplaceFAvZPi
> L71: mov EDX,010h[ESP]
> mov EAX,0Ch[ESP]
> pop EDI
> pop ESI
> pop EBX
> add ESP,028h
> ret
>
>
> _D3std5range14__T3ZipTAiTAiZ3Zip8popFrontMFZv comdat
> L0: sub ESP,04Ch
> mov ECX,010h[EAX]
> test ECX,ECX
> push EBX
> push EBP
> push ESI
> push EDI
> mov EDI,EAX
> je L1F
> cmp ECX,1
> je L4A
> cmp ECX,2
> je L8C
> jmp near ptr L142
> L1F: mov ESI,[EDI]
> mov EDX,4[EDI]
> dec ESI
> mov EBX,[EDI]
> add EDX,4
> lea EBP,8[EDI]
> mov [EDI],ESI
> mov 4[EDI],EDX
> mov ESI,0[EBP]
> mov EDX,4[EBP]
> dec ESI
> mov EBX,0[EBP]
> add EDX,4
> mov 0[EBP],ESI
> mov 4[EBP],EDX
> jmp near ptr L142
> L4A: mov ESI,[EDI]
> mov ECX,4[EDI]
> test ESI,ESI
> je L63
> mov ESI,[EDI]
> mov EDX,4[EDI]
> dec ESI
> mov EBX,[EDI]
> add EDX,4
> mov [EDI],ESI
> mov 4[EDI],EDX
> L63: mov ESI,8[EDI]
> mov ECX,0Ch[EDI]
> test ESI,ESI
> je L142
> lea EBP,8[EDI]
> mov ESI,0[EBP]
> mov EDX,4[EBP]
> dec ESI
> mov EBX,0[EBP]
> add EDX,4
> mov 0[EBP],ESI
> mov 4[EBP],EDX
> jmp near ptr L142
> L8C: mov EBX,[EDI]
> mov EDX,4[EDI]
> mov ECX,offset FLAT:_D3std5range14__T3ZipTAiTAiZ3Zip8popFrontMFZv14__dgliteral831MFZAxa
> mov 014h[ESP],EBX
> mov 018h[ESP],EDX
> mov 01Ch[ESP],EAX
> mov 020h[ESP],ECX
> cmp dword ptr 014h[ESP],0
> setnz DL
> xor DL,1
> je LD7
> mov EDX,ECX
> push dword ptr FLAT:_DATA[054h]
> push dword ptr FLAT:_DATA[050h]
> push 0B86h
> mov EAX,028h[ESP]
> mov EBX,028h[ESP]
> call EDX
> push EDX
> push EAX
> call near ptr _D3std9exception7bailOutFAyaixAaZv
> LD7: mov EAX,[EDI]
> mov EDX,4[EDI]
> dec EAX
> mov EBX,[EDI]
> add EDX,4
> mov 4[EDI],EDX
> mov EDX,0Ch[EDI]
> mov EBX,EDI
> mov [EDI],EAX
> mov EAX,8[EDI]
> mov ECX,offset FLAT:_D3std5range14__T3ZipTAiTAiZ3Zip8popFrontMFZv14__dgliteral832MFZAxa
> mov 024h[ESP],EAX
> mov 028h[ESP],EDX
> mov 02Ch[ESP],EBX
> mov 030h[ESP],ECX
> cmp dword ptr 024h[ESP],0
> setnz DL
> xor DL,1
> je L12F
> push dword ptr FLAT:_DATA[054h]
> push dword ptr FLAT:_DATA[050h]
> push 0B86h
> mov EAX,038h[ESP]
> call ECX
> push EDX
> push EAX
> call near ptr _D3std9exception7bailOutFAyaixAaZv
> L12F: lea ESI,8[EDI]
> mov EBX,[ESI]
> mov EDX,4[ESI]
> dec EBX
> mov EAX,[ESI]
> mov [ESI],EBX
> add EDX,4
> mov 4[ESI],EDX
> L142: pop EDI
> pop ESI
> pop EBP
> pop EBX
> add ESP,04Ch
> ret
>
>
> _D3std5range14__T3ZipTAiTAiZ3Zip5emptyMFZb comdat
> L0: sub ESP,05Ch
> mov ECX,010h[EAX]
> test ECX,ECX
> push EBX
> push ESI
> mov ESI,EAX
> je L21
> cmp ECX,1
> je LC3
> cmp ECX,2
> je L55
> jmp near ptr LC3
> L21: mov EBX,[ESI]
> mov EDX,4[ESI]
> mov 0Ch[ESP],EBX
> mov 010h[ESP],EDX
> cmp dword ptr 0Ch[ESP],0
> je L4A
> mov EBX,8[ESI]
> mov EDX,0Ch[ESI]
> mov 014h[ESP],EBX
> mov 018h[ESP],EDX
> cmp dword ptr 014h[ESP],0
> jne LC3
> L4A: pop ESI
> mov EAX,1
> pop EBX
> add ESP,05Ch
> ret
> L55: mov EBX,[ESI]
> mov EDX,4[ESI]
> mov ECX,offset FLAT:_D3std5range14__T3ZipTAiTAiZ3Zip5emptyMFZb14__dgliteral828MFZAxa
> mov 01Ch[ESP],EBX
> mov 020h[ESP],EDX
> mov 02Ch[ESP],EAX
> mov 030h[ESP],ECX
> cmp dword ptr 01Ch[ESP],0
> setz DL
> mov EBX,8[ESI]
> mov 024h[ESP],EBX
> mov ECX,0Ch[ESI]
> mov 028h[ESP],ECX
> cmp dword ptr 024h[ESP],0
> setz CL
> cmp DL,CL
> mov EDX,1
> je L98
> xor EDX,EDX
> L98: xor DL,1
> je LC3
> push dword ptr FLAT:_DATA[054h]
> push dword ptr FLAT:_DATA[050h]
> push 0ADEh
> mov EAX,038h[ESP]
> mov EDX,03Ch[ESP]
> mov EBX,038h[ESP]
> call EDX
> push EDX
> push EAX
> call near ptr _D3std9exception7bailOutFAyaixAaZv
> LC3: pop ESI
> xor EAX,EAX
> pop EBX
> add ESP,05Ch
> ret
>
>
> _D3std4conv16__T7emplaceTiTiZ7emplaceFAviZPi comdat
> L0: push EAX
> mov EDX,offset FLAT:_D3std4conv16__T7emplaceTiTiZ7emplaceFAviZPi14__dgliteral829MFZC6object9Throwable
> push EBX
> push ESI
> cmp dword ptr 010h[ESP],4
> sbb ECX,ECX
> inc ECX
> push ECX
> lea EBX,0Ch[ESP]
> push EDX
> push EBX
> call near ptr _D3std9exception14__T7enforceTbZ7enforceFbLC6object9ThrowableZb
> mov ECX,014h[ESP]
> mov EAX,014h[ESP]
> mov ESI,8[ESP]
> mov [ECX],ESI
> pop ESI
> pop EBX
> pop ECX
> ret 8
>
>
> (I have not included all the code, like _D3std9exception7bailOutFAyaixAaZv).
>
> ----------------------
>
> Just loops program #3:
>
> LDE: xor EBX,EBX
> cmp 010h[ESP],BL
> je LF5
> LE6: mov ECX,[EBX*4][EAX]
> add ECX,[EBX*4][EDI]
> add ESI,ECX
> inc EBX
> cmp EBX,014h[ESP]
> jb LE6
> LF5: inc EDX
> cmp EDX,02FAF080h
> jb LDE
>
> ----------------------
>
> Just loops program #4:
>
> LBF: xor EBX,EBX
> LC1: mov ECX,[EBX*4][EAX]
> add ECX,[EBX*4][EDI]
> add ESI,ECX
> inc EBX
> cmp EBX,01Eh
> jb LC1
> inc EDX
> cmp EDX,02FAF080h
> jb LBF
>
> ----------------------
More information about the Digitalmars-d
mailing list