Optimize away immediately-called delegate literals?
H. S. Teoh
hsteoh at quickfur.ath.cx
Mon Mar 12 17:28:15 PDT 2012
On Tue, Mar 13, 2012 at 12:15:02AM +0100, Peter Alexander wrote:
> On Sunday, 11 March 2012 at 06:49:27 UTC, H. S. Teoh wrote:
> >On Sun, Mar 11, 2012 at 01:29:01AM -0500, Nick Sabalausky wrote:
> >>Suppose you have a delegate literal and immediately call it:
> >>
> >>auto a = x + (){ doStuff(); return y; }() + z;
> >>
> >>Does DMD ever (or always?) optimize away a delegate if it's executed
> >>immediately and never stored into a variable? If not, can it, and
> >>would it be a simple change? Is something like this already on the
> >>table?
> >[...]
> >
> >I've always wondered about whether delegates passed to opApply
> >ever get
> >inlined.
>
> Don't wonder. Find out!
>
> import std.stdio;
> void doStuff() { writeln("Howdy!"); }
> void main() {
> int x = 1, y = 2, z = 3;
> auto a = x + (){ doStuff(); return y; }() + z;
> writeln(a);
> }
>
> $ dmd test.d -O -release -inline
>
> __Dmain:
> 000000010000106c pushq %rbp
> 000000010000106d movq %rsp,%rbp
> 0000000100001070 pushq %rax
> 0000000100001071 pushq %rbx
> 0000000100001072 movq $0x000000000000000c,%rdi
> 000000010000107c callq 0x1000237f0 ; symbol stub for:
> __d_allocmemory
> 0000000100001081 movq %rax,%rbx
> 0000000100001084 movq $0x00000000,(%rbx)
> 000000010000108b movl $0x00000002,0x08(%rbx)
> 0000000100001092 movq %rbx,%rdi
> 0000000100001095 call *0x0002318c(%rip)
> 000000010000109c leal 0x04(%rax),%edx
> 000000010000109f movl $0x0000000a,%esi
> 00000001000010a4 leaq 0x00033eed(%rip),%rdi
> 00000001000010ab callq 0x10002319c ; symbol stub for:
> _D3std5stdio4File14__T5writeTiTaZ5writeMFiaZv
> 00000001000010b0 xorl %eax,%eax
> 00000001000010b2 popq %rbx
> 00000001000010b3 movq %rbp,%rsp
> 00000001000010b6 popq %rbp
> 00000001000010b7 ret
>
> In short. No! It doesn't currently inline in this case.
>
> Even if the lambda just returns a constant, it doesn't get inlined.
Hmph.
I tried this code:
import std.stdio;
struct A {
int[] data;
int opApply(int delegate(ref int) dg) {
foreach (d; data) {
if (dg(d)) return 1;
}
return 0;
}
}
void main() {
A a;
int n = 0;
a.data = [1,2,3,4,5];
foreach (d; a) {
n++;
}
}
With both dmd and gdc, the delegate is never inlined. :-( Compiling
with gdc -O3 causes opApply to get inlined and loop-unrolled, but the
call to the delegate is still there. With dmd -O, even opApply is not
inlined, and the code is generally much longer per loop iteration than
gdc -O3.
Here's the code generated by gdc for the foreach() loop in main():
404839: 48 89 c3 mov %rax,%rbx
40483c: 48 8b 04 24 mov (%rsp),%rax
404840: 48 8d 74 24 3c lea 0x3c(%rsp),%rsi
404845: 48 8d 7c 24 20 lea 0x20(%rsp),%rdi
40484a: 48 89 03 mov %rax,(%rbx)
40484d: 48 8b 44 24 08 mov 0x8(%rsp),%rax
404852: 48 89 43 08 mov %rax,0x8(%rbx)
404856: 8b 44 24 10 mov 0x10(%rsp),%eax
40485a: 89 43 10 mov %eax,0x10(%rbx)
40485d: 8b 03 mov (%rbx),%eax
40485f: 89 44 24 3c mov %eax,0x3c(%rsp)
404863: ff d5 callq *%rbp
404865: 85 c0 test %eax,%eax
404867: 75 58 jne 4048c1 <_Dmain+0xd1>
404869: 8b 43 04 mov 0x4(%rbx),%eax
40486c: 48 8d 74 24 3c lea 0x3c(%rsp),%rsi
404871: 48 8d 7c 24 20 lea 0x20(%rsp),%rdi
404876: 89 44 24 3c mov %eax,0x3c(%rsp)
40487a: ff d5 callq *%rbp
40487c: 85 c0 test %eax,%eax
40487e: 75 41 jne 4048c1 <_Dmain+0xd1>
404880: 8b 43 08 mov 0x8(%rbx),%eax
404883: 48 8d 74 24 3c lea 0x3c(%rsp),%rsi
404888: 48 8d 7c 24 20 lea 0x20(%rsp),%rdi
40488d: 89 44 24 3c mov %eax,0x3c(%rsp)
404891: ff d5 callq *%rbp
404893: 85 c0 test %eax,%eax
404895: 75 2a jne 4048c1 <_Dmain+0xd1>
404897: 8b 43 0c mov 0xc(%rbx),%eax
40489a: 48 8d 74 24 3c lea 0x3c(%rsp),%rsi
40489f: 48 8d 7c 24 20 lea 0x20(%rsp),%rdi
4048a4: 89 44 24 3c mov %eax,0x3c(%rsp)
4048a8: ff d5 callq *%rbp
4048aa: 85 c0 test %eax,%eax
4048ac: 75 13 jne 4048c1 <_Dmain+0xd1>
4048ae: 8b 43 10 mov 0x10(%rbx),%eax
4048b1: 48 8d 74 24 3c lea 0x3c(%rsp),%rsi
4048b6: 48 8d 7c 24 20 lea 0x20(%rsp),%rdi
4048bb: 89 44 24 3c mov %eax,0x3c(%rsp)
4048bf: ff d5 callq *%rbp
4048c1: 48 83 c4 48 add $0x48,%rsp
4048c5: 31 c0 xor %eax,%eax
4048c7: 5b pop %rbx
4048c8: 5d pop %rbp
4048c9: c3 retq
Notice that each loop iteration is only 7 instructions, with array
elements loaded directly via an offset. The loop in opApply generated
by dmd looks like this:
08049314 <_D4test1A7opApplyMFDFKiZiZi>:
8049314: 55 push %ebp
8049315: 8b ec mov %esp,%ebp
8049317: 83 ec 18 sub $0x18,%esp
804931a: 53 push %ebx
804931b: 56 push %esi
804931c: 89 45 f8 mov %eax,-0x8(%ebp)
804931f: 83 7d f8 00 cmpl $0x0,-0x8(%ebp)
8049323: 75 1f jne 8049344 <_D4test1A7opApplyMFDFKiZiZi+0x30>
8049325: 6a 06 push $0x6
8049327: ff 35 d4 a3 05 08 pushl 0x805a3d4
804932d: ff 35 d0 a3 05 08 pushl 0x805a3d0
8049333: ff 35 ec a3 05 08 pushl 0x805a3ec
8049339: ff 35 e8 a3 05 08 pushl 0x805a3e8
804933f: e8 8c 04 00 00 call 80497d0 <_d_assert_msg>
8049344: 8b 45 f8 mov -0x8(%ebp),%eax
8049347: 8b 50 04 mov 0x4(%eax),%edx
804934a: 8b 00 mov (%eax),%eax
804934c: 89 45 e8 mov %eax,-0x18(%ebp)
804934f: 89 55 ec mov %edx,-0x14(%ebp)
8049352: c7 45 f0 00 00 00 00 movl $0x0,-0x10(%ebp)
8049359: 8b 4d f0 mov -0x10(%ebp),%ecx
804935c: 3b 4d e8 cmp -0x18(%ebp),%ecx
804935f: 73 41 jae 80493a2 <_D4test1A7opApplyMFDFKiZiZi+0x8e>
8049361: 8b 5d f0 mov -0x10(%ebp),%ebx
8049364: 3b 5d e8 cmp -0x18(%ebp),%ebx
8049367: 72 0a jb 8049373 <_D4test1A7opApplyMFDFKiZiZi+0x5f>
8049369: b8 07 00 00 00 mov $0x7,%eax
804936e: e8 b9 00 00 00 call 804942c <_D4test7__arrayZ>
8049373: 8b 55 ec mov -0x14(%ebp),%edx
8049376: 8b 45 e8 mov -0x18(%ebp),%eax
8049379: 8b 34 9a mov (%edx,%ebx,4),%esi
804937c: 89 75 f4 mov %esi,-0xc(%ebp)
804937f: 8d 4d f4 lea -0xc(%ebp),%ecx
8049382: 51 push %ecx
8049383: 8b 45 08 mov 0x8(%ebp),%eax
8049386: 8b 55 0c mov 0xc(%ebp),%edx
8049389: 8b 5d 08 mov 0x8(%ebp),%ebx
804938c: ff d2 call *%edx
804938e: 85 c0 test %eax,%eax
8049390: 74 0b je 804939d <_D4test1A7opApplyMFDFKiZiZi+0x89>
8049392: b8 01 00 00 00 mov $0x1,%eax
8049397: 5e pop %esi
8049398: 5b pop %ebx
8049399: c9 leave
804939a: c2 08 00 ret $0x8
804939d: ff 45 f0 incl -0x10(%ebp)
80493a0: eb b7 jmp 8049359 <_D4test1A7opApplyMFDFKiZiZi+0x45>
80493a2: 31 c0 xor %eax,%eax
80493a4: 5e pop %esi
80493a5: 5b pop %ebx
80493a6: c9 leave
80493a7: c2 08 00 ret $0x8
80493aa: 90 nop
80493ab: 90 nop
The loop body is significantly longer, about 22 instructions when the
exit branch isn't taken, and includes a call to a bounds check routine
per iteration.
I suppose the gdc case has been heavily optimized by gcc's optimizing
backend. :-) Though I'm kinda disappointed that it didn't inline the
trivial delegate. Or is that because of the way the front-end generates
the AST?
T
--
Nothing in the world is more distasteful to a man than to take the path
that leads to himself. -- Herman Hesse
More information about the Digitalmars-d
mailing list