Inlining problems again

Artur Skawina art.08.09 at gmail.com
Fri Apr 4 02:52:13 PDT 2014


Built latest gcc4-8-based gdc (fdf0c614) today; immediately ran into this:

This program:

   import std.array, std.algorithm;
   int main(string[] argv) {
      auto r = argv.filter!"a.length"().count();
      return r&255;
   }

compiles to:

00000000004052e0 <pure nothrow @property @safe bool std.array.empty!(immutable(char)[]).empty(const(immutable(char)[][]))>:
  4052e0:       48 85 ff                test   %rdi,%rdi
  4052e3:       0f 94 c0                sete   %al
  4052e6:       c3                      retq   

00000000004052f0 <_Dmain>:
  4052f0:       41 54                   push   %r12
  4052f2:       49 89 f4                mov    %rsi,%r12
  4052f5:       55                      push   %rbp
  4052f6:       48 89 fd                mov    %rdi,%rbp
  4052f9:       53                      push   %rbx
  4052fa:       48 83 ec 10             sub    $0x10,%rsp
  4052fe:       48 89 ef                mov    %rbp,%rdi
  405301:       4c 89 e6                mov    %r12,%rsi
  405304:       4c 89 e3                mov    %r12,%rbx
  405307:       e8 d4 ff ff ff          callq  4052e0 <pure nothrow @property @safe bool std.array.empty!(immutable(char)[]).empty(const(immutable(char)[][]))>
  40530c:       84 c0                   test   %al,%al
  40530e:       75 18                   jne    405328 <_Dmain+0x38>
  405310:       48 8d 45 ff             lea    -0x1(%rbp),%rax
  405314:       49 83 c4 10             add    $0x10,%r12
  405318:       48 83 3b 00             cmpq   $0x0,(%rbx)
  40531c:       75 0a                   jne    405328 <_Dmain+0x38>
  40531e:       48 89 c5                mov    %rax,%rbp
  405321:       eb db                   jmp    4052fe <_Dmain+0xe>
  405323:       0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)
  405328:       48 89 ef                mov    %rbp,%rdi
  40532b:       48 89 de                mov    %rbx,%rsi
  40532e:       45 31 e4                xor    %r12d,%r12d
  405331:       e8 aa ff ff ff          callq  4052e0 <pure nothrow @property @safe bool std.array.empty!(immutable(char)[]).empty(const(immutable(char)[][]))>
  405336:       84 c0                   test   %al,%al
  405338:       75 2a                   jne    405364 <_Dmain+0x74>
  40533a:       48 83 ed 01             sub    $0x1,%rbp
  40533e:       48 83 c3 10             add    $0x10,%rbx
  405342:       48 89 ef                mov    %rbp,%rdi
  405345:       48 89 de                mov    %rbx,%rsi
  405348:       e8 93 ff ff ff          callq  4052e0 <pure nothrow @property @safe bool std.array.empty!(immutable(char)[]).empty(const(immutable(char)[][]))>
  40534d:       84 c0                   test   %al,%al
  40534f:       74 27                   je     405378 <_Dmain+0x88>
  405351:       48 89 ef                mov    %rbp,%rdi
  405354:       48 89 de                mov    %rbx,%rsi
  405357:       49 83 c4 01             add    $0x1,%r12
  40535b:       e8 80 ff ff ff          callq  4052e0 <pure nothrow @property @safe bool std.array.empty!(immutable(char)[]).empty(const(immutable(char)[][]))>
  405360:       84 c0                   test   %al,%al
  405362:       74 d6                   je     40533a <_Dmain+0x4a>
  405364:       48 83 c4 10             add    $0x10,%rsp
  405368:       41 0f b6 c4             movzbl %r12b,%eax
  40536c:       5b                      pop    %rbx
  40536d:       5d                      pop    %rbp
  40536e:       41 5c                   pop    %r12
  405370:       c3                      retq   
  405378:       48 83 3b 00             cmpq   $0x0,(%rbx)
  40537c:       48 8d 55 ff             lea    -0x1(%rbp),%rdx
  405380:       75 cf                   jne    405351 <_Dmain+0x61>
  405382:       48 89 d5                mov    %rdx,%rbp
  405385:       eb b7                   jmp    40533e <_Dmain+0x4e>
 
That trivial std.array.empty() function is for some reason not getting
inlined, leading to the catastrophic result above.

Also checked with an old 4.6 based gdc; that one manages to inline
everything when using LTO (which the newer gdcs can't handle).

If the array is wrapped in a custom range like this:

   import std.array, std.algorithm;
   auto range(E)(E[] e) @property {
      static struct AR {
         E[] arr;

         inout(E) front() inout @property { return arr[0]; }
         bool empty() const @property { return !arr.length; }
         void popFront() { arr = arr[1..$]; }

         size_t length() @property const { return arr.length; }
      }
      return AR(e);
   }
   int main(string[] argv) {
      auto r = argv.range.filter!"a.length"().count();
      return r&255;
   }

and compiled with the same (current 4.8-based) compiler and an identical
cmdline, then the result turns into:

00000000004052e0 <_Dmain>:
  4052e0:       48 85 ff                test   %rdi,%rdi
  4052e3:       75 03                   jne    4052e8 <_Dmain+0x8>
  4052e5:       31 c0                   xor    %eax,%eax
  4052e7:       c3                      retq   
  4052e8:       48 83 3e 00             cmpq   $0x0,(%rsi)
  4052ec:       75 10                   jne    4052fe <_Dmain+0x1e>
  4052ee:       48 83 c6 10             add    $0x10,%rsi
  4052f2:       48 83 ef 01             sub    $0x1,%rdi
  4052f6:       74 ed                   je     4052e5 <_Dmain+0x5>
  4052f8:       48 83 3e 00             cmpq   $0x0,(%rsi)
  4052fc:       74 f0                   je     4052ee <_Dmain+0xe>
  4052fe:       31 c0                   xor    %eax,%eax
  405300:       eb 10                   jmp    405312 <_Dmain+0x32>
  405302:       66 0f 1f 44 00 00       nopw   0x0(%rax,%rax,1)
  405308:       48 83 c6 10             add    $0x10,%rsi
  40530c:       48 83 3e 00             cmpq   $0x0,(%rsi)
  405310:       74 04                   je     405316 <_Dmain+0x36>
  405312:       48 83 c0 01             add    $0x1,%rax
  405316:       48 83 ef 01             sub    $0x1,%rdi
  40531a:       75 ec                   jne    405308 <_Dmain+0x28>
  40531c:       0f b6 c0                movzbl %al,%eax
  40531f:       c3                      retq   

Which is much more reasonable, and shouldn't require such hacks.
I thought that inlining /templated/ functions across modules already worked;
is this a different problem, and, more importantly, is it fixable?

artur



More information about the D.gnu mailing list