Cross module inlining in runtime

Artur Skawina art.08.09 at gmail.com
Tue Jan 10 15:38:44 PST 2012


On 01/11/12 00:30, Iain Buclaw wrote:
> On 10 January 2012 19:49, Artur Skawina <art.08.09 at gmail.com> wrote:
>>> I have porting the runtime/phobos asms to gcc asm on my to-do list, will
>>> try to get to that within two weeks. What would be the preferred way -
>>> version() guards? if yes - what version? Or would you prefer replacing
>>> the asms, if the changes are not going to be merged upstream anyway?
>>
>> So I decided to start with this today. As I have a case where turning on
>> logging increases a program's run time from seconds to hours, while it
>> spends most of the time in the GC, I thought gcbits would be a good place
>> to start.
>>
>> But after adding gdc asm support to GCBits.testClear() the only thing that
>> changed was this:
>>
>> XXXXXXXX <uint gc.gcbits.GCBits.testClear(uint)>:
>>                push   %eRX
>>                mov    %eRX,%eRX
>>                mov    XX(%eRX),%eRX
>> -               push   %eRX
>> -               mov    %eRX,%eRX
>> -               shr    $0x5,%eRX
>> -               lea    XX(,%eRX,4),%eRX
>> -               mov    XX(%eRX),%eRX
>> -               add    (%eRX),%eRX
>> -               mov    $0x1,%eRX
>> -               shl    %Rl,%eRX
>> -               mov    %eRX,%eRX
>>                mov    (%eRX),%eRX
>> -               not    %eRX
>> -               and    %eRX,%eRX
>> -               and    %eRX,%eRX
>> -               mov    %eRX,(%eRX)
>> -               pop    %eRX
>> +               mov    XX(%eRX),%eRX
>> +               btr    %eRX,XX(%eRX)
>> +               sbb    %eRX,%eRX
>>                pop    %eRX
>>                ret
>>
>> OK, the function turned into ~ three instructions, good, but why didn't it
>> then get inlined into any of the callers? Trying to force things with an
>> attribute turned up this:
>>
> 
> four instructions. :~)

I'm already imagining the inlined case, where the "mov" could be free. :)


>> ../../../libphobos/gc/gcx.d: In member function 'gc.gcx.Gcx.fullcollect':
>> BUILD32/gdc/dev/gcc-4.6.1/libphobos/gc/gcbits.d:119:0: sorry, unimplemented: inlining failed in call to 'testClear': function body not available
>> ../../../libphobos/gc/gcx.d:2647:0: sorry, unimplemented: called from here
>> BUILD32/gdc/dev/gcc-4.6.1/libphobos/gc/gcbits.d:119:0: sorry, unimplemented: inlining failed in call to 'testClear': function body not available
>> ../../../libphobos/gc/gcx.d:2729:0: sorry, unimplemented: called from here
>> make[3]: *** [gc/gcx.o] Error 1
>>
>> Any way to make this work? Most of the asm gains will be lost when the code
>> isn't inlined.

> How is the function written?

--- druntime/gc/gcbits.d.org	2012-01-10 19:56:11.580039157 +0100
+++ druntime/gc/gcbits.d	2012-01-10 23:19:50.046264596 +0100
@@ -29,7 +29,10 @@ version (DigitalMars)
 }
 else version (GNU)
 {
-    // use the unoptimized version
+    version(X86)
+       version = GNU_Asm_x86;
+    version(X86_64)
+       version = GNU_Asm_x86;
 }
 else version (D_InlineAsm_X86)
 {
@@ -115,6 +118,7 @@ struct GCBits
         data[1 + (i >> BITS_SHIFT)] &= ~(BITS_1 << (i & BITS_MASK));
     }
 
+    //pragma(attribute, always_inline)
     wordtype testClear(size_t i)
     {
         version (bitops)
@@ -133,6 +137,19 @@ struct GCBits
                 ret     4               ;
             }
         }
+        else version (GNU_Asm_x86)
+        {
+	    wordtype result = void;
+            asm
+            {
+	       "btr %2, %0; sbb %1,%1"
+	       : "+m" *(data+1), "=r" result
+	       : "Ir" i
+	       : "memory" ;
+	       
+            }
+	    return result;
+        }
         else
         {
             //result = (cast(bit *)(data + 1))[i];
@@ -164,6 +181,19 @@ struct GCBits
                 ret     4               ;
             }
         }
+        else version (GNU_Asm_x86)
+        {
+	    wordtype result = void;
+            asm
+            {
+	       "bts %2, %0; sbb %1,%1"
+	       : "+m" *(data+1), "=r" result
+	       : "Ir" i
+	       : "memory" ;
+	       
+            }
+	    return result;
+        }
         else
         {
             //result = (cast(bit *)(data + 1))[i];


More information about the D.gnu mailing list