Two optimizations

Mon Jul 27 19:40:42 PDT 2009

在 Tue, 28 Jul 2009 04:51:42 +0800，bearophile <bearophileHUGS at lycos.com>  
写道:

> High-level optimizations have to be done mostly by the front-end because  
> the back-end usually doesn't know about the high-level constructs of D.
>
> So this is the right place to ask for some of such optimizations, while  
> asking for them on the LLVM or LDC IRC channels is not the best thing to  
> do (even if LDC developers usually try to do what I ask them, they are  
> very gentle).
>
> ---------------------
>
> Recently here I've shown a link to some of the optimizations done by the  
> Scala compiler (that the JavaVM back-end isn't able to perform), but  
> that post was ignored. The most important thing it does is to inline  
> most closures of a Scala program. Such optimization is important for  
> functional-style programming, because (like tail-call optimization) if  
> present allows the programmer to use certain idioms with more freedom  
> (like passing delegates to higher order functions, etc), essentially  
> allowing a different programming style. Probably this optimization isn't  
> easy to implement, it requires some code. Is someone interested in  
> adding this to D?
>
> ---------------------
>
> This comes from successive simplification of some code of mine, it's a  
> loop with a kernel over an array, a common operation in my kind of code:
>
> version(Tango) import tango.stdc.stdio: printf;
> void main() {
>     auto a = new int[30];
>     for (int x = 2; x < a.length-2; x++)
>         foreach (s; [-2, -1, 0, 1, 2])
>             a[x] += a[x + s];
>     printf("%d\n", a[5]);
> }
>
> This is the asm produced by LDC of the inner foreach loop:
>
> .LBB1_1:
> 	movl	$4294967294, 132(%esp)
> 	movl	$4294967295, 136(%esp)
> 	movl	$0, 140(%esp)
> 	movl	$1, 144(%esp)
> 	movl	$2, 148(%esp)
> 	movl	20(%esp,%eax,4), %ecx
> 	addl	12(%esp,%eax,4), %ecx
> 	movl	%ecx, 20(%esp,%eax,4)
> 	addl	16(%esp,%eax,4), %ecx
> 	movl	%ecx, 20(%esp,%eax,4)
> 	addl	%ecx, %ecx
> 	movl	%ecx, 20(%esp,%eax,4)
> 	addl	24(%esp,%eax,4), %ecx
> 	movl	%ecx, 20(%esp,%eax,4)
> 	addl	28(%esp,%eax,4), %ecx
> 	movl	%ecx, 20(%esp,%eax,4)
> 	incl	%eax
> 	cmpl	$26, %eax
>
>
> The [-2, -1, 0, 1, 2] array is immutable (and the variable 's' of the  
> foreach isn't by ref), so can't the following initializations of such  
> array moved out of the inner loop?
>
> 	movl	$4294967294, 132(%esp)
> 	movl	$4294967295, 136(%esp)
> 	movl	$0, 140(%esp)
> 	movl	$1, 144(%esp)
> 	movl	$2, 148(%esp)
>
>
> This is a variant of the same code:
>
> version(Tango) import tango.stdc.stdio: printf;
> template Tuple(T...) { alias T Tuple; }
> alias Tuple!(-2, -1, 0, 1, 2) move;
> void main() {
>     auto a = new int[30];
>     for (int x = 2; x < a.length-2; x++)
>         foreach (s; move)
>             a[x] += a[x + s];
>     printf("%d\n", a[5]);
> }
>
> The relative asm is much better:
>
> main:
> 	pushl	%edi
> 	subl	$128, %esp
> 	xorl	%eax, %eax
> 	movl	$30, %ecx
> 	leal	8(%esp), %edi
> 	rep;stosl
> 	xorl	%eax, %eax
> 	.align	16
> .LBB1_1:
> 	movl	16(%esp,%eax,4), %ecx
> 	addl	8(%esp,%eax,4), %ecx
> 	movl	%ecx, 16(%esp,%eax,4)
> 	addl	12(%esp,%eax,4), %ecx
> 	addl	%ecx, %ecx
> 	movl	%ecx, 16(%esp,%eax,4)
> 	addl	20(%esp,%eax,4), %ecx
> 	movl	%ecx, 16(%esp,%eax,4)
> 	addl	24(%esp,%eax,4), %ecx
> 	movl	%ecx, 16(%esp,%eax,4)
> 	incl	%eax
> 	cmpl	$26, %eax
> 	jne	.LBB1_1
> 	movl	28(%esp), %eax
> 	movl	%eax, 4(%esp)
> 	movl	$.str, (%esp)
> 	call	printf
> 	xorl	%eax, %eax
> 	addl	$128, %esp
> 	popl	%edi
> 	ret	$8
>
>
>
>
> This is a less reduced version of the code, that I'd like the D (LDC)  
> compiler to optimize very well:
>
> version(Tango) import tango.stdc.stdio: printf;
>
> struct P {
>     int x, y;
>     P opAdd(P o) { return P(x+o.x, y+o.y); }
> }
>
> struct Rect {
>     int lx, ly;
>     int opIn_r(P p) {
>         return p.x >= 0 && p.x < lx && p.y >= 0 && p.y < ly;
>     }
> }
>
> void main() {
>     const int SIZE = 20;
>     auto m = new int[][](SIZE, SIZE);
>     auto p = P(10, 10);
>     // there's another loop for the rows here
>     for (int i; i < SIZE; i++)
>         foreach (shift;  
> [P(-2,-1),P(-1,-2),P(1,-2),P(2,-1),P(2,1),P(1,2),P(-1,2),P(-2,1)])
>             if (shift + p in Rect(SIZE, SIZE))
>                 printf("OK\n");
> }
>
>
> Eventually the compiler can produce asm similar to (well, this uses an  
> uint to avoid testing >= 0, I don't think the LDC compiler will soon  
> learn this trick too):
>
> template Tuple(T...) { alias T Tuple; }
> alias Tuple!(-1,-2,-2,-1,+1,+2,+2,+1) movex;
> alias Tuple!(-2,-1,+1,+2,+2,+1,-1,-2) movey;
> foreach (uint i, sx; movex)
>     if (x + sx < SIZE && y + movey[i] < SIZE)
>         printf("OK\n");
>
>
> If you need more info please ask.
>
> Bye,
> bearophile

It's better to be in bugzilla. I think this is not at the highest priority  
right now.

Simple implementation idea is checking everything in a loop to see if some  
could be compile-time constants. Then rewrite the loop when further  
optimization flags are supplied to the compiler. This could mess up the  
LoC information for debugging.

-- 
使用 Opera 革命性的电子邮件客户程序: http://www.opera.com/mail/