[Issue 10523] New: Don't call array op functions for short vector ops
d-bugmail at puremagic.com
d-bugmail at puremagic.com
Mon Jul 1 15:37:09 PDT 2013
http://d.puremagic.com/issues/show_bug.cgi?id=10523
Summary: Don't call array op functions for short vector ops
Product: D
Version: D2
Platform: All
OS/Version: All
Status: NEW
Severity: enhancement
Priority: P2
Component: DMD
AssignedTo: nobody at puremagic.com
ReportedBy: bearophile_hugs at eml.cc
--- Comment #0 from bearophile_hugs at eml.cc 2013-07-01 15:37:08 PDT ---
The D front-end performs some vector ops like a[]+b[] with a call to functions
like __arraySliceSliceAddSliceAssign_d. For more complex array operations like
a[]+b[]+c[] there is no available function, so the D front-end replaces them
with a loop.
When the vector ops are done on fixed-size arrays that are small, the back-ends
of the gdc and ldc2 compilers are able to produce (very) efficient code from a
loop.
So I suggest replacing _all_ vector ops with a loop when the arrays have a
length known at compile time and such length is small (where small is <= 16).
This avoids killing the optimization opportunities for the ldc2 and gdc back-ends.
And it probably improves the performance of the code even for the dmd compiler for
arrays such as 4 integers or 4 doubles.
This is not a general solution for this problem, but I think it's very simple
to implement, and it significantly improves a wide class of common cases. I would
call it very low-hanging fruit.
- - - - - - - - - - -
Below there is some data to support my request.
The ldc2 compiler is generally a good enough optimizing compiler, it's able to
unroll loops, it should perform some vectorizations or some de-virtualizations,
it will be able to perform link-time optimization, etc.
For this small program:
import core.stdc.stdio: printf;
void main() {
double[4] a = void, b = void, c = void;
a[] = 1.0;
b[] = 2.0;
c[] = a[] + b[];
printf("%f %f %f %f\n", c[0], c[1], c[2], c[3]);
}
ldc2 gives, on a X86:
__Dmain:
pushl %ebp
movl %esp, %ebp
andl $-8, %esp
subl $136, %esp
movl $1072693248, 108(%esp)
movl $0, 104(%esp)
movl $1072693248, 116(%esp)
movl $0, 112(%esp)
movl $1072693248, 124(%esp)
movl $0, 120(%esp)
movl $1072693248, 132(%esp)
movl $0, 128(%esp)
movl $1073741824, 76(%esp)
movl $0, 72(%esp)
movl $1073741824, 84(%esp)
movl $0, 80(%esp)
movl $1073741824, 92(%esp)
movl $0, 88(%esp)
movl $1073741824, 100(%esp)
movl $0, 96(%esp)
leal 104(%esp), %eax
movl %eax, 20(%esp)
leal 72(%esp), %eax
movl %eax, 12(%esp)
leal 40(%esp), %eax
movl %eax, 4(%esp)
movl $4, 16(%esp)
movl $4, 8(%esp)
movl $4, (%esp)
calll __arraySliceSliceAddSliceAssign_d
movsd 40(%esp), %xmm0
movsd 48(%esp), %xmm1
movsd 56(%esp), %xmm2
movsd 64(%esp), %xmm3
movsd %xmm3, 28(%esp)
movsd %xmm2, 20(%esp)
movsd %xmm1, 12(%esp)
movsd %xmm0, 4(%esp)
movl $_.str, (%esp)
calll ___mingw_printf
xorl %eax, %eax
movl %ebp, %esp
popl %ebp
ret
The first group of movl instructions is the initialization of the two arrays "a" and "b"
(dmd calls __memset64 for this, but for such small arrays this inline code can be a bit faster):
movl $1072693248, 108(%esp)
movl $0, 104(%esp)
movl $1072693248, 116(%esp)
movl $0, 112(%esp)
movl $1072693248, 124(%esp)
movl $0, 120(%esp)
movl $1072693248, 132(%esp)
movl $0, 128(%esp)
movl $1073741824, 76(%esp)
movl $0, 72(%esp)
movl $1073741824, 84(%esp)
movl $0, 80(%esp)
movl $1073741824, 92(%esp)
movl $0, 88(%esp)
movl $1073741824, 100(%esp)
Then there is a preparation for calling a druntime function that performs the
vector op, including the array lengths:
movl $4, 16(%esp)
movl $4, 8(%esp)
movl $4, (%esp)
The call:
calll __arraySliceSliceAddSliceAssign_d
And then it prepares the call to printf (I don't know why there are 8 movsd
instead of 4):
movsd 40(%esp), %xmm0
movsd 48(%esp), %xmm1
movsd 56(%esp), %xmm2
movsd 64(%esp), %xmm3
movsd %xmm3, 28(%esp)
movsd %xmm2, 20(%esp)
movsd %xmm1, 12(%esp)
movsd %xmm0, 4(%esp)
movl $_.str, (%esp)
calll ___mingw_printf
This is a similar program that uses a loop instead of vector ops:
import core.stdc.stdio: printf;
__gshared double x0 = 1.0,
x1 = 2.0,
x2 = 3.0,
x3 = 4.0,
y0 = 10.0,
y1 = 20.0,
y2 = 30.0,
y3 = 40.0;
void main() {
double[4] a = [x0, x1, x2, x3];
double[4] b = [y0, y1, y2, y3];
double[4] c = void;
foreach (i; 0 .. 4)
c[i] = a[i] + b[i];
printf("%f %f %f %f\n", c[0], c[1], c[2], c[3]);
}
Currently LLVM has some vectorization capabilities, but apparently ldc2 is not
able to use two addpd instructions (or one vaddpd) like this:
movapd 208(%esp), %xmm0
movapd 224(%esp), %xmm1
addpd 176(%esp), %xmm0
addpd 192(%esp), %xmm1
movhpd %xmm1, 28(%esp)
movlpd %xmm1, 20(%esp)
movhpd %xmm0, 12(%esp)
movlpd %xmm0, 4(%esp)
movl $_.str, (%esp)
calll ___mingw_printf
So currently ldc2 generates code like this for the manual loop, with four
addsd:
movsd 136(%esp), %xmm0
movsd 144(%esp), %xmm1
addsd 104(%esp), %xmm0
addsd 112(%esp), %xmm1
movsd 152(%esp), %xmm2
addsd 120(%esp), %xmm2
movsd 160(%esp), %xmm3
addsd 128(%esp), %xmm3
movsd %xmm3, 28(%esp)
movsd %xmm2, 20(%esp)
movsd %xmm1, 12(%esp)
movsd %xmm0, 4(%esp)
movl $_.str, (%esp)
calll ___mingw_printf
Even four addsd are faster than the call to __arraySliceSliceAddSliceAssign_d.
I think the cause of the problem is that the back-end of ldc2 doesn't receive
the lengths of the arrays (which are 4, and known at compile time). A
simple solution is just to not call functions like
__arraySliceSliceAddSliceAssign_d when the length of the arrays is known to be
small, and just inline a small loop. The D front-end is already able to do this
when there is no run-time function available:
import core.stdc.stdio: printf;
__gshared double x0 = 1.0,
x1 = 2.0,
x2 = 3.0,
x3 = 4.0,
y0 = 10.0,
y1 = 20.0,
y2 = 30.0,
y3 = 40.0,
z0 = 100.0,
z1 = 200.0,
z2 = 300.0,
z3 = 400.0;
void main() {
double[4] a = [x0, x1, x2, x3];
double[4] b = [y0, y1, y2, y3];
double[4] c = [z0, z1, z2, z3];
double[4] d = void;
foreach (i; 0 .. 4)
d[i] = a[i] + b[i] + c[i];
printf("%f %f %f %f\n", d[0], d[1], d[2], d[3]);
}
movsd 200(%esp), %xmm0
movsd 208(%esp), %xmm1
addsd 168(%esp), %xmm0
addsd 176(%esp), %xmm1
addsd 144(%esp), %xmm1
movsd 216(%esp), %xmm2
addsd 184(%esp), %xmm2
addsd 152(%esp), %xmm2
movsd 224(%esp), %xmm3
addsd 192(%esp), %xmm3
addsd 160(%esp), %xmm3
movsd 136(%esp), %xmm4
movsd %xmm3, 28(%esp)
movsd %xmm2, 20(%esp)
movsd %xmm1, 12(%esp)
addsd %xmm0, %xmm4
movsd %xmm4, 4(%esp)
movl $_.str, (%esp)
calll ___mingw_printf
--
Configure issuemail: http://d.puremagic.com/issues/userprefs.cgi?tab=email
------- You are receiving this mail because: -------
More information about the Digitalmars-d-bugs
mailing list