Naive node.js faster than naive LDC2?
H. S. Teoh
hsteoh at quickfur.ath.cx
Sat Aug 22 16:01:52 UTC 2020
On Sat, Aug 22, 2020 at 12:21:50AM +0000, James Lu via Digitalmars-d wrote:
[...]
> I showed with and without class.
Sorry, I missed that the first time round. But how come your struct
version uses `real` but your class version uses `double`? It's
well-known that `real` is slow because it uses x87 instructions, as
opposed to the SSE/etc. instructions that `double` would use.
> V8's analyzer might be superior to LDC's in removing the allocation
> overhead. I used the same compilation flags as the original:
>
> ldc2 -release -mcpu=native -O3 -ffast-math --fp-contract=fast
I found that with -ffast-math --fp-contract=fast, performance doubled.
Since James' original struct version uses real, I decided to do a
comparison between real and double in addition to class vs. struct:
class version, with real:
ldc2 -d-version=useClass -d-version=useReal -g -ffast-math --fp-contract=fast -mcpu=native -O3 test.d -of=test-class-native
8 secs, 201 ms, 286 μs, and 9 hnsecs
8 secs, 153 ms, 617 μs, and 9 hnsecs
8 secs, 205 ms, 966 μs, and 6 hnsecs
class version, with double:
ldc2 -d-version=useClass -d-version=useDouble -g -ffast-math --fp-contract=fast -mcpu=native -O3 test.d -of=test-class-native
4 secs, 177 ms, 842 μs, and 3 hnsecs
4 secs, 297 ms, 899 μs, and 6 hnsecs
4 secs, 221 ms, 916 μs, and 7 hnsecs
struct version, with real:
ldc2 -d-version=useStruct -d-version=useReal -g -ffast-math --fp-contract=fast -mcpu=native -O3 test.d -of=test-class-native
3 secs, 191 ms, 21 μs, and 4 hnsecs
3 secs, 223 ms, 692 μs, and 9 hnsecs
3 secs, 210 ms, 429 μs, and 2 hnsecs
struct version, with double:
ldc2 -d-version=useStruct -d-version=useDouble -g -ffast-math --fp-contract=fast -mcpu=native -O3 test.d -of=test-class-native
2 secs, 659 ms, 309 μs, and 2 hnsecs
2 secs, 654 ms, 96 μs, and 3 hnsecs
2 secs, 630 ms, 84 μs, and 4 hnsecs
As you can see, using struct vs class grants almost double the
performance. Using double with struct instead of real with struct gives
a 17% improvement.
The difference between struct and class is not surprising; allocations
are slow in general, and D generally does not do much optimization
of allocations. Node.js being JavaScript-based, and JavaScript being
object-heavy, it's not surprising that more object lifetime analysis
would be applied to optimize allocations.
I do feel James' struct implementation was flawed, though, because of
using real instead of double, real being known to be slow on modern
hardware. (Also, comparing struct + real to class + double seems a bit
like comparing apples and oranges.) My original modification of James'
code uses struct + double, and comparing that with struct + real showed
a 17% degradation upon switching to real.
//
As a further step, I profiled the program and found that most of the
time was being spent calling the C math library's fmax() function (which
involves an expensive PIC indirection, not to mention lack of inlining).
Writing a naïve version of fmax() in D gave the following numbers:
struct version, with double + custom fmax function:
ldc2 -d-version=useStruct -d-version=useDouble -d-version=customFmax -g -ffast-math --fp-contract=fast -mcpu=native -O3 test.d -of=test-struct-native
1 sec, 567 ms, 219 μs, and 6 hnsecs
1 sec, 557 ms, 762 μs, and 7 hnsecs
1 sec, 574 ms, 657 μs, and 7 hnsecs
This represents a whopping 40% improvement over the version calling the
C library's fmax function.
I wonder how this last version compares with the Node.js performance?
//
Code, for full disclosure (basically copy-n-pasted from James' code,
with minor modifications for testing struct vs class, real vs double):
------------------------------------
//
import std.stdio;
import std.math;
import core.time;
// Floating-point type used for every coordinate in the benchmark.
//   -d-version=useReal   selects `real`
//   -d-version=useDouble selects `double`
// When neither flag is given, `double` is used so that the file still
// compiles; previously `Num` was simply undefined in that case.
version(useReal)
{
    // Selecting both types at once is a build mistake; report it clearly
    // instead of letting two conflicting aliases produce an opaque error.
    version(useDouble)
        static assert(false,
            "Specify only one of -d-version=useReal and -d-version=useDouble.");
    alias Num = real;
}
else
{
    alias Num = double;
}
version(customFmax)
{
    // Hand-written replacement for the library fmax, enabled with
    // -d-version=customFmax. Returns `y` only when `x < y` holds and
    // returns `x` in every other case.
    Num fmax(Num x, Num y)
    {
        if (x < y)
            return y;
        return x;
    }
}
version(useStruct)
{
// Complex number held by value, with components of type `Num`.
struct Complex
{
    Num x; // real component
    Num y; // imaginary component

    // Builds a value from two arguments that share the same type `A`.
    this(A)(A px, A py)
    {
        x = px;
        y = py;
    }

    unittest
    {
        auto sample = Complex(2, 2);
        assert(sample.x == 2);
        assert(sample.y == 2);
    }

    // Returns the larger of x*x and y*y.
    // NOTE: despite the name this is not the Euclidean magnitude; the
    // escape test in iterate_mandelbrot relies on this exact quantity.
    auto abs() const
    {
        return fmax(x * x, y * y);
    }

    // Adds `other` to this value component-wise, in place.
    void add(T)(const T other)
    {
        x += other.x;
        y += other.y;
    }

    // Multiplies this value by `other` in place (complex product).
    void mul(T)(const T other)
    {
        // Both products are formed before either field is overwritten.
        const re = x * other.x - y * other.y;
        const im = x * other.y + y * other.x;
        x = re;
        y = im;
    }
}
// (5 + 3i) * (4 + 2i) == 14 + 22i
unittest
{
    auto product = Complex(5, 3);
    product.mul(Complex(4, 2));
    assert(product.x == 14);
    assert(product.y == 22);
}

// (0 + 0i) + (3 + 3i) == 3 + 3i
unittest
{
    auto sum = Complex(0, 0);
    sum.add(Complex(3, 3));
    assert(sum.x == 3);
    assert(sum.y == 3);
}
// Iterates z = z*z + c starting from z = 0.
// Returns the index of the first iteration at which z.abs() >= 2.0,
// or `maxIters` when that never happens within `maxIters` iterations.
auto iterate_mandelbrot(const Complex c, const int maxIters)
{
    auto z = Complex(0, 0);
    foreach (int i; 0 .. maxIters)
    {
        if (z.abs() >= 2.0)
            return i;
        z.mul(z);
        z.add(c);
    }
    return maxIters;
}
// Bounds of the sampled region: columns span x0..x1, rows span y0..y1.
const x0 = -2.5;
const x1 = 1;
const y0 = -1;
const y1 = 1;
// Size of the character grid that main prints.
const cols = 72;
const rows = 24;
// Iteration cap handed to iterate_mandelbrot for every cell.
const maxIters = 1_000_000;
// Prints a rows x cols character rendering, one text line per row,
// followed by the elapsed monotonic time for the whole run.
void main()
{
    // Picks the character drawn for a cell from its iteration count.
    static char glyphFor(const int count)
    {
        if (count == 0) return '.';
        if (count == 1) return '%';
        if (count == 2) return '@';
        if (count == maxIters) return ' ';
        return '#';
    }

    const start = MonoTime.currTime;
    // The counters are of type Num so that the divisions below are
    // floating-point rather than integer divisions.
    for (Num row = 0; row < rows; row++)
    {
        const y = (row / rows) * (y1 - y0) + y0;
        char[] line;
        for (Num col = 0; col < cols; col++)
        {
            const x = (col / cols) * (x1 - x0) + x0;
            auto point = Complex(x, y);
            auto iters = iterate_mandelbrot(point, maxIters);
            line ~= glyphFor(iters);
        }
        writeln(line);
    }
    const elapsed = MonoTime.currTime - start;
    writeln(elapsed);
}
}
version(useClass)
{
// Complex number as a class (reference type, created with `new`), with
// components of type `Num`.
class Complex
{
    Num x; // real component
    Num y; // imaginary component

    // Builds an instance from two arguments that share the same type `A`.
    this(A)(A px, A py)
    {
        x = px;
        y = py;
    }

    unittest
    {
        auto sample = new Complex(2, 2);
        assert(sample.x == 2);
        assert(sample.y == 2);
    }

    // Returns the larger of x*x and y*y.
    // NOTE: despite the name this is not the Euclidean magnitude; the
    // escape test in iterate_mandelbrot relies on this exact quantity.
    auto abs() const
    {
        return fmax(x * x, y * y);
    }

    // Adds `other` to this object component-wise, in place.
    void add(T)(const T other)
    {
        x += other.x;
        y += other.y;
    }

    // Multiplies this object by `other` in place (complex product).
    void mul(T)(const T other)
    {
        // Both products are formed before either field is overwritten,
        // so `other` may be the very object being modified.
        const re = x * other.x - y * other.y;
        const im = x * other.y + y * other.x;
        x = re;
        y = im;
    }
}
// (5 + 3i) * (4 + 2i) == 14 + 22i
unittest
{
    auto product = new Complex(5, 3);
    product.mul(new Complex(4, 2));
    assert(product.x == 14);
    assert(product.y == 22);
}

// (0 + 0i) + (3 + 3i) == 3 + 3i
unittest
{
    auto sum = new Complex(0, 0);
    sum.add(new Complex(3, 3));
    assert(sum.x == 3);
    assert(sum.y == 3);
}
// Iterates z = z*z + c starting from a newly allocated z = 0.
// Returns the index of the first iteration at which z.abs() >= 2.0,
// or `maxIters` when that never happens within `maxIters` iterations.
auto iterate_mandelbrot(const Complex c, const int maxIters)
{
    auto z = new Complex(0, 0);
    foreach (int i; 0 .. maxIters)
    {
        if (z.abs() >= 2.0)
            return i;
        z.mul(z); // squares z in place; z is passed as its own operand
        z.add(c);
    }
    return maxIters;
}
// Bounds of the sampled region: columns span x0..x1, rows span y0..y1.
const x0 = -2.5;
const x1 = 1;
const y0 = -1;
const y1 = 1;
// Size of the character grid that main prints.
const cols = 72;
const rows = 24;
// Iteration cap handed to iterate_mandelbrot for every cell.
const maxIters = 1_000_000;
// Prints a rows x cols character rendering, one text line per row,
// followed by the elapsed monotonic time for the whole run.
void main()
{
    // Picks the character drawn for a cell from its iteration count.
    static char glyphFor(const int count)
    {
        if (count == 0) return '.';
        if (count == 1) return '%';
        if (count == 2) return '@';
        if (count == maxIters) return ' ';
        return '#';
    }

    const start = MonoTime.currTime;
    // The counters are of type Num so that the divisions below are
    // floating-point rather than integer divisions.
    for (Num row = 0; row < rows; row++)
    {
        const y = (row / rows) * (y1 - y0) + y0;
        char[] line;
        for (Num col = 0; col < cols; col++)
        {
            const x = (col / cols) * (x1 - x0) + x0;
            auto point = new Complex(x, y);
            auto iters = iterate_mandelbrot(point, maxIters);
            line ~= glyphFor(iters);
        }
        writeln(line);
    }
    const elapsed = MonoTime.currTime - start;
    writeln(elapsed);
}
}
------------------------------------
T
--
MAS = Mana Ada Sistem?
More information about the Digitalmars-d
mailing list