Naive node.js faster than naive LDC2?

H. S. Teoh hsteoh at quickfur.ath.cx
Sat Aug 22 16:01:52 UTC 2020


On Sat, Aug 22, 2020 at 12:21:50AM +0000, James Lu via Digitalmars-d wrote:
[...]
> I showed with and without class.

Sorry, I missed that the first time round.  But how come your struct
version uses `real` but your class version uses `double`?  It's
well-known that `real` is slow because it uses x87 instructions, as
opposed to the SSE/etc. instructions that `double` would use.


> V8's analyzer might be superior to LDC's in removing the allocation
> overhead. I used the same compilation flags as the original:
> 
> ldc2 -release -mcpu=native -O3 -ffast-math --fp-contract=fast

I found that with -ffast-math --fp-contract=fast, performance doubled.
Since James' original struct version uses real, I decided to do a
comparison between real and double in addition to class vs. struct:

class version, with real:
	ldc2 -d-version=useClass -d-version=useReal -g -ffast-math --fp-contract=fast -mcpu=native -O3 test.d -of=test-class-native

	8 secs, 201 ms, 286 μs, and 9 hnsecs
	8 secs, 153 ms, 617 μs, and 9 hnsecs
	8 secs, 205 ms, 966 μs, and 6 hnsecs

class version, with double:
	ldc2 -d-version=useClass -d-version=useDouble -g -ffast-math --fp-contract=fast -mcpu=native -O3 test.d -of=test-class-native

	4 secs, 177 ms, 842 μs, and 3 hnsecs
	4 secs, 297 ms, 899 μs, and 6 hnsecs
	4 secs, 221 ms, 916 μs, and 7 hnsecs

struct version, with real:
	ldc2 -d-version=useStruct -d-version=useReal -g -ffast-math --fp-contract=fast -mcpu=native -O3 test.d -of=test-class-native

	3 secs, 191 ms, 21 μs, and 4 hnsecs
	3 secs, 223 ms, 692 μs, and 9 hnsecs
	3 secs, 210 ms, 429 μs, and 2 hnsecs

struct version, with double:
	ldc2 -d-version=useStruct -d-version=useDouble -g -ffast-math --fp-contract=fast -mcpu=native -O3 test.d -of=test-class-native

	2 secs, 659 ms, 309 μs, and 2 hnsecs
	2 secs, 654 ms, 96 μs, and 3 hnsecs
	2 secs, 630 ms, 84 μs, and 4 hnsecs

As you can see, using struct vs class grants almost double the
performance.  Using double with struct instead of real with struct gives
a 17% improvement.

The difference between struct and class is not surprising; allocations
are slow in general, and D generally does not do much optimization
of allocations.  Node.js being JavaScript-based, and JavaScript being
object-heavy, it's not surprising that more object lifetime analysis
would be applied to optimize allocations.

I do feel James' struct implementation was flawed, though, because of
using real instead of double, real being known to be slow on modern
hardware. (Also, comparing struct + real to class + double seems a bit
like comparing apples and oranges.)  My original modification of James'
code uses struct + double, and comparing that with struct + real showed
a 17% degradation upon switching to real.

//

As a further step, I profiled the program and found that most of the
time was being spent calling the C math library's fmax() function (which
involves an expensive PIC indirection, not to mention lack of inlining).
Writing a naïve version of fmax() in D gave the following numbers:

struct version, with double + custom fmax function:
	ldc2 -d-version=useStruct -d-version=useDouble -d-version=customFmax -g -ffast-math --fp-contract=fast -mcpu=native -O3 test.d -of=test-struct-native

	1 sec, 567 ms, 219 μs, and 6 hnsecs
	1 sec, 557 ms, 762 μs, and 7 hnsecs
	1 sec, 574 ms, 657 μs, and 7 hnsecs

This represents a whopping 40% improvement over the version calling the
C library's fmax function.

I wonder how this last version compares with the Node.js performance?


//

Code, for full disclosure (basically copy-n-pasted from James' code,
with minor modifications for testing struct vs class, real vs double):

------------------------------------
// 
import std.stdio;
import std.math;
import core.time;

// Floating-point type used for every computation in this file.  Exactly one of
// -d-version=useReal or -d-version=useDouble must be given on the command line;
// any other combination is rejected here with an explicit message instead of
// failing later with an undefined or conflicting `Num`.
version(useReal)
{
	version(useDouble)
		static assert(0, "Specify only one of -d-version=useReal and -d-version=useDouble");
	alias Num = real;
}
else version(useDouble)
	alias Num = double;
else
	static assert(0, "Specify -d-version=useReal or -d-version=useDouble to select Num");

version(customFmax)
{
	/// Naive two-way maximum: yields y only when x < y holds, otherwise x.
	/// Declared at module scope under the same name as the library routine,
	/// so the calls in this file use it when customFmax is set.
	Num fmax(Num x, Num y) {
	    if (x < y)
		return y;
	    return x;
	}
}

version(useStruct)
{
	/// Complex number held by value, with in-place add and multiply.
	struct Complex {
	    Num x;	// real component
	    Num y;	// imaginary component

	    /// Initializes both components from two arguments of one common
	    /// type A, converting each to Num on assignment.
	    this(A)(A px, A py) {
		x = px;
		y = py;
	    }

	    unittest {
		auto sample = Complex(2, 2);
		assert(sample.x == 2 && sample.y == 2);
	    }

	    /// Returns the larger of x*x and y*y.  Note that, despite the name,
	    /// this is not the modulus of the number.
	    auto abs() const {
		const xx = x * x;
		const yy = y * y;
		return fmax(xx, yy);
	    }

	    /// Adds `other` to this value, component by component, in place.
	    void add(T)(const T other) {
		x += other.x;
		y += other.y;
	    }

	    /// Multiplies this value by `other` in place.
	    void mul(T)(const T other) {
		// Form both results from the old components before storing either.
		const re = x * other.x - y * other.y;
		const im = x * other.y + y * other.x;
		x = re;
		y = im;
	    }
	}
	// (5 + 3i) * (4 + 2i) == 14 + 22i
	unittest {
	    auto product = Complex(5, 3);
	    auto factor = Complex(4, 2);
	    product.mul(factor);
	    assert(product.x == 14 && product.y == 22);
	}
	// Adding (3 + 3i) to zero gives (3 + 3i).
	unittest {
	    auto total = Complex(0, 0);
	    auto delta = Complex(3, 3);
	    total.add(delta);
	    assert(total.x == 3 && total.y == 3);
	}

	/// Repeats z = z*z + c starting from z = 0.
	///
	/// Returns the number of completed steps at the moment z.abs() is found
	/// to be >= 2.0, or maxIters when that never happens within maxIters
	/// steps.
	auto iterate_mandelbrot(const Complex c, const int maxIters) {
	    auto z = Complex(0, 0);
	    int step = 0;
	    while (step < maxIters) {
		// The escape check comes first, so a return of 0 is possible.
		if (z.abs() >= 2.0)
		    return step;
		z.mul(z);
		z.add(c);
		++step;
	    }
	    return maxIters;
	}

	// Bounds of the sampled region: x runs over [x0, x1), y over [y0, y1).
	const x0 = -2.5;
	const x1 = 1;
	const y0 = -1;
	const y1 = 1;
	// Size of the character grid that is printed.
	const cols = 72;
	const rows = 24;
	// Upper limit on iterations per sample point.
	const maxIters = 1_000_000;

	/// Prints a rows x cols character picture of the iteration counts, one
	/// text line per row, followed by the elapsed monotonic time.
	void main() {
	    auto start = MonoTime.currTime;
	    foreach (int r; 0 .. rows) {
		// Convert to Num before dividing; int / int would truncate.
		const y = (cast(Num) r / rows) * (y1 - y0) + y0;
		char[] line;
		foreach (int k; 0 .. cols) {
		    const x = (cast(Num) k / cols) * (x1 - x0) + x0;
		    const iters = iterate_mandelbrot(Complex(x, y), maxIters);
		    // Tested in this order: 0, 1, 2, never escaped, anything else.
		    const char glyph =
			iters == 0 ? '.' :
			iters == 1 ? '%' :
			iters == 2 ? '@' :
			iters == maxIters ? ' ' : '#';
		    line ~= glyph;
		}
		writeln(line);
	    }
	    writeln(MonoTime.currTime - start);
	}
}

version(useClass)
{
	/// Complex number as a reference type, with in-place add and multiply.
	class Complex {
	    Num x;	// real component
	    Num y;	// imaginary component

	    /// Initializes both components from two arguments of one common
	    /// type A, converting each to Num on assignment.
	    this(A)(A px, A py) {
		x = px;
		y = py;
	    }

	    unittest {
		auto sample = new Complex(2, 2);
		assert(sample.x == 2 && sample.y == 2);
	    }

	    /// Returns the larger of x*x and y*y.  Note that, despite the name,
	    /// this is not the modulus of the number.
	    auto abs() const {
		const xx = x * x;
		const yy = y * y;
		return fmax(xx, yy);
	    }

	    /// Adds `other` to this object, component by component, in place.
	    void add(T)(const T other) {
		x += other.x;
		y += other.y;
	    }

	    /// Multiplies this object by `other` in place.
	    void mul(T)(const T other) {
		// `other` may be this very object (z.mul(z)), so form both
		// results from the old components before storing either.
		const re = x * other.x - y * other.y;
		const im = x * other.y + y * other.x;
		x = re;
		y = im;
	    }
	}
	// (5 + 3i) * (4 + 2i) == 14 + 22i
	unittest {
	    auto product = new Complex(5, 3);
	    auto factor = new Complex(4, 2);
	    product.mul(factor);
	    assert(product.x == 14 && product.y == 22);
	}
	// Adding (3 + 3i) to zero gives (3 + 3i).
	unittest {
	    auto total = new Complex(0, 0);
	    auto delta = new Complex(3, 3);
	    total.add(delta);
	    assert(total.x == 3 && total.y == 3);
	}

	/// Repeats z = z*z + c starting from a newly allocated z = 0.
	///
	/// Returns the number of completed steps at the moment z.abs() is found
	/// to be >= 2.0, or maxIters when that never happens within maxIters
	/// steps.
	auto iterate_mandelbrot(const Complex c, const int maxIters) {
	    auto z = new Complex(0, 0);
	    int step = 0;
	    while (step < maxIters) {
		// The escape check comes first, so a return of 0 is possible.
		if (z.abs() >= 2.0)
		    return step;
		z.mul(z);
		z.add(c);
		++step;
	    }
	    return maxIters;
	}

	// Bounds of the sampled region: x runs over [x0, x1), y over [y0, y1).
	const x0 = -2.5;
	const x1 = 1;
	const y0 = -1;
	const y1 = 1;
	// Size of the character grid that is printed.
	const cols = 72;
	const rows = 24;
	// Upper limit on iterations per sample point.
	const maxIters = 1_000_000;

	/// Prints a rows x cols character picture of the iteration counts, one
	/// text line per row, followed by the elapsed monotonic time.
	void main() {
	    auto start = MonoTime.currTime;
	    foreach (int r; 0 .. rows) {
		// Convert to Num before dividing; int / int would truncate.
		const y = (cast(Num) r / rows) * (y1 - y0) + y0;
		char[] line;
		foreach (int k; 0 .. cols) {
		    const x = (cast(Num) k / cols) * (x1 - x0) + x0;
		    // A new Complex object is allocated for every sample point.
		    auto point = new Complex(x, y);
		    const iters = iterate_mandelbrot(point, maxIters);
		    // Tested in this order: 0, 1, 2, never escaped, anything else.
		    const char glyph =
			iters == 0 ? '.' :
			iters == 1 ? '%' :
			iters == 2 ? '@' :
			iters == maxIters ? ' ' : '#';
		    line ~= glyph;
		}
		writeln(line);
	    }
	    writeln(MonoTime.currTime - start);
	}
}
------------------------------------


T

-- 
MAS = Mana Ada Sistem?


More information about the Digitalmars-d mailing list