Replacing C's memcpy with a D implementation
Basile B.
b2.b2.temp.temp at gmx.gmx.com.com.com
Mon Jun 11 03:34:59 UTC 2018
On Monday, 11 June 2018 at 01:03:16 UTC, Mike Franklin wrote:
> I've modified the test based on the feedback so far, so here's
> what it looks like now:
>
> import std.datetime.stopwatch;
> import std.stdio;
> import core.stdc.string;
> import std.random;
> import std.algorithm;
>
> enum length = 4096 * 2;
>
> void init(ref ubyte[] a)
> {
> a.length = length;
>
> for(int i = 0; i < length; i++)
> {
> a[i] = uniform!ubyte;
> }
> }
>
> void verifyResults(ubyte[] a, ubyte[] b)
> {
> assert(memcmp(a.ptr, b.ptr, length) == 0);
> }
>
> void memcpyD(ubyte[] dst, ubyte[] src)
> {
> dst[] = src[];
> }
>
> void memcpyDstdAlg(ubyte[] dst, ubyte[] src)
> {
> copy(src, dst);
> }
>
> void memcpyC(ubyte[] dst, ubyte[] src)
> {
> memcpy(dst.ptr, src.ptr, length);
> }
>
> void memcpyNaive(ubyte[] dst, ubyte[] src)
> {
> for(int i = 0; i < length; i++)
> {
> dst[i] = src[i];
> }
> }
>
> void memcpyASM(ubyte[] dst, ubyte[] src)
> {
> auto s = src.ptr;
> auto d = dst.ptr;
> size_t len = length;
> asm pure nothrow @nogc
> {
> mov RSI, s;
> mov RDI, d;
> cld;
> mov RCX, len;
> rep;
> movsb;
> }
> }
>
> Duration benchmark(alias f)(ubyte[] dst, ubyte[] src, uint n)
> {
> Duration result;
> auto sw = StopWatch(AutoStart.yes);
>
> sw.reset();
> foreach (_; 0 .. n)
> {
> f(dst, src);
> }
> result = sw.peek();
>
> return result;
> }
>
> void main()
> {
> ubyte[] src;
> ubyte[] dst;
>
> // verify the integrity of the algorithm
> init(src);
> init(dst);
> memcpyD(dst, src);
> verifyResults(dst, src);
>
> init(src);
> init(dst);
> memcpyDstdAlg(dst, src);
> verifyResults(dst, src);
>
> init(src);
> init(dst);
> memcpyC(dst, src);
> verifyResults(dst, src);
>
> init(src);
> init(dst);
> memcpyNaive(dst, src);
> verifyResults(dst, src);
>
> init(src);
> init(dst);
> memcpyASM(dst, src);
> verifyResults(dst, src);
>
> // test the performance of the algorithm
> enum iterations = 1000;
> writeln("memcpyD: ", benchmark!memcpyD(dst, src,
> iterations));
> writeln("memcpyDstdAlg: ", benchmark!memcpyDstdAlg(dst,
> src, iterations));
> writeln("memcpyC: ", benchmark!memcpyC(dst, src,
> iterations));
> writeln("memcpyNaive: ", benchmark!memcpyNaive(dst, src,
> iterations));
> writeln("memcpyASM: ", benchmark!memcpyASM(dst, src,
> iterations));
> }
>
> The results on my Windows 10 machine (Intel Core i7-6700,
> 3.4GHz):
> memcpyD: 127 ╬╝s and 3 hnsecs
> memcpyDstdAlg: 195 ╬╝s and 9 hnsecs
> memcpyC: 126 ╬╝s and 7 hnsecs
> memcpyNaive: 17 ms, 974 ╬╝s, and 9 hnsecs
> memcpyASM: 122 ╬╝s and 8 hnsecs
> (Gotta love how windows displays μ)
>
> The results running on Arch Linux 64-bit in a VirtualBox on the
> same Windows 10 machine:
> memcpyD: 409 μs
> memcpyDstdAlg: 400 μs
> memcpyC: 404 μs and 4 hnsecs
> memcpyNaive: 17 ms, 251 μs, and 6 hnsecs
> memcpyASM: 162 μs and 8 hnsecs
>
> The results appear more sane now, but it seems the behavior is
> highly platform dependent. Still the ASM is doing well for my
> hardware. If I run the test multiple times, I do see a lot of
> noise in the results, but each test seems to be affected
> proportionally, so I'm gaining a little more confidence in the
> benchmark.
>
> I still need to analyze the assembly of C's memcpy (anyone know
> where I can find the source code?),
Some sources to look at:
- Default on Win32 (OMF, Digital Mars C runtime):
https://github.com/DigitalMars/dmc/blob/master/src/core/MEMCCPY.C
- Default on Linux:
https://github.com/gcc-mirror/gcc/blob/master/libgcc/memcpy.c
- Not used, but interesting (musl libc):
https://github.com/esmil/musl/blob/master/src/string/memcpy.c
More information about the Digitalmars-d mailing list