Replacing C's memcpy with a D implementation

Mon Jun 11 01:03:16 UTC 2018

I've modified the test based on the feedback so far, so here's 
what it looks like now:

import std.datetime.stopwatch;
import std.stdio;
import core.stdc.string;
import std.random;
import std.algorithm;

// Size in bytes of the test buffers (4096 * 2 = 8192); read by init() and by
// the copy/verify routines below.
enum length = 4096 * 2;

/// Resizes `a` to `length` bytes and overwrites every byte with a fresh
/// random value drawn from `uniform!ubyte`.
void init(ref ubyte[] a)
{
     a.length = length;

     // Walk the buffer by reference so each element is assigned in place.
     foreach (ref octet; a)
         octet = uniform!ubyte;
}

/**
 * Verifies that two buffers hold identical contents.
 *
 * Params:
 *   a = first buffer
 *   b = second buffer
 *
 * Throws: Exception (via `enforce`) when the buffers differ in length or in
 * any byte.
 */
void verifyResults(ubyte[] a, ubyte[] b)
{
     import std.exception : enforce;

     // Array equality checks the lengths as well as the contents, so the
     // comparison never reads past the end of either slice.  `enforce` is
     // used instead of `assert` so the check still runs when the benchmark
     // is compiled with -release.
     enforce(a == b, "verifyResults: buffers differ");
}

/// Copies `src` into `dst` using D's built-in slice assignment.
void memcpyD(ubyte[] dst, ubyte[] src)
{
     // Whole-slice assignment, spelled with explicit bounds.
     dst[0 .. $] = src[0 .. $];
}

/// Copies `src` into `dst` by delegating to `std.algorithm`'s `copy`.
void memcpyDstdAlg(ubyte[] dst, ubyte[] src)
{
     // The remainder of the target range returned by `copy` is not needed.
     src.copy(dst);
}

/**
 * Copies `src` into `dst` by calling the C runtime's `memcpy`.
 *
 * The number of bytes copied is taken from the slices themselves rather than
 * from the module-level `length` constant, so the routine works for buffers
 * of any size and never touches memory outside the slices it was given.
 *
 * Params:
 *   dst = destination buffer; must have the same length as `src`
 *   src = source buffer
 */
void memcpyC(ubyte[] dst, ubyte[] src)
{
     assert(dst.length == src.length, "memcpyC: buffer lengths differ");
     memcpy(dst.ptr, src.ptr, src.length);
}

/**
 * Copies `src` into `dst` one byte at a time with a plain index loop.
 *
 * The loop bound is the length of the slices themselves rather than the
 * module-level `length` constant, so the routine works for buffers of any
 * size.
 *
 * Params:
 *   dst = destination buffer; must have the same length as `src`
 *   src = source buffer
 */
void memcpyNaive(ubyte[] dst, ubyte[] src)
{
     assert(dst.length == src.length, "memcpyNaive: buffer lengths differ");

     foreach (i; 0 .. src.length)
     {
         dst[i] = src[i];
     }
}

/**
 * Copies `src` into `dst` with the x86-64 `rep movsb` string instruction.
 *
 * The byte count is taken from the slices themselves rather than from the
 * module-level `length` constant, so the routine works for buffers of any
 * size.  The inline assembly uses 64-bit registers (RSI/RDI/RCX) and
 * therefore only builds for x86-64 targets.
 *
 * Params:
 *   dst = destination buffer; must have the same length as `src`
 *   src = source buffer
 */
void memcpyASM(ubyte[] dst, ubyte[] src)
{
     assert(dst.length == src.length, "memcpyASM: buffer lengths differ");

     auto s = src.ptr;
     auto d = dst.ptr;
     size_t len = src.length;
     asm pure nothrow @nogc
     {
         mov RSI, s;     // source address
         mov RDI, d;     // destination address
         cld;            // clear the direction flag: addresses count upward
         mov RCX, len;   // number of bytes to move
         rep;
         movsb;
     }
}

/**
 * Times `n` back-to-back calls of the copy routine `f`.
 *
 * Params:
 *   f   = copy routine to measure, invoked as `f(dst, src)`
 *   dst = destination buffer handed to every call
 *   src = source buffer handed to every call
 *   n   = number of calls to make
 *
 * Returns: the stopwatch reading taken right after the last call.
 */
Duration benchmark(alias f)(ubyte[] dst, ubyte[] src, uint n)
{
     auto timer = StopWatch(AutoStart.yes);

     // Zero the elapsed time immediately before the measured loop.
     timer.reset();
     for (uint run = 0; run < n; ++run)
         f(dst, src);

     return timer.peek();
}

/// Entry point: checks every copy routine once on freshly randomised
/// buffers, then prints the time each routine takes for a fixed number of
/// copies.
void main()
{
     ubyte[] src;
     ubyte[] dst;

     // verify the integrity of each algorithm, in the same order in which
     // they are benchmarked below
     alias Copier = void function(ubyte[], ubyte[]);
     Copier[5] copiers = [
         &memcpyD,
         &memcpyDstdAlg,
         &memcpyC,
         &memcpyNaive,
         &memcpyASM,
     ];
     foreach (copier; copiers)
     {
         // fresh random contents for both buffers before every check
         init(src);
         init(dst);
         copier(dst, src);
         verifyResults(dst, src);
     }

     // test the performance of each algorithm
     enum iterations = 1000;
     writeln("memcpyD: ", benchmark!memcpyD(dst, src, iterations));
     writeln("memcpyDstdAlg: ", benchmark!memcpyDstdAlg(dst, src, iterations));
     writeln("memcpyC: ", benchmark!memcpyC(dst, src, iterations));
     writeln("memcpyNaive: ", benchmark!memcpyNaive(dst, src, iterations));
     writeln("memcpyASM: ", benchmark!memcpyASM(dst, src, iterations));
}

The results on my Windows 10 machine (Intel Core i7-6700, 3.4GHz):
memcpyD: 127 ╬╝s and 3 hnsecs
memcpyDstdAlg: 195 ╬╝s and 9 hnsecs
memcpyC: 126 ╬╝s and 7 hnsecs
memcpyNaive: 17 ms, 974 ╬╝s, and 9 hnsecs
memcpyASM: 122 ╬╝s and 8 hnsecs
(Gotta love how windows displays μ)

The results running on Arch Linux 64-bit in a VirtualBox on the 
same Windows 10 machine:
memcpyD: 409 μs
memcpyDstdAlg: 400 μs
memcpyC: 404 μs and 4 hnsecs
memcpyNaive: 17 ms, 251 μs, and 6 hnsecs
memcpyASM: 162 μs and 8 hnsecs

The results appear more sane now, but it seems the behavior is 
highly platform-dependent.  Still, the ASM is doing well on my 
hardware.  If I run the test multiple times, I do see a lot of 
noise in the results, but each test seems to be affected 
proportionally, so I'm gaining a little more confidence in the 
benchmark.

I still need to analyze the assembly of C's memcpy (anyone know 
where I can find the source code?), test on more platforms, and 
test varying sizes, but I'm just collecting some initial data 
right now, to learn how to proceed.

I'd be interested in those with other platforms reporting back 
their results for their hardware, and of course suggestions for 
how to meet or beat C's memcpy with a pure D implementation.

Thanks for all the feedback so far.

Mike