String compare performance

Sun Nov 28 03:44:23 PST 2010

Robert Jacques:

> I've spent some time having fun this afternoon optimizing array-equals  
> using vectorization techniques. I found that vectorizing using ulongs  
> worked best on my system except with the shortest strings, where a simple  
> Duff's device edged it out. If you'd like to try it out on your data set:

Thank you for your work :-)
A version with your function, D version #8:

// D version #8
import std.file: read;
import std.c.stdio: printf;
import std.exception: assumeUnique;

bool arrayComp(bool useBitCompare=true, T)
              (const T[] a, const T[] b) pure nothrow {
    if (a.length != b.length)
        return false;

    static if (useBitCompare) {
        auto pab = cast(ubyte*)a.ptr;
        auto pbb = cast(ubyte*)b.ptr;
        if (pab is pbb)
            return true;

        auto byte_length = a.length * T.sizeof;
        auto pa_end = cast(ulong*)(pab + byte_length);

        final switch (byte_length % ulong.sizeof) {
            case 7: if (*pab++ != *pbb++) return false;
            case 6: if (*pab++ != *pbb++) return false;
            case 5: if (*pab++ != *pbb++) return false;
            case 4: if (*pab++ != *pbb++) return false;
            case 3: if (*pab++ != *pbb++) return false;
            case 2: if (*pab++ != *pbb++) return false;
            case 1: if (*pab++ != *pbb++) return false;
            case 0:
        }

        auto pa = cast(ulong*)pab;
        auto pb = cast(ulong*)pbb;

        while (pa < pa_end) {
            if (*pa++ != *pb++)
                return false;
        }
    } else { // default to a short duff's device
        auto pa = a.ptr;
        auto pb = b.ptr;
        if (pa == pb)
            return true;
        auto n  = (a.length + 3) / 4;

        final switch (a.length % 4) {
            case 0: do { if (*pa++ != *pb++) return false;
            case 3:      if (*pa++ != *pb++) return false;
            case 2:      if (*pa++ != *pb++) return false;
            case 1:      if (*pa++ != *pb++) return false;
                       } while (--n > 0);
        }
    }

    return true;
}

int test(string data) {
    int count;
    foreach (i; 0 ..  data.length - 3) {
        auto codon = data[i .. i + 3];
        if (arrayComp(codon, "TAG") || arrayComp(codon, "TGA") || arrayComp(codon, "TAA"))
            count++;
    }
    return count;
}

void main() {
    char[] data0 = cast(char[])read("data.txt");
    int n = 300;
    char[] data = new char[data0.length * n];
    for (size_t pos; pos < data.length; pos += data0.length)
        data[pos .. pos+data0.length] = data0;
    string sdata = assumeUnique(data);

    printf("%d\n", test(sdata));
}

Timings, dmd compiler, best of 4, seconds:
  D #1: 5.72
  D #4: 1.84
  D #5: 1.73
  Psy:  1.59
  D #8: 1.51
  D #7: 0.56 (like #6 without length comparisons)
  D #2: 0.55
  D #6: 0.47  
  D #3: 0.34

Your function can't be inlined because it's big, so this code isn't faster than inlined code like this generated by the compiler:
(codon.length == 3 && codon[0] == 'T' && codon[1] == 'A' && codon[2] == 'G')

Bye,
bearophile