String compare performance

Sat Nov 27 21:36:58 PST 2010

On Sat, 27 Nov 2010 22:08:29 -0500, bearophile <bearophileHUGS at lycos.com>  
wrote:
> I have done another test:
>
> Timings, dmd compiler, best of 4, seconds:
>   D #1: 5.72
>   D #4: 1.84
>   D #5: 1.73
>   Psy:  1.59
>   D #2: 0.55
>   D #6: 0.47
>   D #3: 0.34
>
>
> import std.file: read;
> import std.c.stdio: printf;
>
> int test(char[] data) {
>     int count;
>     foreach (i; 0 ..  data.length - 3) {
>         char[] codon = data[i .. i + 3];
>         if ((codon.length == 3 && codon[0] == 'T' && codon[1] == 'A' &&  
> codon[2] == 'G') ||
>             (codon.length == 3 && codon[0] == 'T' && codon[1] == 'G' &&  
> codon[2] == 'A') ||
>             (codon.length == 3 && codon[0] == 'T' && codon[1] == 'A' &&  
> codon[2] == 'A'))
>             count++;
>     }
>     return count;
> }
>
> void main() {
>     char[] data0 = cast(char[])read("data.txt");
>     int n = 300;
>     char[] data = new char[data0.length * n];
>     for (size_t pos; pos < data.length; pos += data0.length)
>         data[pos .. pos+data0.length] = data0;
>
>     printf("%d\n", test(data));
> }
>
>
> So when there is to compare among strings known at compile-time to be  
> small (like < 6 char), the comparison shall be replaced with inlined  
> single char comparisons. This makes the code longer so it increases code  
> cache pressure, but seeing how much slow the alternative is, I think  
> it's an improvement.
>
> (A smart compiler is even able to remove the codon.length==3 test  
> because the slice data[i..i+3] is always of length 3 (here mysteriously  
> if you remove those three length tests the program compiled with dmd  
> gets slower)).
>
> Bye,
> bearophile

Hi bearophile,
I've spent some time having fun this afternoon optimizing array-equals  
using vectorization techniques. I found that vectorizing using ulongs  
worked best on my system except with the shortest strings, where a simple  
Duff's device edged it out. If you'd like to try it out on your data set:

bool arrayComp3(bool useBitCompare = true,T)(const T[] a, const T[] b)  
pure nothrow  {
     if(a.length != b.length) return false;

     static if(useBitCompare) {
         auto pab = cast(ubyte*)a.ptr;
         auto pbb = cast(ubyte*)b.ptr;
         if(pab is pbb) return true;

         auto byte_length = a.length*T.sizeof;
         auto pa_end      = cast(ulong*)(pab+byte_length);
	    final switch (byte_length % ulong.sizeof) {
             case 7:             if(*pab++ != *pbb++ ) return false;
             case 6:             if(*pab++ != *pbb++ ) return false;
             case 5:             if(*pab++ != *pbb++ ) return false;
             case 4:             if(*pab++ != *pbb++ ) return false;
             case 3:             if(*pab++ != *pbb++ ) return false;
             case 2:             if(*pab++ != *pbb++ ) return false;
             case 1:             if(*pab++ != *pbb++ ) return false;
             case 0:
	    }
         auto pa      = cast(ulong*)pab;
         auto pb      = cast(ulong*)pbb;
         while (pa < pa_end) {
             if(*pa++ != *pb++ ) return false;
         }
     } else {                                            // default to a  
short duff's device
         auto pa = a.ptr;
         auto pb = b.ptr;
         if(pa is pb) return true;
	    auto n  = (a.length + 3) / 4;
	    final switch (a.length % 4) {
             case 0:        do {  if(*pa++ != *pb++ ) return false;
             case 3:              if(*pa++ != *pb++ ) return false;
             case 2:              if(*pa++ != *pb++ ) return false;
             case 1:              if(*pa++ != *pb++ ) return false;
                       } while (--n > 0);
	    }    }
     return true;
}