Wyverex wyverex.cypher at
Thu Aug 14 18:58:50 PDT 2008

Was messing around with SIMD/SSE stuff... didn't know how much faster 
it could be!  It's been a few years since I did any assembly.
Thought I'd just share this. Any word on adding this to the lib, or on 
compiler optimizations for this?

my results:
Parallel	 Single

SQRTPS:0.000120  FSQRT:0.001021
SQRTPS:0.000114  FSQRT:0.001026
SQRTPS:0.000114  FSQRT:0.001021
SQRTPS:0.000114  FSQRT:0.001026

codepad if you wish to play with it..

...times from codepad
SQRTPS:0.000291  FSQRT:0.000634
SQRTPS:0.000289  FSQRT:0.000632
SQRTPS:0.000300  FSQRT:0.000642
SQRTPS:0.000291  FSQRT:0.000632

//used these sites as a resource..

import std.stdio : putr = writefln;
import tango.time.StopWatch;

/**
 * Writes the element-wise square root of a into b, processing four
 * floats per iteration with the SSE SQRTPS instruction (32-bit x86
 * inline assembly).
 *
 * Params:
 *   a = source values; length must be a multiple of 4
 *   b = destination; must have the same length as a
 *
 * Throws: Exception if the lengths differ or are not a multiple of 4.
 */
void fastsqrt( float[] a, float[] b )
{
   if(a.length != b.length || a.length % 4 != 0)
     throw new Exception("fsqrt bad params!");

   // LOOP decrements ECX *before* testing it, so entering the loop with
   // a count of zero would wrap around instead of stopping.
   if(a.length == 0)
     return;

   float* pa = a.ptr, pb = b.ptr;
   uint times = a.length>>2;      // number of 4-float groups

   asm
   {
     mov ECX, times;
     mov EAX, [pa];
     mov EBX, [pb];

   Lquad:
     movups XMM0, [EAX];          // unaligned load of 4 floats
     sqrtps XMM0, XMM0;           // 4 square roots at once
     movups [EBX], XMM0;          // unaligned store of 4 results
     add EAX, 16;                 // advance both pointers by 4 floats
     add EBX, 16;
     loop Lquad;                  // --ECX, repeat while non-zero
   }
}

/**
 * Writes the element-wise square root of a into b, one float per
 * iteration, using the x87 FSQRT instruction (32-bit x86 inline
 * assembly). Serves as the scalar baseline for fastsqrt.
 *
 * Params:
 *   a = source values
 *   b = destination; must have the same length as a
 *
 * Throws: Exception if the lengths differ.
 */
void sqrt( float[] a, float[] b )
{
   if(a.length != b.length)
     throw new Exception("fsqrt bad params!");

   // LOOP decrements ECX *before* testing it, so entering the loop with
   // a count of zero would wrap around instead of stopping.
   if(a.length == 0)
     return;

   float* pa = a.ptr, pb = b.ptr;
   uint times = a.length;

   asm
   {
     mov EAX, [pa];
     mov EBX, [pb];
     mov ECX, times; //error on a.length

   Lelem:
     fld float ptr[EAX];          // push the next source value on the FPU stack
     fsqrt;                       // ST(0) = sqrt(ST(0))
     fstp float ptr[EBX];         // store as float and pop
     add EAX, 4;                  // advance both pointers by one float
     add EBX, 4;
     loop Lelem;                  // --ECX, repeat while non-zero
   }
}

/**
 * Benchmark driver: fills an array with 0 .. 39_999, times fastsqrt
 * and sqrt over it, checks that both produce identical results, and
 * prints the two timings.
 */
void main()
{
   float[40_000] a, b, c;

   foreach( k, ref i; a )
     i = cast(float)k;

   double A, B;
   StopWatch timer;

   // Restart the watch before each call so each figure covers only that call.
   timer.start;
    fastsqrt( a, b );
   A = timer.stop;

   timer.start;
    sqrt( a, c );
   B = timer.stop;

  //putr(a, "\n", b, "\n", c);

  // The packed and scalar results must agree element for element.
  foreach(k, i; b)
     assert( b[k] == c[k] );

  putr("SQRTPS:%.6f  FSQRT:%.6f", A, B);
}

More information about the Digitalmars-d-learn mailing list