Implementing Half Floats in D

Tue Jan 29 04:31:01 PST 2013

On Monday, 28 January 2013 at 23:11:11 UTC, Walter Bright wrote:
> http://www.drdobbs.com/cpp/implementing-half-floats-in-d/240146674


Since it got lost in the old thread on this topic, I'll repost my 
versions of floatToShort and shortToFloat, which are extremely 
fast (no unpredictable branches, no lookup tables) and respect 
the current rounding mode:

-----------------

/**
 * Converts the raw bit pattern of a half-precision float to a float.
 *
 * The 16 input bits are widened in place (shift left by 13) and the exponent
 * is then re-biased. Zero/subnormal inputs and infinity/NaN inputs get
 * special treatment.
 *
 * EXPMASK and FEXPMASK are defined outside this function; presumably they
 * are the exponent masks of the half (0x7C00) and float (0x7F80_0000)
 * formats -- confirm against their definitions.
 *
 * Params:
 *      s = raw bits of the half-precision value
 *
 * Returns: the float whose bits were built from s
 */
float shortToFloat(ushort s)
{
     // Note this is a signed shift, so the sign bit gets smeared all the way
     // up into the top bit of the int.
     uint u = ((cast(int)cast(short)s) << 13);

     if ( (s & EXPMASK) == 0 )
     {   // Subnormal or 0.
         // The simple conversion (re-bias by 0x3800_0000) is wrong in two ways:
         // (1) it adds an implicit bit, which would have the value 0x1p-15;
         // (2) the mantissa bits end up one binary place too low, so the
         //     result would need to be multiplied by 2.
         // Both are fixed by re-biasing with 0x3880_0000 instead (0x3880_0000
         // is the bit pattern of 0x1p-14), which doubles the value, and then
         // subtracting the implicit bit, which now has the value 0x1p-14.

         // Drop the smeared sign bits; only the mantissa is left in u here.
         uint v = (u & 0x0FFF_FFFF ) + 0x3880_0000;
         float f = *cast(float *)&v - 0x1p-14;
         // Put the sign back on top of the magnitude.
         u = (u & 0x8000_0000) | *cast(uint *)&f;
         return *cast(float *)&u;
     }

     // Keep the sign (bit 31) and exponent/mantissa (bits 27..0), discard the
     // smeared copies of the sign in bits 30..28, then re-bias the exponent.
     u = (u & 0x8FFF_FFFF) + 0x3800_0000;

     if ( (s & EXPMASK) == EXPMASK )
     {   // infinity or NaN: force every exponent bit of the float to 1
         u |= FEXPMASK;
     }
     return *cast(float *)&u;
}


-----------------
NOTE: this only works with the 64-bit runtime. For 32-bit code or 
CTFE, which use 80-bit intermediates, the constants need to be 
changed. Unfortunately, I don't know of a nice way to detect the 
size of the intermediates.
-----

/**
 * Converts a float to the raw bit pattern of a half-precision float.
 *
 * The mantissa is shortened with floating-point arithmetic, so that step
 * follows the current rounding mode. Values too large for a half overflow
 * to infinity; values too small underflow.
 *
 * The arithmetic relies on intermediates being kept at float precision.
 * With 80-bit intermediates (32-bit code, CTFE) the constants would need to
 * be changed.
 *
 * FEXPMASK is defined outside this function; presumably it is the float
 * exponent mask 0x7F80_0000 -- confirm against its definition.
 *
 * NOTE(review): for results in the half subnormal range the final `>> 13`
 * appears to truncate the bits below the half mantissa rather than round
 * them (e.g. 1.5 * 0x1p-24 yields 1, not 2) -- confirm whether that is
 * acceptable for callers.
 *
 * Params:
 *      f = value to convert
 *
 * Returns: raw bits of the half-precision result
 */
ushort floatToShort(float f)
{
     // Remember the sign
     uint x = *cast(uint *)&f;

     ushort sgn = (x >> 16) & 0x8000;

     // Need to treat NaN and Inf specially, otherwise the
     // mantissa shortening step would generate a new NaN.
     if ( (x & FEXPMASK) == FEXPMASK)
     {
         ushort h = ((x >> 13) & 0x7FFF) | sgn;

         // A NaN whose payload lives only in the 13 mantissa bits that were
         // just shifted out would turn into infinity. Set a mantissa bit so
         // that a NaN stays a NaN.
         if ( (x & 0x007F_FFFF) != 0 && (h & 0x03FF) == 0 )
             h |= 0x0200;
         return h;
     }

     // Shorten the mantissa, rounding it according to the current
     // rounding mode

     f = (f * (1.0f + 0x1p-13f) -f) * 0x1p13f;

     // Force large numbers to overflow by moving near float.max

     f *= 0x1p112f;
     f *= 0x1p-112f; // Then undo it

     // Force small numbers to underflow, and shift into position

     f *= 0x1p-112f;

     uint u = *cast(uint *)&f;

     return ((u>>13) & 0x7FFF) | sgn;
}