Emulate 64-bit mulh instruction

Wed Mar 13 16:06:34 UTC 2019

Apparently there is no intrinsic for this, so I wrote this code for x86 to 
compute the 128-bit product:

/**
 * Computes the full 128-bit product of two unsigned 64-bit integers using
 * only 64-bit arithmetic.
 *
 * Each operand is split into 32-bit halves, a = a1 + a2*2^32 and
 * b = b1 + b2*2^32, so that
 *
 *     a*b = a1*b1 + (a1*b2 + a2*b1)*2^32 + a2*b2*2^64
 *
 * Params:
 *     a = first factor
 *     b = second factor
 *
 * Returns: a two-element array [high, low] holding the upper and the lower
 *          64 bits of the product, in that order.
 */
ulong[2] mul(ulong a, ulong b)
{
    // Carry-detecting unsigned addition from druntime; unlike the
    // compiler-specific ldc.intrinsics module it is available everywhere.
    import core.checkedint : addu;

    // Split both operands into 32-bit halves (suffix 1 = low, 2 = high).
    ulong a1 = cast(uint) a, a2 = a >> 32;
    ulong b1 = cast(uint) b, b2 = b >> 32;

    // Four 32x32 -> 64-bit partial products; each fits in 64 bits.
    ulong c1 = a1 * b1; // weight 2^0
    ulong c2 = a1 * b2; // weight 2^32
    ulong c3 = a2 * b1; // weight 2^32
    ulong c4 = a2 * b2; // weight 2^64

    // Low word: c1 plus the low halves of the two cross terms. addu only
    // ever sets its overflow flag (it never clears it), so every addition
    // gets its own freshly initialised flag to record its own carry.
    bool carry1, carry2;
    ulong d1 = addu(c1, c2 << 32, carry1);
    ulong d2 = addu(d1, c3 << 32, carry2);

    // High word: c4 plus both carries out of the low word plus the high
    // halves of the cross terms. This cannot wrap because the true product
    // is below 2^128.
    c4 += carry1;
    c4 += carry2;
    ulong d3 = c4 + (c2 >> 32);
    ulong d4 = d3 + (c3 >> 32);
    return [d4, d2];
}

but the compiler doesn't recognize it as a multiplication and 
doesn't generate a single imul instruction. Is the code wrong, or is 
the compiler unable to recognize the pattern?