D1: UTF8 char[] casting to wchar[] array cast misalignment ERROR

jicman via Digitalmars-d-learn digitalmars-d-learn at puremagic.com
Mon Jun 16 19:27:41 PDT 2014


Greetings!

I have a bunch of files plain ASCII, UTF8 and UTF16 with and 
without BOM (Byte Order Mark).  I had, "I thought", a nice way of 
figuring out what type of encoding the file was (ASCII, UTF8 or 
UTF16) when the BOM was missing, by reading the content and 
applying the std.utf.validate function to the char[] or, wchar[] 
string.  The problem is that lately, I am hitting into a wall 
with the "array cast misalignment" when casting wchar[].  ie.

auto text = cast(string) file.read();
wchar[] temp = cast(wchar[]) text;

What would be the correct process to find out a text file 
encoding?

Any help would be greatly appreciated.  This is the code that I 
have right now...

//begin code
char[] ReadFileData2UTF8(char[] file, out char[] bom)
{
   auto text = cast(string) file.read();
   if (text.length == 0)
   {
     bom = "NO_BOM";
     return "";
   }
   else if (text.length == 1)
   {
     ubyte[1] b = cast(ubyte[]) text[0 .. 1];
     bom = getBOM(b);
   }
   else if (text.length == 2)
   {
     ubyte[2] b = cast(ubyte[]) text[0 .. 2];
     bom = getBOM(b);
   }
   else if (text.length == 3)
   {
     ubyte[3] b = cast(ubyte[]) text[0 .. 3];
     bom = getBOM(b);
   }
   else if (text.length > 3)
   {
     ubyte[4] b = cast(ubyte[]) text[0 .. 4];
     bom = getBOM(b);
   }
   //writefln(bom);
   if (std.string.find(bom, "UTF16") == 0)
   {
     ubyte[] bs = cast(ubyte[]) text;
     if (bs[0 .. 2] == UTF16_be || bs[0 .. 2] == UTF16_le)
       bs = bs[2 .. $];
     text = cast(char[]) bs;
     wchar[] temp = cast(wchar[]) text; //text[2 .. $];
     text = std.utf.toUTF8(temp);
   }
   else if (std.string.find(bom, "UTF32") == 0)
   {
     ubyte[] bs = cast(ubyte[]) text;
     if (bs[0 .. 4] == UTF32_be || bs[0 .. 4] == UTF32_le)
       bs = bs[4 .. $];
     text = cast(char[]) bs;
     dchar[] temp = cast(dchar[]) text; //text[2 .. $];
     text = std.utf.toUTF8(temp);
   }
   else if (bom == "UTF8")
   {
     ubyte[] bs = cast(ubyte[]) text;
     if (bs[0 .. 3] == UTF8)
       bs = bs[3 .. $];
     text = cast(char[]) bs;
     // text is already UTF8
   }
   else // hopeing I can figure out the type...
   {
     //msgBox("No BOM");
     //ubyte[] bs = cast(ubyte[]) text;
     try // utf8
     {
       validate(text);
       bom = "UTF8";
     }
     catch (UtfException e)
     {
       //msgBox("Failed UTF8. Trying UTF16");
       //text = cast(char[]) bs;
       //if ((text.length % 2) == 1)
       //  text ~= " ";
       try //utf16
       {
         wchar[] temp = cast(wchar[]) text; //text[2 .. $];
         //wchar[] temp = std.utf.toUTF16(text); //text[2 .. $];
         validate(temp);
         text = std.utf.toUTF8(temp);
         bom = "UTF16_le";
       }
       catch (UtfException e)
       {
         //msgBox("Failed UTF16. Trying UTF32");
         //text = cast(char[]) bs;
         try // utf32
         {
           dchar[] temp = cast(dchar[]) text; //text[2 .. $];
           //dchar[] temp = std.utf.toUTF32(text); //text[2 .. $];
           validate(temp);
           text = std.utf.toUTF8(temp);
           bom = "UTF32_le";
         }
         catch (UtfException e) // hoping for ASCII
         {
           //msgBox("Failed UTF32. Hoping ASCII");
           text ~= "\000";
           char[] temp = std.windows.charset.fromMBSz(text.ptr,0);
           text = std.utf.toUTF8(temp);
           //text = temp;
           bom = "NO_BOM";
         }
       }
     }
   }
   return text;
}
//end code




More information about the Digitalmars-d-learn mailing list