Multi-file byte comparison tool. What would you have done differently?

Pelle pelle.mansson at gmail.com
Fri Aug 5 02:02:54 PDT 2011


On Fri, 05 Aug 2011 00:25:38 +0200, Kai Meyer <kai at unixlords.com> wrote:

> I have a need for detecting incorrect byte sequences in multiple files  
> (>2) at a time (as part of our porting effort to new platforms).
> Ideally the files should be identical for all but a handful of byte  
> sequences (in a header section) that I can just skip over. I thought  
> this would be a fun exercise for my D muscles. I found success creating  
> a dynamic array of structs to keep information about each file passed in  
> via command-line parameters. I'll append the code at the end (and I'm  
> sure it'll get mangled in the process...)
>
> (I'm not one for coming up with creative names, so it's SOMETHING) Then  
> I loop around a read for each file, then manually run a for loop from 0  
> to BLOCK_SIZE, copy the size_t value into a new dynamic array (one for  
> each of the files opened), and run a function to ensure all values in  
> the size_t array are the same. If not, I compare each ubyte value (via  
> the byte_union) to determine which bytes are not correct by adding each  
> byte to a separate array, and comparing each value in that array,  
> printing the address and values of each bad byte as I encounter them.
>
> This appears to work great. Some justifications:
> I used size_t because I'm under the impression it's a platform-specific
> size that best fits into a single register, thus making comparisons  
> faster than byte-by-byte.
> I used a union to extract the bytes from the size_t.
> I wanted to create a SOMETHING for each file at run-time, instead of  
> only allowing a certain number of SOMETHINGS (either hard coded, or a  
> limit).
> Originally I wrote my own comparison function, but in my search for  
> something more functional, I tried out std.algorithm's count. Can't say  
> I can tell if it's better or worse.
>
> Features I'll probably add if I have to keep using the tool:
> 1) Better support for starting points and bytes to read.
> 2) Threshold for errors encountered, preferably managed by a
> command-line argument.
> 3) Coalescing error messages in sequential byte sequences.
>
> When I run the program, it's certainly I/O bound at 30Mb/s to an  
> external USB drive :).
>
> So the question is, how would you make it more D-ish? (Do we have a term  
> analogous to "pythonic" for D? :))
>
>
> Code:
>
> import std.stdio;
> import std.file;
> import std.conv;
> import std.getopt;
> import std.algorithm;
>
> enum BLOCK_SIZE = 1024;
> union byte_union
> {
>      size_t val;
>      ubyte[val.sizeof] bytes;
> }
> struct SOMETHING
> {
>      string file_name;
>      size_t size_bytes;
>      File fd;
>      byte_union[BLOCK_SIZE] bytes;
> }

I would use the usual D naming convention of TypeNames and nonTypeNames, so blockSize, ByteUnion, Something, sizeBytes, etc.
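
A quick sketch with those names applied to your declarations (same code, only renamed):

enum blockSize = 1024;

union ByteUnion
{
    size_t val;
    ubyte[val.sizeof] bytes;
}

struct Something
{
    string fileName;
    size_t sizeBytes;
    File fd;
    ByteUnion[blockSize] bytes;
}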

> void main(string[] args)
> {
>      size_t bytes_read;
>      size_t bytes_max;
>      size_t size_smallest;
>      size_t[] comp_arr;
>      SOMETHING[] somethings;

Don't declare variables until you need them; only bytes_read and  
bytes_max need to be declared this early (getopt takes their addresses).
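
For example, size_smallest and the two arrays can be declared right where they first get their values (new instead of setting .length; either works):

size_t size_smallest = bytes_max;
auto somethings = new SOMETHING[](args.length - 1);
auto comp_arr = new size_t[](args.length - 1);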

>      getopt(args,
>          "seek", &bytes_read,
>          "bytes", &bytes_max
>      );
>      if(bytes_max == 0)
>          bytes_max = size_t.max; // Limit on the smallest file size
>      else
>          bytes_max += bytes_read;
>      //bytes_read = bytes_read - (bytes_read % (BLOCK_SIZE * SOMETHING.size_bytes.sizeof));
>      size_smallest = bytes_max;
>      somethings.length = args.length - 1;
>      comp_arr.length = args.length - 1;
>      for(size_t i = 0; i < somethings.length; i++)
>      {
>          somethings[i].file_name = args[i + 1];
>          somethings[i].size_bytes = getSize(somethings[i].file_name);
>          stderr.writef("Opening file: %s(%d)\n",  
> somethings[i].file_name, somethings[i].size_bytes);
>          somethings[i].fd = File(somethings[i].file_name, "r");
>          somethings[i].fd.seek(bytes_read);
>          if(somethings[i].fd.tell() != bytes_read)
>          {
>              stderr.writef("Failed to seek to position %d in %s\n",  
> bytes_read, args[i + 1]);
>          }
>          // Pick the smallest file, or the limit
>          size_smallest = min(size_smallest, somethings[i].size_bytes);
>      }

Use foreach (ref something; somethings) and something instead of  
somethings[i].
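
With the index-and-element form, that opening loop becomes:

foreach (i, ref something; somethings)
{
    something.file_name = args[i + 1];
    something.size_bytes = getSize(something.file_name);
    stderr.writef("Opening file: %s(%d)\n", something.file_name, something.size_bytes);
    something.fd = File(something.file_name, "r");
    something.fd.seek(bytes_read);
    if (something.fd.tell() != bytes_read)
        stderr.writef("Failed to seek to position %d in %s\n", bytes_read, something.file_name);
    size_smallest = min(size_smallest, something.size_bytes);
}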

>      // Check file sizes
>      for(size_t i = 0; i < somethings.length; i++)
>          comp_arr[i] = somethings[i].size_bytes;
>      writef("count: %s\n", count(comp_arr, comp_arr[0]));
>      if(count(comp_arr, comp_arr[0]) != comp_arr.length)
>      {
>          stderr.writef("Files are not the same size!");
>          foreach(s; somethings)
>              stderr.writef("[%s:%d]", s.file_name, s.size_bytes);
>          stderr.writef("\n");
>      }

You can use writefln() instead of writef() with a trailing "\n" everywhere.
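
For example:

writefln("count: %s", count(comp_arr, comp_arr[0]));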

>
>      // While bytes_read < size of smallest file
>      size_t block_counter;
>      while(bytes_read < size_smallest)
>      {
>          // Read bytes
>          //stderr.writef("tell: ");
>          for(size_t i = 0; i < somethings.length; i++)
>          {
>              //stderr.writef("Reading file %s\n", file_names[i]);
>              //stderr.writef("%d ", somethings[i].fd.tell());
>              //if(somethings[0].fd.tell() + BLOCK_SIZE * SOMETHING.size_bytes.sizeof > somethings[0].size_bytes)
>              //{
>              //    stderr.writef("Warning, reading last block :  
> [%d:%d:%d]\n", somethings[0].fd.tell(), somethings[0].size_bytes,  
> somethings[0].fd.tell() + BLOCK_SIZE * SOMETHING.size_bytes.sizeof);
>              //    for(size_t j = 0; j < somethings[i].bytes.length; j++)
>              //    {
>              //        somethings[i].bytes[i].val = 0;
>              //    }
>              //}
>              somethings[i].fd.rawRead(somethings[i].bytes);
>          }
>          // Compare all size_t values
>          for(size_t i = 0; i < BLOCK_SIZE; i++)
>          {

Here you can use foreach (i; 0 .. blockSize)
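
The inner copy can use the index-and-element form as well:

foreach (i; 0 .. blockSize)
{
    foreach (j, ref s; somethings)
        comp_arr[j] = s.bytes[i].val;
    // ... comparison as before ...
}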

>              // If one is different
>              for(size_t j = 0; j < somethings.length; j++)
>                  comp_arr[j] = somethings[j].bytes[i].val;
>              if(count(comp_arr, comp_arr[0]) != comp_arr.length)
>              {
>                  // Compare bytes inside to determine which byte(s) are different
>                  for(size_t k = 0; k < byte_union.sizeof; k++)
>                  {
>                      for(size_t j = 0; j < somethings.length; j++)
>                          comp_arr[j] = to!(size_t)(somethings[j].bytes[i].bytes[k]);
>                      if(count(comp_arr, comp_arr[0]) != comp_arr.length)
>                      {
>                          stderr.writef("Byte at 0x%08x (%u) does not match %s\n",
>                              bytes_read + i * byte_union.sizeof + k,
>                              bytes_read + i * byte_union.sizeof + k, comp_arr);
>                      }
>                  }
>              }
>          }
>          bytes_read += BLOCK_SIZE * SOMETHING.size_bytes.sizeof;
>          block_counter++;
>          if( (block_counter % (1024 * 25)) == 0)
>          {
>              stderr.writef("Completed %5.1fGB\n",  
> to!(double)(bytes_read) / 1024 / 1024 / 1024);
>          }
>      }
>
>      for(size_t i = 0; i < somethings.length; i++)
>      {
>          somethings[i].fd.close();
>      }
> }

You don't generally need to close them; they should be closed by the  
File destructors (I think, at least).

I don't understand why you use ByteUnion instead of just a plain array of  
bytes. I also don't understand why you write so much to stderr instead of  
stdout.
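
For what it's worth, a minimal sketch of the plain-array version (the block field name is my invention; == on two static arrays compares their contents, so the whole-block check should be comparable in speed to the size_t trick):

struct Something
{
    string fileName;
    size_t sizeBytes;
    File fd;
    ubyte[blockSize * size_t.sizeof] block;
}

// in the read loop:
foreach (ref s; somethings)
    s.fd.rawRead(s.block[]);

// cheap whole-block check first, byte-by-byte only on a mismatch
bool allEqual = true;
foreach (ref s; somethings[1 .. $])
{
    if (s.block != somethings[0].block)
    {
        allEqual = false;
        break;
    }
}
if (!allEqual)
{
    foreach (k; 0 .. somethings[0].block.length)
    {
        foreach (j, ref s; somethings)
            comp_arr[j] = s.block[k];
        if (count(comp_arr, comp_arr[0]) != comp_arr.length)
            stderr.writefln("Byte at 0x%08x does not match %s",
                bytes_read + k, comp_arr);
    }
}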

