Multi-file byte comparison tool. What would you have done differently?
Pelle
pelle.mansson at gmail.com
Fri Aug 5 02:02:54 PDT 2011
On Fri, 05 Aug 2011 00:25:38 +0200, Kai Meyer <kai at unixlords.com> wrote:
> I have a need for detecting incorrect byte sequences in multiple files
> (>2) at a time (as a part of our porting effort to new platforms.)
> Ideally the files should be identical for all but a handful of byte
> sequences (in a header section) that I can just skip over. I thought
> this would be a fun exercise for my D muscles. I found success creating
> a dynamic array of structs to keep information about each file passed in
> via command-line parameters. I'll append the code at the end (and I'm
> sure it'll get mangled in the process...)
>
> (I'm not one for coming up with creative names, so it's SOMETHING) Then
> I loop around a read for each file, then manually run a for loop from 0
> to BLOCK_SIZE, copy the size_t value into a new dynamic array (one for
> each of the files opened), and run a function to ensure all values in
> the size_t array are the same. If not, I compare each ubyte value (via
> the byte_union) to determine which bytes are not correct by adding each
> byte to a separate array, and comparing each value in that array,
> printing the address and values of each bad byte as I encounter them.
>
> This appears to work great. Some justifications:
> I used size_t because I'm under the impression it's a platform-specific
> size that best fits into a single register, thus making comparisons
> faster than byte-by-byte.
> I used a union to extract the bytes from the size_t.
> I wanted to create a SOMETHING for each file at run-time, instead of
> only allowing a certain number of SOMETHINGS (either hard coded, or a
> limit).
> Originally I wrote my own comparison function, but in my search for
> something more functional, I tried out std.algorithm's count. Can't say
> I can tell if it's better or worse.
>
> Features I'll probably add if I have to keep using the tool:
> 1) Better support for starting points and bytes to read.
> 2) Threshold for errors encountered, preferably managed by a
> command-line argument.
> 3) Coalescing error messages in sequential byte sequences.
>
> When I run the program, it's certainly I/O bound at 30Mb/s to an
> external USB drive :).
>
> So the question is, how would you make it more D-ish? (Do we have a term
> analogous to "pythonic" for D? :))
>
>
> Code:
>
> import std.stdio;
> import std.file;
> import std.conv;
> import std.getopt;
> import std.algorithm;
>
> enum BLOCK_SIZE = 1024;
> union byte_union
> {
>     size_t val;
>     ubyte[val.sizeof] bytes;
> }
> struct SOMETHING
> {
>     string file_name;
>     size_t size_bytes;
>     File fd;
>     byte_union[BLOCK_SIZE] bytes;
> }
I would follow the usual D naming convention of CamelCase for type names
and camelCase for everything else: blockSize, ByteUnion, Something,
sizeBytes, etc.
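Applied to your declarations that's a pure rename, no behavior change:

    enum blockSize = 1024;
    union ByteUnion
    {
        size_t val;
        ubyte[val.sizeof] bytes;
    }
    struct Something
    {
        string fileName;
        size_t sizeBytes;
        File fd;
        ByteUnion[blockSize] bytes;
    }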
> void main(string[] args)
> {
>     size_t bytes_read;
>     size_t bytes_max;
>     size_t size_smallest;
>     size_t[] comp_arr;
>     SOMETHING[] somethings;
Don't declare variables until you need them; only bytes_read and
bytes_max need to be declared this early.
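Since getopt has already stripped its options from args by the time the
arrays are sized, the two arrays can be declared at the point of first
use, something like this (sketch, using the renamed identifiers from
above):

    auto somethings = new Something[](args.length - 1);
    auto compArr = new size_t[](args.length - 1);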
>     getopt(args,
>         "seek", &bytes_read,
>         "bytes", &bytes_max
>     );
>     if(bytes_max == 0)
>         bytes_max = size_t.max; // Limit on the smallest file size
>     else
>         bytes_max += bytes_read;
>     //bytes_read = bytes_read - (bytes_read % (BLOCK_SIZE * SOMETHING.size_bytes.sizeof));
>     size_smallest = bytes_max;
>     somethings.length = args.length - 1;
>     comp_arr.length = args.length - 1;
>     for(size_t i = 0; i < somethings.length; i++)
>     {
>         somethings[i].file_name = args[i + 1];
>         somethings[i].size_bytes = getSize(somethings[i].file_name);
>         stderr.writef("Opening file: %s(%d)\n",
>             somethings[i].file_name, somethings[i].size_bytes);
>         somethings[i].fd = File(somethings[i].file_name, "r");
>         somethings[i].fd.seek(bytes_read);
>         if(somethings[i].fd.tell() != bytes_read)
>         {
>             stderr.writef("Failed to seek to position %d in %s\n",
>                 bytes_read, args[i + 1]);
>         }
>         // Pick the smallest file, or the limit
>         size_smallest = min(size_smallest, somethings[i].size_bytes);
>     }
Use foreach (i, ref something; somethings) and write something instead
of somethings[i]; foreach over an array can give you the index too, so
args[i + 1] still works.
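An untested sketch of the opening loop in that style (renamed
identifiers as above):

    foreach (i, ref something; somethings)
    {
        something.fileName = args[i + 1];
        something.sizeBytes = getSize(something.fileName);
        stderr.writefln("Opening file: %s(%d)",
            something.fileName, something.sizeBytes);
        something.fd = File(something.fileName, "r");
        something.fd.seek(bytesRead);
        if (something.fd.tell() != bytesRead)
            stderr.writefln("Failed to seek to position %d in %s",
                bytesRead, something.fileName);
        // Pick the smallest file, or the limit
        sizeSmallest = min(sizeSmallest, something.sizeBytes);
    }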
>     // Check file sizes
>     for(size_t i = 0; i < somethings.length; i++)
>         comp_arr[i] = somethings[i].size_bytes;
>     writef("count: %s\n", count(comp_arr, comp_arr[0]));
>     if(count(comp_arr, comp_arr[0]) != comp_arr.length)
>     {
>         stderr.writef("Files are not the same size!");
>         foreach(s; somethings)
>             stderr.writef("[%s:%d]", s.file_name, s.size_bytes);
>         stderr.writef("\n");
>     }
You can use writefln() instead of writef() with a trailing "\n"
everywhere.
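For example:

    stderr.writefln("Failed to seek to position %d in %s",
        bytes_read, args[i + 1]);

prints the message and the newline in one call.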
>
>     // While bytes_read < size of smallest file
>     size_t block_counter;
>     while(bytes_read < size_smallest)
>     {
>         // Read bytes
>         //stderr.writef("tell: ");
>         for(size_t i = 0; i < somethings.length; i++)
>         {
>             //stderr.writef("Reading file %s\n", file_names[i]);
>             //stderr.writef("%d ", somethings[i].fd.tell());
>             //if(somethings[0].fd.tell() + BLOCK_SIZE * SOMETHING.size_bytes.sizeof > somethings[0].size_bytes)
>             //{
>             //    stderr.writef("Warning, reading last block : [%d:%d:%d]\n",
>             //        somethings[0].fd.tell(), somethings[0].size_bytes,
>             //        somethings[0].fd.tell() + BLOCK_SIZE * SOMETHING.size_bytes.sizeof);
>             //    for(size_t j = 0; j < somethings[i].bytes.length; j++)
>             //    {
>             //        somethings[i].bytes[i].val = 0;
>             //    }
>             //}
>             somethings[i].fd.rawRead(somethings[i].bytes);
>         }
>         // Compare all size_t values
>         for(size_t i = 0; i < BLOCK_SIZE; i++)
>         {
Here you can use foreach (i; 0 .. blockSize).
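Combined with foreach over somethings, the word-gathering step could
read (sketch):

    foreach (i; 0 .. blockSize)
    {
        // Collect the i-th word from every file, then check all match
        foreach (j, ref s; somethings)
            compArr[j] = s.bytes[i].val;
        if (count(compArr, compArr[0]) != compArr.length)
        {
            // ... per-byte comparison as below ...
        }
    }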
>             // If one is different
>             for(size_t j = 0; j < somethings.length; j++)
>                 comp_arr[j] = somethings[j].bytes[i].val;
>             if(count(comp_arr, comp_arr[0]) != comp_arr.length)
>             {
>                 // Compare bytes inside to determine which byte(s) are different
>                 for(size_t k = 0; k < byte_union.sizeof; k++)
>                 {
>                     for(size_t j = 0; j < somethings.length; j++)
>                         comp_arr[j] = to!(size_t)(somethings[j].bytes[i].bytes[k]);
>                     if(count(comp_arr, comp_arr[0]) != comp_arr.length)
>                     {
>                         stderr.writef("Byte at 0x%08x (%u) does not match %s\n",
>                             bytes_read + i * byte_union.sizeof + k,
>                             bytes_read + i * byte_union.sizeof + k,
>                             comp_arr);
>                     }
>                 }
>             }
>         }
>         bytes_read += BLOCK_SIZE * SOMETHING.size_bytes.sizeof;
>         block_counter++;
>         if( (block_counter % (1024 * 25)) == 0)
>         {
>             stderr.writef("Completed %5.1fGB\n",
>                 to!(double)(bytes_read) / 1024 / 1024 / 1024);
>         }
>     }
>
>     for(size_t i = 0; i < somethings.length; i++)
>     {
>         somethings[i].fd.close();
>     }
> }
You don't generally need to close them; they should be closed by the
File destructors (I think, at least).
I don't understand why you use the byte_union instead of a plain array
of bytes, and I also don't understand why you write so much to stderr
instead of stdout.
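With plain buffers, a whole block can be compared in one go (array
comparison in D compares contents, and the runtime does it with a fast
memcmp-style loop), falling back to per-byte reporting only on a
mismatch. An untested sketch, using the renamed identifiers from above:

    enum blockBytes = 1024 * size_t.sizeof; // same block size, in bytes

    struct Something
    {
        string fileName;
        size_t sizeBytes;
        File fd;
        ubyte[blockBytes] bytes;
    }

    // inside the read loop:
    foreach (ref s; somethings)
        s.fd.rawRead(s.bytes[]);

    // cheap whole-block check first
    bool allSame = true;
    foreach (ref s; somethings[1 .. $])
        if (s.bytes != somethings[0].bytes)
        {
            allSame = false;
            break;
        }

    // per-byte comparison only when a block differs
    if (!allSame)
        foreach (i; 0 .. blockBytes)
        {
            foreach (j, ref s; somethings)
                compArr[j] = s.bytes[i];
            if (count(compArr, compArr[0]) != compArr.length)
                stderr.writefln("Byte at 0x%08x (%u) does not match %s",
                    bytesRead + i, bytesRead + i, compArr);
        }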