Speed of csvReader

H. S. Teoh via Digitalmars-d-learn digitalmars-d-learn at puremagic.com
Thu Jan 21 13:24:49 PST 2016


On Thu, Jan 21, 2016 at 07:11:05PM +0000, Jesse Phillips via Digitalmars-d-learn wrote:
> On Thursday, 21 January 2016 at 09:39:30 UTC, data pulverizer wrote:
> >R takes about half as long to read the file. Both read the data in
> >the "equivalent" type format. Am I doing something incorrect here?
> 
> CsvReader hasn't been compared against, and optimized relative to,
> other CSV readers. It allocates for the parsed string (even if the
> string isn't changed), and it performs a number of validation checks.
[...]

This piqued my interest today, so I decided to take a shot at writing a
fast CSV parser.  First, I downloaded a sample large CSV file from:

	ftp://ftp.census.gov/econ2013/CBP_CSV/cbp13co.zip

This file has over 2 million records, so I thought it would serve as a
good dataset to run benchmarks on.

Since the OP wanted the loaded data in an array of records, as opposed
to iterating over the records as an input range, I decided that the best
way to optimize this use case was to load the entire file into memory
and then return an array of slices into this data, instead of wasting
time (and memory) copying the data.
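
(As a tiny standalone aside, not part of the parser below: in D, slicing
an array never copies; a slice is just a pointer/length pair referring
to the original memory.)

	void main()
	{
	    string text = "123,abc";
	    const(char)[] field = text[0 .. 3]; // a view, not a copy
	    assert(field == "123");
	    assert(field.ptr is text.ptr);      // same underlying memory
	}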

Furthermore, since the result is an array of records, each of which is
an array of field slices, another optimization is to allocate a large
buffer for storing consecutive field slices, and then have each element
of the outer array simply slice the buffer to represent a record. This
greatly cuts down on the number of GC allocations needed.

Once the buffer is full, we don't allocate a larger buffer and copy
everything over; that would be unnecessary (and wasteful), because the
outer array doesn't care where its elements point. Instead, we allocate
a new buffer, leave the previous records pointing into the old buffer,
and continue appending field slices in the new buffer, and so on. After
all, the records don't all have to live in one contiguous buffer.
There's just a minor overhead: if we run out of space in the buffer
while in the middle of parsing a record, we have to copy the current
record's field slices into the new buffer, so that all the fields
belonging to the record remain contiguous (and the outer array can just
slice them). This is a very small cost compared to copying the entire
buffer into a new memory block (as would happen if we kept the buffer as
a single array that needed to expand), so it ought to be negligible.

So in a nutshell, what we have is an outer array, each element of which
is a slice (representing a record) that points to some slice of one of
the buffers. Each buffer is a contiguous sequence of slices
(representing a field) pointing to some segment of the original data.
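
In code, the shape of the result looks like this (just the types,
matching the declarations used in the listing below):

	const(char)[] data;        // the entire file, loaded into memory
	const(char)[][] fields;    // one buffer of field slices into data
	const(char)[][][] records; // each record slices a run of fields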

Here's the code:

---------------------------------------------------------------------------
	/**
	 * Experimental fast CSV reader.
	 *
	 * Based on RFC 4180.
	 */
	module fastcsv;
	
	/**
	 * Reads CSV data from the given filename.
	 */
	auto csvFromUtf8File(string filename)
	{
	    import std.file : read;
	    return csvFromString(cast(string) read(filename));
	}
	
	/**
	 * Parses CSV data in a string.
	 *
	 * Params:
	 *  fieldDelim = The field delimiter (default: ',').
	 *  quote = The quote character (default: '"').
	 *  data = The data in CSV format.
	 */
	auto csvFromString(dchar fieldDelim=',', dchar quote='"')(const(char)[] data)
	{
	    import core.memory;
	    import std.array : appender;
	
	    enum fieldBlockSize = 1 << 16;
	    auto fields = new const(char)[][fieldBlockSize];
	    size_t curField = 0;
	
	    GC.disable();
	    auto app = appender!(const(char)[][][]);
	
	    // Scan data
	    size_t i;
	    while (i < data.length)
	    {
	        // Parse records
	        size_t firstField = curField;
	        while (i < data.length && data[i] != '\n' && data[i] != '\r')
	        {
	            // Parse fields
	            size_t firstChar, lastChar;
	            if (data[i] == quote)
	            {
	                // Quoted field: scan to the matching closing quote,
	                // so that delimiters inside quotes are not treated
	                // as field separators. (Escaped quotes ("") are not
	                // interpreted here; see the notes after the listing.)
	                i++;
	                firstChar = i;
	                while (i < data.length && data[i] != quote)
	                {
	                    i++;
	                }
	                lastChar = i;
	                if (i < data.length)
	                    i++;    // skip over the closing quote
	            }
	            else
	            {
	                firstChar = i;
	                while (i < data.length && data[i] != fieldDelim &&
	                       data[i] != '\n' && data[i] != '\r')
	                {
	                    i++;
	                }
	                lastChar = i;
	            }
	            if (curField >= fields.length)
	            {
	                // Fields block is full; copy current record fields into new
	                // block so that they are contiguous.
	                auto nextFields = new const(char)[][fieldBlockSize];
	                nextFields[0 .. curField - firstField] =
	                    fields[firstField .. curField];
	
	                //fields.length = firstField; // release unused memory?
	
	                curField = curField - firstField;
	                firstField = 0;
	                fields = nextFields;
	            }
	            assert(curField < fields.length);
	            fields[curField++] = data[firstChar .. lastChar];
	
	            // Skip over field delimiter
	            if (i < data.length && data[i] == fieldDelim)
	                i++;
	        }
	        app.put(fields[firstField .. curField]);
	
	        // Skip over record delimiter(s)
	        while (i < data.length && (data[i] == '\n' || data[i] == '\r'))
	            i++;
	    }
	
	    GC.collect();
	    GC.enable();
	    return app.data;
	}
	
	unittest
	{
	    auto sampleData =
	        `123,abc,"mno pqr",0` ~ "\n" ~
	        `456,def,"stuv wx",1` ~ "\n" ~
	        `78,ghijk,"yx",2`;
	
	    auto parsed = csvFromString(sampleData);
	    assert(parsed == [
	        [ "123", "abc", "mno pqr", "0" ],
	        [ "456", "def", "stuv wx", "1" ],
	        [ "78", "ghijk", "yx", "2" ]
	    ]);
	}
	
	unittest
	{
	    auto dosData =
	        `123,aa,bb,cc` ~ "\r\n" ~
	        `456,dd,ee,ff` ~ "\r\n" ~
	        `789,gg,hh,ii` ~ "\r\n";
	
	    auto parsed = csvFromString(dosData);
	    assert(parsed == [
	        [ "123", "aa", "bb", "cc" ],
	        [ "456", "dd", "ee", "ff" ],
	        [ "789", "gg", "hh", "ii" ]
	    ]);
	}
---------------------------------------------------------------------------

There are some limitations to this approach: while the current code does
try to unwrap quoted values in the CSV, it does not correctly parse
escaped double quotes ("") inside fields. This is because processing
those values correctly requires copying the field data into a new string
and constructing its interpreted value, which is slow. So I leave it as
an exercise for the reader (it's not hard: when a doubled double-quote
sequence is detected, allocate a new string with the interpreted data
instead of slicing the original data; either that, or just unescape the
quotes in the application code itself).
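
For illustration, here's a minimal sketch of such an unescaping step
(the helper name unescapeQuotes is my own invention, not part of the
code above); it allocates only when a doubled quote is actually present:

---------------------------------------------------------------------------
	const(char)[] unescapeQuotes(const(char)[] field, char quote = '"')
	{
	    import std.algorithm.searching : canFind;
	
	    // Fast path: no quote at all, return the original slice.
	    if (!field.canFind(quote))
	        return field;
	
	    // Slow path: copy, collapsing each "" into a single ".
	    auto buf = new char[](field.length);
	    size_t n;
	    for (size_t i = 0; i < field.length; i++)
	    {
	        buf[n++] = field[i];
	        if (field[i] == quote && i + 1 < field.length &&
	            field[i + 1] == quote)
	            i++;    // skip the second quote of the pair
	    }
	    return buf[0 .. n];
	}
---------------------------------------------------------------------------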

Now, in the first version of this code, I didn't have the GC calls...
those were added later when I discovered that the GC was slowing it down
to about the same speed (or worse!) as std.csv. A little profiling
showed that 80% of the time was spent in the GC mark/collect code. After
adding in the code to disable the GC, the performance improved
dramatically.

Of course, running without GC collection is not a fair comparison with
std.csv, so I added an option to my benchmark program to disable the GC
for std.csv as well.  While the result was slightly faster, it was still
much slower than my fastcsv code. (Though to be fair, std.csv does
perform validation checks and so forth that fastcsv doesn't even
attempt.)

Anyway, here are the performance numbers from one of the benchmark runs
(these numbers are pretty typical):

	std.csv (with gc): 2126884 records in 23144 msecs
	std.csv (no gc): 2126884 records in 18109 msecs
	fastcsv (no gc): 2126884 records in 1358 msecs

As you can see, our little array-slicing scheme gives us a huge
performance boost over the more generic range-based code in std.csv: we
managed to cut out over 90% of the total runtime, even when std.csv is
run with the GC disabled. We even try to be nice in fastcsv by calling
GC.collect to clean up after we're done, and this collection time is
included in the benchmark.

While this is no fancy range-based code, and one might say it's more
hackish and C-like than idiomatic D, the problem is that current D
compilers can't quite optimize range-based code to this extent yet.
Perhaps in the future optimizers will improve so that more idiomatic,
range-based code will have performance comparable to fastcsv. (At least
in theory this should be possible.)

Finally, just for the record, here's the benchmark code I used:

---------------------------------------------------------------------------
	/**
	 * Crude benchmark for fastcsv.
	 */
	import core.memory;
	import std.array;
	import std.csv;
	import std.file;
	import std.datetime;
	import std.stdio;
	
	import fastcsv;
	
	int main(string[] argv)
	{
	    if (argv.length < 2)
	    {
	        stderr.writeln("Specify std, stdnogc, or fast");
	        return 1;
	    }
	
	    // Obtained from ftp://ftp.census.gov/econ2013/CBP_CSV/cbp13co.zip
	    enum csvFile = "ext/cbp13co.txt";
	
	    string input = cast(string) read(csvFile);
	
	    if (argv[1] == "std")
	    {
	        auto result = benchmark!({
	            auto data = std.csv.csvReader(input).array;
	            writefln("std.csv read %d records", data.length);
	        })(1);
	        writefln("std.csv: %s msecs", result[0].msecs);
	    }
	    else if (argv[1] == "stdnogc")
	    {
	        auto result = benchmark!({
	            GC.disable();
	            auto data = std.csv.csvReader(input).array;
	            writefln("std.csv (nogc) read %d records", data.length);
	            GC.enable();
	        })(1);
	        writefln("std.csv: %s msecs", result[0].msecs);
	    }
	    else if (argv[1] == "fast")
	    {
	        auto result = benchmark!({
	            auto data = fastcsv.csvFromString(input);
	            writefln("fastcsv read %d records", data.length);
	        })(1);
	        writefln("fastcsv: %s msecs", result[0].msecs);
	    }
	    else
	    {
	        stderr.writeln("Unknown option: " ~ argv[1]);
	        return 1;
	    }
	    return 0;
	}
---------------------------------------------------------------------------
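
To reproduce the numbers, compile both modules together with
optimizations turned on (the file names here are my assumption; adjust
to match your setup):

	dmd -O -release -inline benchmark.d fastcsv.d
	./benchmark fast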


--T

