some regex vs std.ascii vs handcode times

dennis luehring dl.soluz at gmx.net
Sun Mar 18 23:10:30 PDT 2012


Please put the test files and test code (which seem to fit in one file)
somewhere so others can check your timings.

On 19.03.2012 05:12, Jay Norwood wrote:
> I'm timing operations that process ten 2 MB text files in parallel.
> I haven't gotten to the part where I put the words into the map,
> but I've done enough up to this point to say a few things about
> the measurements.
>
> This first one took 1 ms, which tells me that the taskPool tasks
> are doing a good job, and task overhead will not be a problem.
> (A sketch of the driver assumed here follows the snippet below.)
>
> // do nothing
> //finished! time: 1 ms
> void wcp_nothing(string fn)
> {
> 	//G:\d\a7\a7\Release>a7
>
> }
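>
> The driver for all of these runs looks roughly like the sketch
> below; the file names and the timing harness are placeholders,
> not the exact code that produced the numbers:
>
> import std.datetime, std.file, std.parallelism, std.stdio, std.string;
>
> void main()
> {
> 	// hypothetical list of the ten 2 MB input files
> 	string[] files;
> 	foreach (i; 0 .. 10)
> 		files ~= format("file%d.txt", i);
>
> 	StopWatch sw;
> 	sw.start();
>
> 	// one task per file (work-unit size 1) from the default taskPool
> 	foreach (fn; taskPool.parallel(files, 1))
> 		wcp_nothing(fn); // swap in whichever wcp_* variant is timed
>
> 	sw.stop();
> 	writefln("finished! time: %s ms", sw.peek().msecs);
> }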
>
>
> This second one was just to read in the files to get a baseline.
> One surprise, seen later, is that byChunk actually completed
> faster, by several ms.
>
> // read files
> //finished! time: 31 ms
> void wcp_whole_file(string fn)
> {
> 	auto input = std.file.read(fn);
> }
>
>
> At the other end of the spectrum is the byLine version of the
> read. This is way too slow to be promoting in our examples; if
> you are using it in your code, you should instead read chunks
> ... maybe 1 MB, as in my example later below, and then split up
> the lines yourself (see the sketch after this snippet).
>
> // read files by line ... yikes! don't want to do this
> //finished! time: 485 ms
> void wcp_byLine(string fn)
> {
> 	auto f = File(fn);
> 	foreach(line; f.byLine(std.string.KeepTerminator.yes)){
> 	}
> }
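>
> "Split up the lines yourself" could look roughly like the sketch
> below; the carry-over buffer for lines that span chunk boundaries
> is an assumption, and this variant is untimed:
>
> // read big chunks, split the lines by hand
> void wcp_chunk_lines(string fn)
> {
> 	ulong l_cnt;
> 	ubyte[] carry; // partial line left over from the previous chunk
> 	auto f = File(fn);
> 	foreach (chunk; f.byChunk(1_000_000))
> 	{
> 		size_t start = 0;
> 		foreach (i, b; chunk)
> 		{
> 			if (b == '\n')
> 			{
> 				// carry ~ chunk[start .. i] is one complete line
> 				l_cnt++;
> 				carry.length = 0;
> 				start = i + 1;
> 			}
> 		}
> 		carry ~= chunk[start .. $]; // keep the unterminated tail
> 	}
> 	if (carry.length)
> 		l_cnt++; // count a final line with no terminator
> }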
>
>
> Ok, this was the good surprise.  Reading by chunks was faster
> than reading the whole file, by several ms.
>
> // read files by chunk ... better than full input
> //finished! time: 23 ms
> void wcp_files_by_chunk(string fn)
> {
> 	auto f = File(fn);
> 	foreach(chunk; f.byChunk(1_000_000)){
> 	}
> }
>
>
> So this is doing the line count while reading chunks. It is back
> up to the same time as reading the full input. That is OK, though,
> since it avoids issues with reading in a really big file. So the
> final version will probably use this rather than the full-input
> read. But for now I'm going to read the full input for the other
> tests.
>
> // read lc by chunk ...same as full input
> //finished! time: 34 ms
> void wcp_lc_bychunk (string fn)
> {
> 	ulong l_cnt;
> 	auto f = File(fn);
> 	foreach(chunk; f.byChunk(1_000_000)){
> 		foreach(c;chunk){
> 			if (c=='\n')
> 				l_cnt++;
> 		}
> 	}
> }
>
> There doesn't appear to be any significant overhead to reading
> dchar vs char.  I haven't looked at the code (or decode)
> underneath.  I presume it is decoding and expanding into dchar...
>
> // read dchar lc by chunk ...same
> //finished! time: 34 ms
> void wcp_dchar(string fn)
> {
> 	ulong l_cnt;
> 	auto f = File(fn);
> 	foreach(chunk; f.byChunk(1_000_000)){
> 		foreach(dchar c;chunk){
> 			if (c=='\n')
> 				l_cnt++;
> 		}
> 	}
> }
>
> So here is a surprise ... why is regex 136 ms vs. 34 ms for the
> hand-coded version?
>
> // count lines with regex
> //finished! time: 136 ms
> void wcp_lines_with_regex(string fn)
> {
> 	string input = cast(string)std.file.read(fn);
> 	auto rx = regex("\n");
> 	ulong l_cnt;
> 	foreach(e; splitter(input, rx))
> 	{
> 		l_cnt ++;
> 	}
> }
>
> So this is faster than the non-compiled regex, but still slower
> than the hand code, which was in the 35 ms range. Also, the
> documentation implies that the "g" flag should read in all the
> matches at once, but when I tested for that, the length of the
> result of a single call to match was always 1, so I think it
> must be broken. (See the note after the snippet below.)
>
>
> // count lines with compiled ctRegex matcher
> //finished! time: 97 ms
> void wcp_ctRegex(string fn)
> {
> 	string input = cast(string)std.file.read(fn);
> 	enum ctr =  ctRegex!("\n","g");
> 	ulong l_cnt;
> 	foreach(m; match(input,ctr))
> 	{
> 		l_cnt ++;
> 	}
> }
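>
> A possible explanation, assuming the length that was checked is
> the captures length: a Captures object holds one entry per
> parenthesized sub-pattern plus entry 0 for the whole hit, so for
> a pattern with no sub-patterns its length is 1 no matter how many
> times it matched; the "g" flag instead makes the range returned
> by match yield every match as it is iterated. A small sketch of
> that reading:
>
> auto ms = match("one\ntwo\nthree\n", ctRegex!("\n", "g"));
> ulong n;
> foreach (m; ms) // one iteration per match when "g" is given
> {
> 	assert(m.hit == "\n"); // entry 0: the whole match
> 	assert(m.length == 1); // no sub-captures, so length stays 1
> 	n++;
> }
> assert(n == 3);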
>
>
> //count lines with char match; this or the one with chunks is about the same
> //finished! time: 34 ms
> void wcp_char(string fn)
> {
> 	string input = cast(string)std.file.read(fn);
> 	ulong l_cnt;
> 	foreach(c; input)
> 	{
> 		if (c == '\n')
> 		l_cnt ++;
> 	}
> }
>
> This is the fastest of the word finders, maybe by 40 ms, which
> is meaningful if you project to larger tasks. It is similar to
> the code used in the u++ benchmark processing of alice.txt.
> Never mind the line counts for now; I just want to measure the
> word-count impact separately.
>
> //find words using pointers
> //finished! time: 110 ms
> void wcp_pointer(string fn)
> {
> 	string input = cast(string)std.file.read(fn);
> 	ulong w_cnt;
> 	char c;
> 	auto p = input.ptr;
> 	auto pe = p + input.length;
> 	while (p < pe)
> 	{
> 		c = *p;
> 		if (c >= 'a' && c <= 'z' ||
> 		    c >= 'A' && c <= 'Z')
> 		{
> 			++w_cnt;
> 			auto st = p++; // start of the word (kept for later use)
> 			while (p < pe)
> 			{
> 				c = *p;
> 				if (!(c >= 'a' && c <= 'z' ||
> 				      c >= 'A' && c <= 'Z' ||
> 				      c >= '0' && c <= '9'))
> 				{
> 					break;
> 				}
> 				p++;
> 			}
> 			auto wpend = p; // one past the end of the word
> 		}
> 		p++;
> 	}
> }
>
> Ok, this is way too slow with both ctRegex and regex, and so
> again I think there is something broken with the global ("g")
> flag, since checking the length outside the foreach returned
> only 1 in both cases (though see the note on captures length
> above).
>
> // count words with regex / ctRegex matchers ... way too slow
> //finished! time: 1299 ms for ctRegex
> //finished! time: 2608 ms for regex
> void wcp_words_ctRegex(string fn)
> {
> 	string input = cast(string)std.file.read(fn);
> 	enum ctr = regex("[a-zA-Z][a-zA-Z0-9]*","g"); // ctRegex!(...) for the 1299 ms run
> 	ulong w_cnt;
> 	foreach(m; match(input,ctr))
> 	{
> 		//auto s = m.hit;
> 		w_cnt ++;
> 	}
> }
>
> Using slices is slower than the previous pointer version by about
> 40 ms. I turned off range checking in this release build, so that
> doesn't explain it.
>
> //find words with slices .. ok this is slower by a bunch
> //finished! time: 153 ms
> void wcp_word_slices(string fn)
> {
> 	string input = cast(string)std.file.read(fn);
> 	ulong w_cnt;
> 	while (!input.empty)
> 	{
> 		auto c = input[0];
> 		if (c >= 'a' && c <= 'z' ||
> 		    c >= 'A' && c <= 'Z')
> 		{
> 			++w_cnt;
> 			// find the end of the word; take everything if it
> 			// runs to the end of the input
> 			size_t end = input.length;
> 			foreach (j, char w; input)
> 			{
> 				if (!(w >= 'a' && w <= 'z' ||
> 				      w >= 'A' && w <= 'Z' ||
> 				      w >= '0' && w <= '9'))
> 				{
> 					end = j;
> 					break;
> 				}
> 			}
> 			auto word = input[0 .. end]; // the word itself (unused here)
> 			input = input[end .. $];
> 		}
> 		else input = input[1 .. $];
> 	}
> }
>
> Note that using std.ascii.isXxx actually calls a function, which
> I find surprising given all the emphasis on template code. Note
> that it added another 80 ms vs. the in-line code above. So I
> would say provide a mixin template for this, because it is
> currently being reused by several libraries. (One possible shape
> is sketched after the snippet below.)
>
> //find words with std.ascii.isAlpha and isAlphaNum .. worse
> //finished! time: 232 ms
> void wcp_std_ascii(string fn)
> {
> 	string input = cast(string)std.file.read(fn);
> 	ulong w_cnt;
> 	while (!input.empty)
> 	{
> 		auto c = input[0];
> 		if (std.ascii.isAlpha(c))
> 		{
> 			++w_cnt;
> 			// find the end of the word; take everything if it
> 			// runs to the end of the input
> 			size_t end = input.length;
> 			foreach (j, char w; input)
> 			{
> 				if (!std.ascii.isAlphaNum(w))
> 				{
> 					end = j;
> 					break;
> 				}
> 			}
> 			auto word = input[0 .. end]; // the word itself (unused here)
> 			input = input[end .. $];
> 		}
> 		else input = input[1 .. $];
> 	}
> }
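>
> One possible shape for such a helper (hypothetical, not in
> Phobos): the character test as a string mixin, so the comparisons
> are pasted inline at the use site instead of going through a
> function call:
>
> // hypothetical: ASCII word-character test as a string mixin
> template isAsciiAlphaNum(string c)
> {
> 	enum isAsciiAlphaNum =
> 		"(" ~ c ~ " >= 'a' && " ~ c ~ " <= 'z' || "
> 		~ c ~ " >= 'A' && " ~ c ~ " <= 'Z' || "
> 		~ c ~ " >= '0' && " ~ c ~ " <= '9')";
> }
>
> // usage inside the scanning loop above:
> //   if (!mixin(isAsciiAlphaNum!"w")) { end = j; break; }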
>
>


