Streaming library

Denis Koroskin 2korden at gmail.com
Wed Oct 13 19:20:13 PDT 2010


On Thu, 14 Oct 2010 03:47:12 +0400, Andrei Alexandrescu
<SeeWebsiteForEmail at erdani.org> wrote:

> On 10/13/2010 06:23 PM, Denis Koroskin wrote:
>> On Thu, 14 Oct 2010 03:06:30 +0400, Andrei Alexandrescu
>> <SeeWebsiteForEmail at erdani.org> wrote:
>>> Gnu offers two specialized routines:
>>> http://www.gnu.org/s/libc/manual/html_node/Line-Input.html. It is many
>>> times more efficient than anything that can be done in client code
>>> using the stdio API. I'm thinking along those lines.
>>>
>>
>> I can easily implement a similar interface on top of chunked read:
>> ubyte[] readLine(ubyte[] lineBuffer); or bool readLine(ref ubyte[]
>> lineBuffer);
>
> You can't.
>
>> I've quickly looked through an implementation, too, and it's still
>> filling a buffer first, and then copying characters byte-by-byte to the
>> output string (calling realloc when needed) until a delimiter is found.
>> It is exactly as efficient as implemented externally.
>
> Except you don't have an interface to copy byte by byte. Oops...
>
>> It does the same
>> amount of copying and memory allocations. "Many times more efficient" is
>> just an overestimation.
>
> It's not. I measured because it was important in an application I was  
> working on. It's shocking how some seemingly minor changes can make a  
> big difference in throughput.
>
>> BTW, did you see my message about std.concurrency?
>
> Yes, but I'll need to leave the bulk of it to Sean. Thanks.
>
>
> Andrei

Okay. Now give me your best and tell me mine is slower (sorry for the lack
of comments):


/// Size, in bytes, of the fixed read buffer embedded in each ByLine (16 KiB).
enum BUFFER_SIZE = 16 * 1024;

import core.stdc.stdio;
import core.stdc.string;
import core.memory;

/**
 * Thin wrapper around a C stdio FILE* that exposes a chunked read.
 *
 * The constructor never throws: if the file cannot be opened, `f` stays
 * null and read() reports zero bytes, which callers see as end of input.
 * Check `f is null` after construction to distinguish "missing file" from
 * "empty file".
 */
class InputStream
{
          /**
           * Opens `fileName` (a zero-terminated C string) for reading.
           * On failure `f` is left null.
           */
          this(const char* fileName)
          {
                  f = fopen(fileName, "r".ptr);
          }

          /// Releases the underlying FILE* if it is still open.
          ~this()
          {
                  close();
          }

          /**
           * Reads up to buffer.length bytes into `buffer`.
           *
           * Returns: the number of bytes actually stored; 0 when the
           * stream is not open, `buffer` is empty, or fread delivered
           * nothing (end of file or read error).
           */
          size_t read(ubyte[] buffer)
          {
                  // Passing a null FILE* to fread is undefined behaviour,
                  // so a failed open must be caught here.
                  if (f is null || buffer.length == 0) {
                          return 0;
                  }
                  return .fread(buffer.ptr, 1, buffer.length, f);
          }

          /**
           * Closes the file. Safe to call more than once, and safe to call
           * after a caller has closed `f` itself and reset it to null.
           */
          void close()
          {
                  if (f !is null) {
                          fclose(f);
                          f = null;
                  }
          }

          /// Underlying C stream; null when closed or when the open failed.
          FILE* f;
}

/**
 * Splits the bytes of an InputStream into delimiter-terminated lines,
 * reading the stream in BUFFER_SIZE chunks.
 *
 * `ptr` and `end` point into this struct's own `buffer` once reading has
 * started, so a ByLine must not be copied after the first readLine() call;
 * the copy's pointers would still refer to the original's buffer.
 */
struct ByLine
{
          /**
           * Params:
           *   inputStream = source of bytes; read lazily, not here
           *   delim       = byte that terminates a line
           */
          this(InputStream inputStream, char delim = '\n')
          {
                  this.inputStream = inputStream;
                  this.delim = delim;
                  // ptr and end deliberately stay null. Taking buffer.ptr
                  // here would capture the address of the value under
                  // construction, which may not be its final location.
                  // The first readLine() sees ptr is end and refills.
          }

          /// Reloads `buffer` from the stream; ptr is end afterwards
          /// exactly when the stream delivered no more bytes.
          private void refill()
          {
                  ptr = buffer.ptr;
                  end = ptr + inputStream.read(buffer[]);
          }

          /**
           * Reads the next line, delimiter included.
           *
           * Params:
           *   line = scratch storage to fill; it is grown (possibly moved)
           *          when the line does not fit, so always use the returned
           *          slice rather than `line` afterwards
           *
           * Returns: a slice holding the line including its trailing
           * delimiter, or the unterminated remainder if the stream ends
           * without one. Returns null once no bytes are left, so an empty
           * file, or a file ending in the delimiter, yields no extra
           * empty line.
           */
          ubyte[] readLine(ubyte[] line)
          {
                  if (eof) {
                          return null;
                  }

                  // Bytes of the current line already stored in `line`.
                  size_t used = 0;

                  while (true) {
                          if (ptr is end) {
                                  refill();
                                  if (ptr is end) {
                                          // Stream exhausted: hand back any
                                          // unterminated tail, then null.
                                          eof = true;
                                          ptr = end = null;
                                          return used != 0 ? line[0 .. used]
                                                           : null;
                                  }
                          }

                          // avail > 0 here, so memchr/memcpy never see a
                          // zero-length request.
                          immutable size_t avail = cast(size_t)(end - ptr);
                          ubyte* pos = cast(ubyte*)memchr(ptr, delim, avail);
                          immutable size_t size = (pos is null)
                                  ? avail
                                  : cast(size_t)(pos - ptr) + 1;

                          if (size > line.length - used) {
                                  // Grow geometrically so long lines spanning
                                  // many chunks do not reallocate per chunk.
                                  // Resizing the slice keeps the capacity and
                                  // the storage in sync and works for any
                                  // caller-supplied slice.
                                  size_t grown = line.length * 2;
                                  if (grown < used + size) {
                                          grown = used + size;
                                  }
                                  line.length = grown;
                          }

                          memcpy(line.ptr + used, ptr, size);
                          used += size;
                          ptr += size;

                          if (pos !is null) {
                                  return line[0 .. used];
                          }
                  }
          }

          /// Stream the lines are read from.
          InputStream inputStream;
          /// Next unread byte in `buffer`; null before the first read and
          /// after end of input.
          ubyte* ptr;
          /// One past the last valid byte in `buffer`.
          ubyte* end;
          /// Chunk most recently read from the stream.
          ubyte[BUFFER_SIZE] buffer;
          /// Line delimiter, stored as int because memchr takes an int.
          int delim;
          /// Set once the stream has delivered its last byte.
          bool eof;
}

/**
 * Counts the lines and bytes of "very-large-file.txt" using ByLine and
 * prints both totals.
 *
 * Returns: 0 on success, 1 if the file could not be opened.
 */
int main()
{
          InputStream inputStream = new InputStream("very-large-file.txt");
          if (inputStream.f is null) {
                  fprintf(stderr, "cannot open very-large-file.txt\n");
                  return 1;
          }

          // Reusable line storage. It is kept separate from the returned
          // slice so that a short line does not shrink the capacity offered
          // to the next call.
          ubyte[] scratch = new ubyte[128];
          ByLine byLine = ByLine(inputStream);

          // 64-bit counters: an int overflows once the file passes 2 GiB.
          ulong numLines = 0;
          ulong numChars = 0;
          while (true) {
                  ubyte[] line = byLine.readLine(scratch);
                  // A real line always contains at least its delimiter or
                  // one byte of unterminated tail, so an empty (or null)
                  // result can only mean end of input.
                  if (line.length == 0) {
                          break;
                  }

                  // readLine had to enlarge the storage; adopt the larger
                  // block, since the old one may no longer be valid.
                  if (line.length > scratch.length) {
                          scratch = line;
                  }

                  numChars += line.length;
                  numLines++;
          }

          // Close explicitly and clear the handle so nothing closes it twice.
          fclose(inputStream.f);
          inputStream.f = null;

          printf("numLines: %llu\n", numLines);
          printf("numChars: %llu\n", numChars);

          return 0;
}


More information about the Digitalmars-d mailing list