Request for comments: std.d.lexer

Dmitry Olshansky dmitry.olsh at gmail.com
Sat Feb 2 13:54:59 PST 2013


02-Feb-2013 01:10, Walter Bright wrote:
> On 2/1/2013 3:22 AM, Dmitry Olshansky wrote:
>> 01-Feb-2013 15:05, Dmitry Olshansky wrote:
>>> On 1/30/2013 8:44 AM, Dmitry Olshansky wrote:
>>>> In the allocation scheme I proposed, the ID could be a 32-bit offset
>>>> into the unique identifiers chunk.
>>>
>>> That only works if you know in advance the max size the chunk can ever
>>> be and preallocate it. Otherwise, you have no guarantee that the next
>>> allocated chunk will be within 32 bits of address of the previous
>>> chunks.
>>>
>>
>> Well, I supposed it's exactly one reallocatable block. Then tokens
>> have an offset that doesn't care if the block was reallocated.
>>
>> Or rather, the reallocation just RESERVEs virtual RAM for it (say 1G)
>> and COMMITs it page by page when you need to grow it. Once lexing is
>> done, shrink the virtual region to the actual used size to free up
>> address space (e.g. if we are on 32 bits).
>>
>> As for the 32-bit limit: that gives a 4GB maximum for the cumulative
>> length of all unique identifier names, which is more than enough by
>> any standard. I haven't seen a 4GB codebase, not to speak of the
>> identifiers alone, even if we count all the repetitions separately.
>
> Your technique can work, provided the number of identifiers isn't large
> enough that memory fragmentation will prevent being able to reallocate
> the buffer to a larger size.
>
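
To restate the quoted offset scheme in code - a minimal sketch, with all
names hypothetical: a token keeps a 32-bit offset into the single
identifiers chunk, so the chunk itself is free to move when reallocated.

// hypothetical sketch of the ID-as-offset scheme
struct Token
{
    // ... kind, source position, etc.
    uint idOffset; // 32-bit offset into the identifiers chunk
    uint idLength; // length of the identifier in bytes
}

struct IdentifierPool
{
    char[] chunk; // the one chunk; it may move, offsets stay valid

    // the text of a token's identifier
    const(char)[] text(in Token t) const
    {
        return chunk[t.idOffset .. t.idOffset + t.idLength];
    }
}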

I think I failed to describe the virtual memory trick, or you just 
ignored it? Regardless, I have to correct myself on two points: the 
amount to reserve, and the fact that it's actually not virtual memory 
alone but rather a virtue of the MMU.

See below for a full program in D showing the trick in action. It's 
currently Win32 only, as I can't recall the right POSIX calls offhand 
(but see the sketch after the step-by-step description below). It's a 
benchmark against the built-in array and std.array.Appender - the heap 
stuff.

The speed is similar, with my quick and dirty stuff being much faster 
iff the arrays don't allocate all of the RAM beforehand.

The steps to snatch the power of having no reallocations at all:

1. Reserve a contiguous *address space* range. Not even a bit of virtual 
memory is reserved/committed/touched at this moment; AFAICT this merely 
creates an entry in the OS memory manager's bookkeeping.

(For the lexer, this address space range would have a length equal to 
the sum of all file lengths, as Tove points out; this is the first 
correction.)

2. Any time there is no memory left to put more stuff into (as is the 
case initially), commit the next few pages. At this moment virtual RAM 
is committed but not yet allocated.

3. [this I think we all know] The moment a memory location in the 
committed part of the range is touched, the page for it is allocated 
(and on Win32 it automatically contains zeros initially). Only at this 
point is physical memory actually allocated.

So you see where my 2nd correction goes. In essence you get: 0 
reallocations and no more than 1 page of virtual memory wasted in all 
cases. Sweet deal ;)

Then there is an optional step if you think that too much address space 
is used (this makes sense only on 32 bits, if at all): copy the 
resulting block of actual data elsewhere in the heap and release the 
whole address range.
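
For the record, since I can't vouch for the exact POSIX spelling 
offhand, here is a minimal *untested* sketch of the same 
reserve/commit/release dance on POSIX, assuming the usual mmap/mprotect 
semantics (on some platforms MAP_ANON is spelled MAP_ANONYMOUS):

import core.sys.posix.sys.mman;

// 1. reserve address space only: no access rights, no backing store yet
void* reserve(size_t maxCapacity)
{
    void* p = mmap(null, maxCapacity, PROT_NONE,
        MAP_PRIVATE | MAP_ANON, -1, 0);
    return p == MAP_FAILED ? null : p;
}

// 2. "commit" a prefix of the range by granting read/write access;
// physical pages are still only faulted in on first touch (step 3)
bool commit(void* base, size_t bytes)
{
    return mprotect(base, bytes, PROT_READ | PROT_WRITE) == 0;
}

// release the whole range when done
void release(void* base, size_t maxCapacity)
{
    munmap(base, maxCapacity);
}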

The code:
(the same as github gist: https://gist.github.com/4699388 )

///Written in the D programming language

// Bench the built-in array append, std.array.Appender
// and a custom virtual-memory aware VirtualArray in terms of
// memory usage and the time taken
// to append up to X megabytes using different chunk sizes:
// 16 bytes, 64 bytes, 256 bytes and 16KB at a time.


// uncomment the version line below to insert readln's
// at interesting points and investigate RAM usage;
// make sure to enable the relevant columns in task manager process details:
// committed size and private working set
// version = diagnose;
import std.c.windows.windows;

import std.exception, std.datetime, std.algorithm, std.range, std.array;

// round var up to the next multiple of pow2, which must be a power of 2
// (e.g. roundUpToPowerOf2(5000, 4096) == 8192)
auto roundUpToPowerOf2(size_t var, size_t pow2)
{
     return (var + pow2 - 1) & ~(pow2 - 1);
}

struct VirtualArray
{
private:
     ubyte* pointer;    // to the beginning of the reserved memory
     size_t _length;    // length of the data
     size_t committed;  // memory committed so far
     size_t reserved;   // total possible maximum to grow to
     size_t pageSize;   // page size; this could be global

     // commit some more memory from the reserved range
     void extend(size_t size)
     {
         size = roundUpToPowerOf2(size, pageSize);
         // this will throw once we run out of reserved pages
         enforce(VirtualAlloc(pointer + committed, size,
             MEM_COMMIT, PAGE_READWRITE));
         committed += size;
     }
public:
     this(size_t maxCapacity)
     {
         SYSTEM_INFO sysinfo;
         GetSystemInfo(&sysinfo);
         pageSize = sysinfo.dwPageSize;
         // the page size is a power of 2
         // round the capacity up to a multiple of page size
         maxCapacity = roundUpToPowerOf2(maxCapacity, pageSize);
         pointer = cast(ubyte*)enforce(VirtualAlloc(null, maxCapacity,
             MEM_RESERVE, PAGE_READWRITE));
         committed = 0;
         reserved = maxCapacity;
         _length = 0;
     }

     // disable copying so that the destructor releases the range exactly once
     @disable this(this);

     // bare minimum primitives to run the benchmark
     ref ubyte opIndex(size_t idx)
     {
         assert(idx < length);
         return pointer[idx];
     }

     // ditto
     ubyte[] opSlice(size_t from, size_t to)
     {
         assert(from < to && to <= length);
         return pointer[from..to];
     }

     // ditto
     @property size_t length(){ return _length; }

     void opOpAssign(string op:"~")(ubyte[] data)
     {
         size_t end = length + data.length;
         while(end > committed)
             extend(end - committed);
         pointer[length..end] = data[];
         _length = end;
     }

     ~this()
     {
         // VirtualFree should not fail here: the struct is not copyable,
         // so the range is released exactly once
         enforce(VirtualFree(pointer, 0, MEM_RELEASE));
     }
}

import std.stdio;

void timeIt(string prefix, void delegate() run)
{
     StopWatch sw;
     sw.start();
     run();
     sw.stop();
     writefln(prefix~ " %4s ms", sw.peek().msecs);
}

bool verify(Arr)(ubyte[] pattern, ref Arr arr)
{
     for(size_t i = 0; i < arr.length; i += pattern.length)
     {
         if(arr[i .. i+pattern.length] != pattern[])
         {
             writeln(arr[i .. i+pattern.length], " vs ", pattern);
             return false;
         }
     }
     return true;
}

void main()
{
     size_t totalSize = 120*2^^20;
     auto chunks = [
         iota(0, 16).map!(x=>cast(ubyte)x).array,
         iota(0, 64).map!(x=>cast(ubyte)x).array,
         iota(0, 256).map!(x=>cast(ubyte)x).array,
         iota(0, 2^^14).map!(x=>cast(ubyte)x).array
     ];
     auto titles = [ " 16b", " 64b", "256b", " 16K" ];
     import core.memory;
     GC.disable(); // to prevent measuring a collection by mistake
     foreach(n, chunk; chunks)
     {
         // reserve memory anew on each iteration
         // I was able to easily go up to 1.5 G on 32bits
         // note: there are no reallocations here at all
         version(diagnose)
         if(n == 0)
         {
             writeln("Before reserving address space");
             readln();
         }
         auto virtArr = VirtualArray(totalSize);
         version(diagnose)
         if(n == 0)
         {
             writeln("Reserved address space");
             readln();
         }
         timeIt(titles[n]~" virtual", (){
             foreach(i; 0..totalSize/chunk.length)
             {
                 virtArr ~= chunk;
             }
         });
         version(diagnose)
         if(n == 0)
         {
             writeln("Commited all of virtual memory");
             readln(); //uncomment to take a pause to investigate RAM usage
         }
         // time to verify is the same for all with -O -inline
         enforce(verify(chunk, virtArr));
     }
     writeln("============");
     foreach(n, chunk; chunks)
     {
         ubyte[] arr;
         // the GC is out of the game at as little as 512 MB on 32 bits,
         // as I hit OutOfMemory on a 2nd iteration;
         // try to help the poor runtime:
         // without reserve it crashes and burns at about 120 M;
         // with reserve it's on par with chunks >= 256 bytes,
         // but it _commits_ all the memory beforehand!
         version(diagnose)
         if(n == 0)
         {
             writeln("Before array.reserve()");
             readln();
         }
         arr.reserve(totalSize);
         version(diagnose)
         if(n == 0)
         {
             writeln("After array.reserve()");
             readln();
         }
         timeIt(titles[n]~" array ", (){
             foreach(i; 0..totalSize/chunk.length)
             {
                 arr ~= chunk;
             }
         });
         version(diagnose)
         if(n == 0)
         {
             writeln("After all data touched");
             readln();
         }
         // time to verify is the same for all with -O -inline
         enforce(verify(chunk, arr));
         // sadly, need to free it to not run out of memory on 32 bits
         delete arr;
     }
     writeln("============");
     foreach(n, chunk; chunks)
     {
         {
             // appender is supposed to be faster than array on appends
             // but it's actually slower starting at 256-byte chunks
             // and especially so with 16KB chunks
             auto app = appender!(ubyte[])();
             timeIt(titles[n]~" appender", (){
                 foreach(i; 0..totalSize/chunk.length)
                 {
                     app.put(chunk);
                 }
             });
             auto appArr = app.data();
             enforce(verify(chunk, appArr));
         }
         GC.collect();
     }
}
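
(For reference: the "-O -inline" remarks in the comments above assume an
optimized build, i.e. something along the lines of "dmd -O -inline
bench.d" - the file name here is just an example.)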

-- 
Dmitry Olshansky

