why allocation of large amount of small objects so slow (x10) in D?

Thu May 21 18:34:24 PDT 2009

nobody wrote:
> $ g++     alloc.cpp   -o alloc
> $ time ./alloc
> real    0m1.946s
> user    0m1.688s
> sys     0m0.256s
> 
> $ dmd -O -release allocd.d
> $ time ./allocd
> real    0m22.734s
> user    0m22.353s
> sys     0m0.360s
> 
> $ cat alloc.cpp
> #include <vector>
> 
> typedef std::vector<int> intvec;
> typedef intvec* intvecp;
> 
> int main() {
>   int i, n = 20000000;
>   intvecp* iva;
>   iva = new intvecp[n];
>   for (i = n; i-- > 0; ) {
>     iva[i] = new intvec();
>   }
> 
>   return 0;
> }
> 
> $ cat allocd.d
> int main() {
>   int i, n = 20000000;
>   Object[] oa;
>   oa = new Object[n];
>   for (i = n; i-- > 0; ) {
>     oa[i] = new Object();
>   }
> 
>   return 0;
> }

I use this a structure for arena-based memory allocation (attached).

Example of use:

import candy.util.MemPool

MemStack!() stack;

class MyObject
{
	mixin MemPoolNew!(stack);
}

int main()
{
	stack.push();
	int i, n = 20000000;
	MyObject[] oa;
	oa = new MyObject[n];
	for (i = n; i-- > 0; )
	{
		oa[i] = new MyObject();
	}
	stack.pop();
	return 0;
}

The push() and pop() allows memory to be allocated and deallocated as 
large blocks. However, you shouldn't need to deallocate manually -- it's 
GCed memory, so ideally the GC should free it when it's no longer 
referenced. That being said, I've run some tests, and the GC will free 
it *eventually*, but it allocates 4-6x as much memory as it needs before 
it starts freeing it, even when GC.collect() is called manually.

--------------------

/**
  * Provides a pool of GCed memory to allocate things from a block.
  * This maintains cache coherency for related types (i.e. tree nodes).
  * It doesn't garuntee any ordering, though, the array struct should be
  * used for that. Also, everything has to be freed at once, freeing one
  * portion of this has no effect.
  *
  * Based on a similar concept posted by bearophile at:
  * 
http://www.digitalmars.com/webnews/newsgroups.php?art_group=digitalmars.D&article_id=88227
  */
public struct MemPool(size_t BLOCK_SIZE = 1 << 14)
{
     private void* next; // Next available block
     private void* end;  // End of the current block
     private void*[] blocks;

     public void* alloc(size_t sz)
     {
     	sz = ((sz + 7) & ~7); // Next multiple of 8 if this isn't a 
multiple of 8
     	if (this.next + sz >= this.end)
     	{
     		void* blk = GC.calloc(BLOCK_SIZE);
     		this.blocks.length = this.blocks.length + 1;
     		this.blocks[$ - 1] = blk;
     		this.next = blk;
     		this.end = blk + BLOCK_SIZE;
         }

         void* ret = this.next;
         this.next += sz;
         return ret;
     }

     public void free()
     {
     	foreach(blk; this.blocks)
     		GC.free(blk);
     	this.blocks = null;
     	this.blocks.length = 0;
     	this.next = null;
     	this.end = null;
     }
}

/**
  * Wrapper for MemPool that allocates the given struct
  */
public struct StructPool(T)
{
	private MemPool!() pool;
	public T* alloc()  { return cast(T*) pool.alloc(T.sizeof); }
}

public struct MemStack(size_t BLOCK_SIZE = 1 << 14)
{
	private Stack!(MemPool!(BLOCK_SIZE)*, 16, true, true) stack;
	public static const size_t MAX_ALLOC = BLOCK_SIZE;

	public void* alloc(size_t sz) { return stack.peek().alloc(sz);          }
	public void  push()           { stack.push(new MemPool!(BLOCK_SIZE));   }
	public void  pop()            { stack.pop().free();                     }
}

/**
  * Placement new mixin for allocating from a memory pool. Benchmarks 
show this
  * as faster than the D new in real usage (i.e. the parser runs about 1.2x
  * faster using this).
  */
public template MemPoolNew(alias Pool)
{
	version(NoMemPool) { } else
	{
		public final new(uint sz)    { return Pool.alloc(sz); }
		public final delete(void *p) {                        }
	}
}