How to use sets in D?

mark mark at qtrac.eu
Sun Mar 8 08:43:10 UTC 2020


I use sets a lot and am now working on a program that will need 
to hold sets of 65,000+ items, so I thought I do some timings for 
the different approaches.

Here are some timings (uset uses the AA Unit approach, tset uses 
an rbtree, and aset uses an AA with bool values):

$ ./sets.d
size 50,000
uset 1 ms, 340 μs, and 8 hnsecs
tset 4 ms, 637 μs, and 1 hnsec
aset 1 ms, 402 μs, and 6 hnsecs
$ ./sets.d
size 100,000
uset 2 ms, 338 μs, and 4 hnsecs
tset 12 ms, 262 μs, and 6 hnsecs
aset 2 ms and 991 μs
$ ./sets.d
size 200,000
uset 5 ms, 971 μs, and 5 hnsecs
tset 30 ms, 675 μs, and 5 hnsecs
aset 6 ms, 74 μs, and 6 hnsecs
$ ./sets.d
size 400,000
uset 11 ms, 823 μs, and 4 hnsecs
tset 74 ms, 146 μs, and 2 hnsecs
aset 12 ms, 560 μs, and 5 hnsecs

What seems pretty clear is that for my purposes (checking 
presence or absence of membership in a set), AAs are much faster 
than rbtrees. (This is to be expected since if an AA uses a hash 
is should be O(1) vs O(lg n) for an rbtree).

Here is the test code I used:

#!/usr/bin/env rdmd

enum SIZE = 400_000;
enum SUB_SIZE = SIZE / 10;
enum MIN_LEN = 10;
enum MAX_LEN = 50;
enum AZ = "abcdefghijklmnopqrstuvwxyz";

void main() {
     import std.stdio: writefln;

     auto data = Data.populate();
     auto sets = Sets(data.words);

     writefln("size %,d", SIZE);
     check(sets.uset, data.present, data.absent, "uset");
     check(sets.tset, data.present, data.absent, "tset");
     check(sets.aset, data.present, data.absent, "aset");
}

struct Sets {
     import std.container.rbtree: RedBlackTree;

     alias Unit = void[0];
     enum unit = Unit.init;

     Unit[string] uset;
     RedBlackTree!string tset;
     bool[string] aset;

     this(string[] words) {
         tset = new RedBlackTree!string;
         foreach (word; words) {
             uset[word] = unit;
             tset.insert(word);
             aset[word] = false;
         }
     }
}

struct Data {
     string[] words;
     string[] present;
     string[] absent;

     static Data populate() {
         Data data;
         for (int i = 0; i < SIZE; i++) {
             auto word = makeWord;
             data.words ~= word;
             if (data.present.length < SUB_SIZE)
                 data.present ~= word;
             if (data.absent.length < SUB_SIZE)
                 data.absent ~= word ~ "9";
         }
         return data;
     }
}

string makeWord() {
     import std.random: randomCover, uniform;
     import std.range: take;
     import std.string: join, split;

     enum AZS = (AZ ~ AZ ~ AZ ~ AZ ~ AZ ~ AZ).split("");
     return randomCover(AZS).take(uniform(MIN_LEN, 
MAX_LEN)).join("");
}

void check(T)(T set, string[] present, string[] absent, string 
name) {
     import std.datetime.stopwatch: AutoStart, StopWatch;
     import std.stdio: writeln;

     auto timer = StopWatch(AutoStart.yes);
     foreach (string p; present)
         assert(p in set);
     foreach (string a; absent)
         assert(a !in set);
     writeln(name, " ", timer.peek);
}


More information about the Digitalmars-d-learn mailing list