[OT] The horizon of a stream
bearophile
bearophileHUGS at lycos.com
Fri Oct 24 00:39:15 PDT 2008
bearophile Wrote:
> There's a third possible solution, that is very fast: assuming the hash values are uint (32 bit), then you can create a bitfield of 2^32 bits, it requires just 1/2 GB, that is 1/4 of my RAM. So each bit represents a possible different value of the hash.
If the file is "small" my first implementation is good enough (small means the associative array has to fit in for example 2 GB). For larger files here's the implementation of the third idea, with D1, Phobos + my libs:
import d.all, std.string, std.c.stdlib, std.outofmemory;
const uint bpc = uint.sizeof * 8;
bool isFlagSet(uint* flags, uint i) {
int offset = i / bpc;
uint mask = 1 << (i % bpc);
return (flags[offset] & mask) != 0;
}
void setFlag(uint* flags, uint i) {
int offset = i / bpc;
uint mask = 1 << (i % bpc);
if ((flags[offset] & mask) == 0)
flags[offset] |= mask;
}
void main() {
uint* seen = cast(uint*)calloc(1 << 29, 1);
uint* seen_again = cast(uint*)calloc(1 << 29, 1);
if (seen is null || seen_again is null)
throw new OutOfMemoryException();
foreach (line; xfile("data.txt")) {
uint h = hash(line.chomp());
if (isFlagSet(seen, h))
// not used std.intrinsics, because they segfault here
setFlag(seen_again, h);
else
setFlag(seen, h);
}
free(seen);
int[string] aa;
int horizon;
foreach (nline, line; xfile("data.txt")) {
auto clean_line = line.chomp();
if (isFlagSet(seen_again, hash(clean_line))) {
auto pos_ptr = clean_line in aa;
if (pos_ptr) {
if ((nline - *pos_ptr) > horizon)
horizon = nline - *pos_ptr;
} else
aa[clean_line.dup] = nline;
}
}
putr(horizon);
}
With D1, just Phobos:
import std.stdio, std.string, std.c.stdlib, std.stream, std.outofmemory;
const uint bpc = uint.sizeof * 8;
bool isFlagSet(uint* flags, uint i) {
int offset = i / bpc;
uint mask = 1 << (i % bpc);
return (flags[offset] & mask) != 0;
}
void setFlag(uint* flags, uint i) {
int offset = i / bpc;
uint mask = 1 << (i % bpc);
if ((flags[offset] & mask) == 0)
flags[offset] |= mask;
}
void main() {
uint* seen = cast(uint*)calloc(1 << 29, 1);
uint* seen_again = cast(uint*)calloc(1 << 29, 1);
if (seen is null || seen_again is null)
throw new OutOfMemoryException();
foreach (string line; new BufferedFile("data.txt")) {
auto clean_line = line.chomp();
uint h = typeid(string).getHash(&clean_line);
if (isFlagSet(seen, h))
// not used std.intrinsics, because they segfault here
setFlag(seen_again, h);
else
setFlag(seen, h);
}
free(seen);
int[string] aa;
int horizon, nline;
foreach (string line; new BufferedFile("data.txt")) {
auto clean_line = line.chomp();
if (isFlagSet(seen_again, typeid(string).getHash(&clean_line))) {
auto pos_ptr = clean_line in aa;
if (pos_ptr) {
if ((nline - *pos_ptr) > horizon)
horizon = nline - *pos_ptr;
} else
aa[clean_line.dup] = nline;
}
nline++;
}
writefln(horizon);
}
The worst case is like this:
A
B
C
D
E
E
D
C
B
A
In this case the aa associative array has to store half of the lines of the file, and it be too much.
As you can see with this little program I have found another bug in Phobos, the intrinsics in this case don't work (and I have improved the Record of my libs too, used in another version of this code).
Bye,
bearophile
More information about the Digitalmars-d
mailing list