Latest string_token Code
Ben Hanson
Ben.Hanson at tfbplc.co.uk
Tue Jun 22 06:13:06 PDT 2010
Here's the latest with naming convention (hopefully) followed. I've implemented my
own squeeze() function and used sizeof in the memmove calls.
How can I specify wide strings for the literals?
Thanks,
Ben
module main;
import std.algorithm;
import std.array;
import std.c.string;
import std.string;
import std.stdio;
template regex(CharT)
{
struct BasicStringToken
{
bool negated = false;
CharT[] charset;
enum size_t MAX_CHARS = CharT.max + 1;
enum size_t START_CHAR = cast(CharT) 0x80 < 0 ? 0x80 : 0;
this(const bool negated_, ref CharT[] charset_)
{
negated = negated_;
charset = charset_;
}
void removeDuplicates()
{
charset.sort;
squeeze(charset);
}
void normalise()
{
if (charset.length == MAX_CHARS)
{
negated = !negated;
charset.clear();
}
else if (charset.length > MAX_CHARS / 2)
{
negate();
}
}
void negate()
{
CharT curr_char = START_CHAR;
CharT[] temp;
CharT *ptr = cast(CharT *) 0;
CharT *curr = charset.ptr;
CharT *end = curr + charset.length;
size_t i = 0;
negated = !negated;
temp.length = MAX_CHARS - charset.length;
ptr = temp.ptr;
while (curr < end)
{
while (*curr > curr_char)
{
*ptr = curr_char;
++ptr;
++curr_char;
++i;
}
++curr_char;
++curr;
++i;
}
for (; i < MAX_CHARS; ++i)
{
*ptr = curr_char;
++ptr;
++curr_char;
}
charset = temp;
}
bool empty()
{
return charset.length == 0 && !negated;
}
bool any()
{
return charset.length == 0 && negated;
}
void clear()
{
negated = false;
charset.length = 0;
}
void intersect(ref BasicStringToken rhs,
ref BasicStringToken overlap)
{
if ((any() && rhs.any()) || (negated == rhs.negated &&
!any() && !rhs.any()))
{
intersectSameTypes(rhs, overlap);
}
else
{
intersectDiffTypes(rhs, overlap);
}
}
private:
void intersectSameTypes(ref BasicStringToken rhs,
ref BasicStringToken overlap)
{
if (any())
{
clear();
overlap.negated = true;
rhs.clear();
}
else
{
CharT *iter = charset.ptr;
CharT *end = iter + charset.length;
CharT *rhs_iter = rhs.charset.ptr;
CharT *rhs_end = rhs_iter + rhs.charset.length;
overlap.negated = negated;
while (iter != end && rhs_iter != rhs_end)
{
if (*iter < *rhs_iter)
{
++iter;
}
else if (*iter > *rhs_iter)
{
++rhs_iter;
}
else
{
overlap.charset ~= *iter;
memmove(iter, iter + 1, (charset.ptr +
charset.length - iter) * CharT.sizeof);
--end;
charset.length -= 1;
memmove(rhs_iter, rhs_iter + 1, (rhs.charset.ptr +
rhs.charset.length - rhs_iter) * CharT.sizeof);
--rhs_end;
rhs.charset.length -= 1;
}
}
if (negated)
{
// duplicates already merged
// src, dest
merge(charset, overlap.charset);
// duplicates already merged
// src, dest
merge(rhs.charset, overlap.charset);
negated = false;
rhs.negated = false;
swap(charset, rhs.charset);
normalise();
overlap.normalise();
rhs.normalise();
}
else if (!overlap.charset.length == 0)
{
normalise();
overlap.normalise();
rhs.normalise();
}
}
}
void intersectDiffTypes(ref BasicStringToken rhs,
ref BasicStringToken overlap)
{
if (any())
{
intersectAny(rhs, overlap);
}
else if (negated)
{
intersectNegated(rhs, overlap);
}
else // negated == false
{
intersectCharset(rhs, overlap);
}
}
void intersectAny(ref BasicStringToken rhs, ref BasicStringToken overlap)
{
if (rhs.negated)
{
rhs.intersectNegated(this, overlap);
}
else // rhs.negated == false
{
rhs.intersectCharset(this, overlap);
}
}
void intersectNegated(ref BasicStringToken rhs,
ref BasicStringToken overlap)
{
if (rhs.any())
{
overlap.negated = true;
overlap.charset = charset;
rhs.negated = false;
rhs.charset = charset;
clear();
}
else // rhs.negated == false
{
rhs.intersectCharset(this, overlap);
}
}
void intersectCharset(ref BasicStringToken rhs,
ref BasicStringToken overlap)
{
if (rhs.any())
{
overlap.charset = charset;
rhs.negated = true;
rhs.charset = charset;
clear();
}
else // rhs.negated == true
{
CharT *iter = charset.ptr;
CharT *end = iter + charset.length;
CharT *rhs_iter = rhs.charset.ptr;
CharT *rhs_end = rhs_iter + rhs.charset.length;
while (iter != end && rhs_iter != rhs_end)
{
if (*iter < *rhs_iter)
{
overlap.charset ~= *iter;
rhs.charset.length += 1;
rhs_iter = rhs.charset.ptr;
rhs_end = rhs_iter + rhs.charset.length;
memmove(rhs_iter + 1, rhs_iter, (rhs.charset.length -
(rhs_end - rhs_iter - 1)) * CharT.sizeof);
++rhs_iter;
memmove(iter, iter + 1, (charset.ptr +
charset.length - iter) * CharT.sizeof);
charset.length -= 1;
--end;
}
else if (*iter > *rhs_iter)
{
++rhs_iter;
}
else
{
++iter;
++rhs_iter;
}
}
if (iter != end)
{
CharT[] temp;
temp.length = end - iter;
memmove(temp.ptr, iter, temp.length * CharT.sizeof);
// nothing bigger in rhs than iter
// src, dest
merge(temp, overlap.charset);
memmove(iter, iter + 1, (charset.ptr +
charset.length - iter) * CharT.sizeof);
charset.length -= 1;
}
if (!overlap.charset.empty())
{
merge(overlap.charset, rhs.charset);
// possible duplicates, so check for any and erase.
squeeze(rhs.charset);
normalise();
overlap.normalise();
rhs.normalise();
}
}
}
void squeeze(ref CharT[] str)
{
if (str.length > 1)
{
CharT *write = str.ptr;
CharT *end = write + str.length;
CharT *read = write + 1;
while (read != end)
{
while (read != end && *read == *write)
{
++read;
}
if (read == end) break;
++write;
if (read > write)
{
*write = *read;
}
++read;
}
str.length = write + 1 - str.ptr;
}
}
void merge(ref CharT[] src, ref CharT[] dest)
{
CharT[] temp;
CharT *ptr;
CharT *iter = src.ptr;
CharT *end = iter + src.length;
CharT *dest_iter = dest.ptr;
CharT *dest_end = dest_iter + dest.length;
temp.length = src.length + dest.length;
ptr = temp.ptr;
while (iter != end && dest_iter != dest_end)
{
if (*iter < *dest_iter)
{
*ptr++ = *iter++;
}
else
{
*ptr++ = *dest_iter++;
}
}
while (iter != end)
{
*ptr++ = *iter++;
}
while (dest_iter != dest_end)
{
*ptr++ = *dest_iter++;
}
dest = temp;
}
};
}
int main(char[][]argv)
{
regex!(char).BasicStringToken lhs;
regex!(char).BasicStringToken rhs;
regex!(char).BasicStringToken intersect;
lhs.charset = "aaabbc".dup;
lhs.negated = true;
lhs.removeDuplicates();
rhs.charset = "bccddd".dup;
rhs.negated = true;
rhs.removeDuplicates();
writeln(lhs.charset, '(', lhs.negated, ") intersect ",
rhs.charset, '(', rhs.negated, ") =");
lhs.intersect(rhs, intersect);
writeln(lhs.charset, '(', lhs.negated, "), ",
rhs.charset, '(', rhs.negated, "), ",
intersect.charset, '(', intersect.negated, ')');
return 0;
}
More information about the Digitalmars-d
mailing list