Latest string_token Code
Ben Hanson
Ben.Hanson at tfbplc.co.uk
Tue Jun 22 06:31:14 PDT 2010
== Quote from Rory McGuire (rmcguire at neonova.co.za)'s article
> On Tue, 22 Jun 2010 15:13:06 +0200, Ben Hanson <Ben.Hanson at tfbplc.co.uk>
> wrote:
> > Here's the latest with naming convention (hopefully) followed. I've
> > implemented my
> > own squeeze() function and used sizeof in the memmove calls.
> >
> > How can I specify wide strings for the literals?
> >
> > Thanks,
> >
> > Ben
> >
> > module main;
> >
> > import std.algorithm;
> > import std.array;
> > import std.c.string;
> > import std.string;
> >
> > import std.stdio;
> >
> > template regex(CharT)
> > {
> > struct BasicStringToken
> > {
> > bool negated = false;
> > CharT[] charset;
> > enum size_t MAX_CHARS = CharT.max + 1;
> > enum size_t START_CHAR = cast(CharT) 0x80 < 0 ? 0x80 : 0;
> >
> > this(const bool negated_, ref CharT[] charset_)
> > {
> > negated = negated_;
> > charset = charset_;
> > }
> >
> > void removeDuplicates()
> > {
> > charset.sort;
> > squeeze(charset);
> > }
> >
> > void normalise()
> > {
> > if (charset.length == MAX_CHARS)
> > {
> > negated = !negated;
> > charset.clear();
> > }
> > else if (charset.length > MAX_CHARS / 2)
> > {
> > negate();
> > }
> > }
> >
> > void negate()
> > {
> > CharT curr_char = START_CHAR;
> > CharT[] temp;
> > CharT *ptr = cast(CharT *) 0;
> > CharT *curr = charset.ptr;
> > CharT *end = curr + charset.length;
> > size_t i = 0;
> >
> > negated = !negated;
> > temp.length = MAX_CHARS - charset.length;
> > ptr = temp.ptr;
> >
> > while (curr < end)
> > {
> > while (*curr > curr_char)
> > {
> > *ptr = curr_char;
> > ++ptr;
> > ++curr_char;
> > ++i;
> > }
> >
> > ++curr_char;
> > ++curr;
> > ++i;
> > }
> >
> > for (; i < MAX_CHARS; ++i)
> > {
> > *ptr = curr_char;
> > ++ptr;
> > ++curr_char;
> > }
> >
> > charset = temp;
> > }
> >
> > bool empty()
> > {
> > return charset.length == 0 && !negated;
> > }
> >
> > bool any()
> > {
> > return charset.length == 0 && negated;
> > }
> >
> > void clear()
> > {
> > negated = false;
> > charset.length = 0;
> > }
> >
> > void intersect(ref BasicStringToken rhs,
> > ref BasicStringToken overlap)
> > {
> > if ((any() && rhs.any()) || (negated == rhs.negated &&
> > !any() && !rhs.any()))
> > {
> > intersectSameTypes(rhs, overlap);
> > }
> > else
> > {
> > intersectDiffTypes(rhs, overlap);
> > }
> > }
> >
> > private:
> > void intersectSameTypes(ref BasicStringToken rhs,
> > ref BasicStringToken overlap)
> > {
> > if (any())
> > {
> > clear();
> > overlap.negated = true;
> > rhs.clear();
> > }
> > else
> > {
> > CharT *iter = charset.ptr;
> > CharT *end = iter + charset.length;
> > CharT *rhs_iter = rhs.charset.ptr;
> > CharT *rhs_end = rhs_iter + rhs.charset.length;
> >
> > overlap.negated = negated;
> >
> > while (iter != end && rhs_iter != rhs_end)
> > {
> > if (*iter < *rhs_iter)
> > {
> > ++iter;
> > }
> > else if (*iter > *rhs_iter)
> > {
> > ++rhs_iter;
> > }
> > else
> > {
> > overlap.charset ~= *iter;
> > memmove(iter, iter + 1, (charset.ptr +
> > charset.length - iter) * CharT.sizeof);
> > --end;
> > charset.length -= 1;
> > memmove(rhs_iter, rhs_iter + 1, (rhs.charset.ptr +
> > rhs.charset.length - rhs_iter) * CharT.sizeof);
> > --rhs_end;
> > rhs.charset.length -= 1;
> > }
> > }
> >
> > if (negated)
> > {
> > // duplicates already merged
> > // src, dest
> > merge(charset, overlap.charset);
> > // duplicates already merged
> > // src, dest
> > merge(rhs.charset, overlap.charset);
> > negated = false;
> > rhs.negated = false;
> > swap(charset, rhs.charset);
> > normalise();
> > overlap.normalise();
> > rhs.normalise();
> > }
> > else if (!overlap.charset.length == 0)
> > {
> > normalise();
> > overlap.normalise();
> > rhs.normalise();
> > }
> > }
> > }
> >
> > void intersectDiffTypes(ref BasicStringToken rhs,
> > ref BasicStringToken overlap)
> > {
> > if (any())
> > {
> > intersectAny(rhs, overlap);
> > }
> > else if (negated)
> > {
> > intersectNegated(rhs, overlap);
> > }
> > else // negated == false
> > {
> > intersectCharset(rhs, overlap);
> > }
> > }
> >
> > void intersectAny(ref BasicStringToken rhs, ref BasicStringToken
> > overlap)
> > {
> > if (rhs.negated)
> > {
> > rhs.intersectNegated(this, overlap);
> > }
> > else // rhs.negated == false
> > {
> > rhs.intersectCharset(this, overlap);
> > }
> > }
> >
> > void intersectNegated(ref BasicStringToken rhs,
> > ref BasicStringToken overlap)
> > {
> > if (rhs.any())
> > {
> > overlap.negated = true;
> > overlap.charset = charset;
> > rhs.negated = false;
> > rhs.charset = charset;
> > clear();
> > }
> > else // rhs.negated == false
> > {
> > rhs.intersectCharset(this, overlap);
> > }
> > }
> >
> > void intersectCharset(ref BasicStringToken rhs,
> > ref BasicStringToken overlap)
> > {
> > if (rhs.any())
> > {
> > overlap.charset = charset;
> > rhs.negated = true;
> > rhs.charset = charset;
> > clear();
> > }
> > else // rhs.negated == true
> > {
> > CharT *iter = charset.ptr;
> > CharT *end = iter + charset.length;
> > CharT *rhs_iter = rhs.charset.ptr;
> > CharT *rhs_end = rhs_iter + rhs.charset.length;
> >
> > while (iter != end && rhs_iter != rhs_end)
> > {
> > if (*iter < *rhs_iter)
> > {
> > overlap.charset ~= *iter;
> > rhs.charset.length += 1;
> > rhs_iter = rhs.charset.ptr;
> > rhs_end = rhs_iter + rhs.charset.length;
> > memmove(rhs_iter + 1, rhs_iter, (rhs.charset.length -
> > (rhs_end - rhs_iter - 1)) * CharT.sizeof);
> > ++rhs_iter;
> > memmove(iter, iter + 1, (charset.ptr +
> > charset.length - iter) * CharT.sizeof);
> > charset.length -= 1;
> > --end;
> > }
> > else if (*iter > *rhs_iter)
> > {
> > ++rhs_iter;
> > }
> > else
> > {
> > ++iter;
> > ++rhs_iter;
> > }
> > }
> >
> > if (iter != end)
> > {
> > CharT[] temp;
> >
> > temp.length = end - iter;
> > memmove(temp.ptr, iter, temp.length * CharT.sizeof);
> >
> > // nothing bigger in rhs than iter
> > // src, dest
> > merge(temp, overlap.charset);
> > memmove(iter, iter + 1, (charset.ptr +
> > charset.length - iter) * CharT.sizeof);
> > charset.length -= 1;
> > }
> >
> > if (!overlap.charset.empty())
> > {
> > merge(overlap.charset, rhs.charset);
> > // possible duplicates, so check for any and erase.
> > squeeze(rhs.charset);
> > normalise();
> > overlap.normalise();
> > rhs.normalise();
> > }
> > }
> > }
> >
> > void squeeze(ref CharT[] str)
> > {
> > if (str.length > 1)
> > {
> > CharT *write = str.ptr;
> > CharT *end = write + str.length;
> > CharT *read = write + 1;
> >
> > while (read != end)
> > {
> > while (read != end && *read == *write)
> > {
> > ++read;
> > }
> >
> > if (read == end) break;
> >
> > ++write;
> >
> > if (read > write)
> > {
> > *write = *read;
> > }
> >
> > ++read;
> > }
> >
> > str.length = write + 1 - str.ptr;
> > }
> > }
> >
> > void merge(ref CharT[] src, ref CharT[] dest)
> > {
> > CharT[] temp;
> > CharT *ptr;
> > CharT *iter = src.ptr;
> > CharT *end = iter + src.length;
> > CharT *dest_iter = dest.ptr;
> > CharT *dest_end = dest_iter + dest.length;
> >
> > temp.length = src.length + dest.length;
> > ptr = temp.ptr;
> >
> > while (iter != end && dest_iter != dest_end)
> > {
> > if (*iter < *dest_iter)
> > {
> > *ptr++ = *iter++;
> > }
> > else
> > {
> > *ptr++ = *dest_iter++;
> > }
> > }
> >
> > while (iter != end)
> > {
> > *ptr++ = *iter++;
> > }
> >
> > while (dest_iter != dest_end)
> > {
> > *ptr++ = *dest_iter++;
> > }
> >
> > dest = temp;
> > }
> > };
> > }
> >
> > int main(char[][]argv)
> > {
> > regex!(char).BasicStringToken lhs;
> > regex!(char).BasicStringToken rhs;
> > regex!(char).BasicStringToken intersect;
> >
> > lhs.charset = "aaabbc".dup;
> > lhs.negated = true;
> > lhs.removeDuplicates();
> > rhs.charset = "bccddd".dup;
> > rhs.negated = true;
> > rhs.removeDuplicates();
> > writeln(lhs.charset, '(', lhs.negated, ") intersect ",
> > rhs.charset, '(', rhs.negated, ") =");
> > lhs.intersect(rhs, intersect);
> > writeln(lhs.charset, '(', lhs.negated, "), ",
> > rhs.charset, '(', rhs.negated, "), ",
> > intersect.charset, '(', intersect.negated, ')');
> > return 0;
> > }
> "the string"w
> gives you 16bit I believe. postfix with a 'd' should give you 32bit.
Thanks. The problem now is that sort() corrupts the strings. Does anyone know why?
Regards,
Ben
More information about the Digitalmars-d
mailing list