Latest string_token Code
Rory McGuire
rmcguire at neonova.co.za
Tue Jun 22 08:14:50 PDT 2010
On Tue, 22 Jun 2010 16:37:38 +0200, Ben Hanson <Ben.Hanson at tfbplc.co.uk>
wrote:
> == Quote from Rory McGuire (rmcguire at neonova.co.za)'s article
>> On Tue, 22 Jun 2010 15:31:14 +0200, Ben Hanson <Ben.Hanson at tfbplc.co.uk>
>> wrote:
>> > == Quote from Rory McGuire (rmcguire at neonova.co.za)'s article
>> >> On Tue, 22 Jun 2010 15:13:06 +0200, Ben Hanson
>> <Ben.Hanson at tfbplc.co.uk>
>> >> wrote:
>> >> > Here's the latest with naming convention (hopefully) followed. I've
>> >> > implemented my
>> >> > own squeeze() function and used sizeof in the memmove calls.
>> >> >
>> >> > How can I specify wide strings for the literals?
>> >> >
>> >> > Thanks,
>> >> >
>> >> > Ben
>> >> >
>> >> > module main;
>> >> >
>> >> > import std.algorithm;
>> >> > import std.array;
>> >> > import std.c.string;
>> >> > import std.string;
>> >> >
>> >> > import std.stdio;
>> >> >
>> >> > template regex(CharT)
>> >> > {
>> >> > struct BasicStringToken
>> >> > {
>> >> > bool negated = false;
>> >> > CharT[] charset;
>> >> > enum size_t MAX_CHARS = CharT.max + 1;
>> >> > enum size_t START_CHAR = cast(CharT) 0x80 < 0 ? 0x80 : 0;
>> >> >
>> >> > this(const bool negated_, ref CharT[] charset_)
>> >> > {
>> >> > negated = negated_;
>> >> > charset = charset_;
>> >> > }
>> >> >
>> >> > void removeDuplicates()
>> >> > {
>> >> > charset.sort;
>> >> > squeeze(charset);
>> >> > }
>> >> >
>> >> > void normalise()
>> >> > {
>> >> > if (charset.length == MAX_CHARS)
>> >> > {
>> >> > negated = !negated;
>> >> > charset.clear();
>> >> > }
>> >> > else if (charset.length > MAX_CHARS / 2)
>> >> > {
>> >> > negate();
>> >> > }
>> >> > }
>> >> >
>> >> > void negate()
>> >> > {
>> >> > CharT curr_char = START_CHAR;
>> >> > CharT[] temp;
>> >> > CharT *ptr = cast(CharT *) 0;
>> >> > CharT *curr = charset.ptr;
>> >> > CharT *end = curr + charset.length;
>> >> > size_t i = 0;
>> >> >
>> >> > negated = !negated;
>> >> > temp.length = MAX_CHARS - charset.length;
>> >> > ptr = temp.ptr;
>> >> >
>> >> > while (curr < end)
>> >> > {
>> >> > while (*curr > curr_char)
>> >> > {
>> >> > *ptr = curr_char;
>> >> > ++ptr;
>> >> > ++curr_char;
>> >> > ++i;
>> >> > }
>> >> >
>> >> > ++curr_char;
>> >> > ++curr;
>> >> > ++i;
>> >> > }
>> >> >
>> >> > for (; i < MAX_CHARS; ++i)
>> >> > {
>> >> > *ptr = curr_char;
>> >> > ++ptr;
>> >> > ++curr_char;
>> >> > }
>> >> >
>> >> > charset = temp;
>> >> > }
>> >> >
>> >> > bool empty()
>> >> > {
>> >> > return charset.length == 0 && !negated;
>> >> > }
>> >> >
>> >> > bool any()
>> >> > {
>> >> > return charset.length == 0 && negated;
>> >> > }
>> >> >
>> >> > void clear()
>> >> > {
>> >> > negated = false;
>> >> > charset.length = 0;
>> >> > }
>> >> >
>> >> > void intersect(ref BasicStringToken rhs,
>> >> > ref BasicStringToken overlap)
>> >> > {
>> >> > if ((any() && rhs.any()) || (negated == rhs.negated &&
>> >> > !any() && !rhs.any()))
>> >> > {
>> >> > intersectSameTypes(rhs, overlap);
>> >> > }
>> >> > else
>> >> > {
>> >> > intersectDiffTypes(rhs, overlap);
>> >> > }
>> >> > }
>> >> >
>> >> > private:
>> >> > void intersectSameTypes(ref BasicStringToken rhs,
>> >> > ref BasicStringToken overlap)
>> >> > {
>> >> > if (any())
>> >> > {
>> >> > clear();
>> >> > overlap.negated = true;
>> >> > rhs.clear();
>> >> > }
>> >> > else
>> >> > {
>> >> > CharT *iter = charset.ptr;
>> >> > CharT *end = iter + charset.length;
>> >> > CharT *rhs_iter = rhs.charset.ptr;
>> >> > CharT *rhs_end = rhs_iter + rhs.charset.length;
>> >> >
>> >> > overlap.negated = negated;
>> >> >
>> >> > while (iter != end && rhs_iter != rhs_end)
>> >> > {
>> >> > if (*iter < *rhs_iter)
>> >> > {
>> >> > ++iter;
>> >> > }
>> >> > else if (*iter > *rhs_iter)
>> >> > {
>> >> > ++rhs_iter;
>> >> > }
>> >> > else
>> >> > {
>> >> > overlap.charset ~= *iter;
>> >> > memmove(iter, iter + 1, (charset.ptr +
>> >> > charset.length - iter) * CharT.sizeof);
>> >> > --end;
>> >> > charset.length -= 1;
>> >> > memmove(rhs_iter, rhs_iter + 1,
>> (rhs.charset.ptr +
>> >> > rhs.charset.length - rhs_iter) *
>> >> CharT.sizeof);
>> >> > --rhs_end;
>> >> > rhs.charset.length -= 1;
>> >> > }
>> >> > }
>> >> >
>> >> > if (negated)
>> >> > {
>> >> > // duplicates already merged
>> >> > // src, dest
>> >> > merge(charset, overlap.charset);
>> >> > // duplicates already merged
>> >> > // src, dest
>> >> > merge(rhs.charset, overlap.charset);
>> >> > negated = false;
>> >> > rhs.negated = false;
>> >> > swap(charset, rhs.charset);
>> >> > normalise();
>> >> > overlap.normalise();
>> >> > rhs.normalise();
>> >> > }
>> >> > else if (!overlap.charset.length == 0)
>> >> > {
>> >> > normalise();
>> >> > overlap.normalise();
>> >> > rhs.normalise();
>> >> > }
>> >> > }
>> >> > }
>> >> >
>> >> > void intersectDiffTypes(ref BasicStringToken rhs,
>> >> > ref BasicStringToken overlap)
>> >> > {
>> >> > if (any())
>> >> > {
>> >> > intersectAny(rhs, overlap);
>> >> > }
>> >> > else if (negated)
>> >> > {
>> >> > intersectNegated(rhs, overlap);
>> >> > }
>> >> > else // negated == false
>> >> > {
>> >> > intersectCharset(rhs, overlap);
>> >> > }
>> >> > }
>> >> >
>> >> > void intersectAny(ref BasicStringToken rhs, ref
>> BasicStringToken
>> >> > overlap)
>> >> > {
>> >> > if (rhs.negated)
>> >> > {
>> >> > rhs.intersectNegated(this, overlap);
>> >> > }
>> >> > else // rhs.negated == false
>> >> > {
>> >> > rhs.intersectCharset(this, overlap);
>> >> > }
>> >> > }
>> >> >
>> >> > void intersectNegated(ref BasicStringToken rhs,
>> >> > ref BasicStringToken overlap)
>> >> > {
>> >> > if (rhs.any())
>> >> > {
>> >> > overlap.negated = true;
>> >> > overlap.charset = charset;
>> >> > rhs.negated = false;
>> >> > rhs.charset = charset;
>> >> > clear();
>> >> > }
>> >> > else // rhs.negated == false
>> >> > {
>> >> > rhs.intersectCharset(this, overlap);
>> >> > }
>> >> > }
>> >> >
>> >> > void intersectCharset(ref BasicStringToken rhs,
>> >> > ref BasicStringToken overlap)
>> >> > {
>> >> > if (rhs.any())
>> >> > {
>> >> > overlap.charset = charset;
>> >> > rhs.negated = true;
>> >> > rhs.charset = charset;
>> >> > clear();
>> >> > }
>> >> > else // rhs.negated == true
>> >> > {
>> >> > CharT *iter = charset.ptr;
>> >> > CharT *end = iter + charset.length;
>> >> > CharT *rhs_iter = rhs.charset.ptr;
>> >> > CharT *rhs_end = rhs_iter + rhs.charset.length;
>> >> >
>> >> > while (iter != end && rhs_iter != rhs_end)
>> >> > {
>> >> > if (*iter < *rhs_iter)
>> >> > {
>> >> > overlap.charset ~= *iter;
>> >> > rhs.charset.length += 1;
>> >> > rhs_iter = rhs.charset.ptr;
>> >> > rhs_end = rhs_iter + rhs.charset.length;
>> >> > memmove(rhs_iter + 1, rhs_iter,
>> >> (rhs.charset.length -
>> >> > (rhs_end - rhs_iter - 1)) * CharT.sizeof);
>> >> > ++rhs_iter;
>> >> > memmove(iter, iter + 1, (charset.ptr +
>> >> > charset.length - iter) * CharT.sizeof);
>> >> > charset.length -= 1;
>> >> > --end;
>> >> > }
>> >> > else if (*iter > *rhs_iter)
>> >> > {
>> >> > ++rhs_iter;
>> >> > }
>> >> > else
>> >> > {
>> >> > ++iter;
>> >> > ++rhs_iter;
>> >> > }
>> >> > }
>> >> >
>> >> > if (iter != end)
>> >> > {
>> >> > CharT[] temp;
>> >> >
>> >> > temp.length = end - iter;
>> >> > memmove(temp.ptr, iter, temp.length *
>> CharT.sizeof);
>> >> >
>> >> > // nothing bigger in rhs than iter
>> >> > // src, dest
>> >> > merge(temp, overlap.charset);
>> >> > memmove(iter, iter + 1, (charset.ptr +
>> >> > charset.length - iter) * CharT.sizeof);
>> >> > charset.length -= 1;
>> >> > }
>> >> >
>> >> > if (!overlap.charset.empty())
>> >> > {
>> >> > merge(overlap.charset, rhs.charset);
>> >> > // possible duplicates, so check for any and erase.
>> >> > squeeze(rhs.charset);
>> >> > normalise();
>> >> > overlap.normalise();
>> >> > rhs.normalise();
>> >> > }
>> >> > }
>> >> > }
>> >> >
>> >> > void squeeze(ref CharT[] str)
>> >> > {
>> >> > if (str.length > 1)
>> >> > {
>> >> > CharT *write = str.ptr;
>> >> > CharT *end = write + str.length;
>> >> > CharT *read = write + 1;
>> >> >
>> >> > while (read != end)
>> >> > {
>> >> > while (read != end && *read == *write)
>> >> > {
>> >> > ++read;
>> >> > }
>> >> >
>> >> > if (read == end) break;
>> >> >
>> >> > ++write;
>> >> >
>> >> > if (read > write)
>> >> > {
>> >> > *write = *read;
>> >> > }
>> >> >
>> >> > ++read;
>> >> > }
>> >> >
>> >> > str.length = write + 1 - str.ptr;
>> >> > }
>> >> > }
>> >> >
>> >> > void merge(ref CharT[] src, ref CharT[] dest)
>> >> > {
>> >> > CharT[] temp;
>> >> > CharT *ptr;
>> >> > CharT *iter = src.ptr;
>> >> > CharT *end = iter + src.length;
>> >> > CharT *dest_iter = dest.ptr;
>> >> > CharT *dest_end = dest_iter + dest.length;
>> >> >
>> >> > temp.length = src.length + dest.length;
>> >> > ptr = temp.ptr;
>> >> >
>> >> > while (iter != end && dest_iter != dest_end)
>> >> > {
>> >> > if (*iter < *dest_iter)
>> >> > {
>> >> > *ptr++ = *iter++;
>> >> > }
>> >> > else
>> >> > {
>> >> > *ptr++ = *dest_iter++;
>> >> > }
>> >> > }
>> >> >
>> >> > while (iter != end)
>> >> > {
>> >> > *ptr++ = *iter++;
>> >> > }
>> >> >
>> >> > while (dest_iter != dest_end)
>> >> > {
>> >> > *ptr++ = *dest_iter++;
>> >> > }
>> >> >
>> >> > dest = temp;
>> >> > }
>> >> > };
>> >> > }
>> >> >
>> >> > int main(char[][]argv)
>> >> > {
>> >> > regex!(char).BasicStringToken lhs;
>> >> > regex!(char).BasicStringToken rhs;
>> >> > regex!(char).BasicStringToken intersect;
>> >> >
>> >> > lhs.charset = "aaabbc".dup;
>> >> > lhs.negated = true;
>> >> > lhs.removeDuplicates();
>> >> > rhs.charset = "bccddd".dup;
>> >> > rhs.negated = true;
>> >> > rhs.removeDuplicates();
>> >> > writeln(lhs.charset, '(', lhs.negated, ") intersect ",
>> >> > rhs.charset, '(', rhs.negated, ") =");
>> >> > lhs.intersect(rhs, intersect);
>> >> > writeln(lhs.charset, '(', lhs.negated, "), ",
>> >> > rhs.charset, '(', rhs.negated, "), ",
>> >> > intersect.charset, '(', intersect.negated, ')');
>> >> > return 0;
>> >> > }
>> >
>> >> "the string"w
>> >> gives you 16bit I believe. postfix with a 'd' should give you 32bit.
>> >
>> > Thanks. The problem now is that sort() corrupts the strings. Does
>> anyone
>> > know why?
>> >
>> > Regards,
>> >
>> > Ben
>> perhaps from mixing wide chars with CharT if CharT is 8bits?
>> Honestly I havn't read your code but that is just the likely scenario.
>
> I don't think so:
>
> int main(char[][]argv)
> {
> regex!(wchar).BasicStringToken lhs;
> regex!(wchar).BasicStringToken rhs;
> regex!(wchar).BasicStringToken intersect;
>
> lhs.charset = "aaabbc"w.dup;
> lhs.negated = true;
> lhs.removeDuplicates();
> rhs.charset = "bccddd"w.dup;
> rhs.negated = true;
> rhs.removeDuplicates();
> writeln(lhs.charset, '(', lhs.negated, ") intersect ",
> rhs.charset, '(', rhs.negated, ") =");
> lhs.intersect(rhs, intersect);
> writeln(lhs.charset, '(', lhs.negated, "), ",
> rhs.charset, '(', rhs.negated, "), ",
> intersect.charset, '(', intersect.negated, ')');
> return 0;
> }
hmm, that does seem strange, it seems to work with char and dchar but not
wchar.
-Rory
More information about the Digitalmars-d
mailing list