Latest string_token Code

Rory McGuire rmcguire at neonova.co.za
Tue Jun 22 06:22:07 PDT 2010


"the string"w

gives you 16bit I believe. postfix with a 'd' should give you 32bit.



On Tue, 22 Jun 2010 15:13:06 +0200, Ben Hanson <Ben.Hanson at tfbplc.co.uk>  
wrote:

> Here's the latest with naming convention (hopefully) followed. I've  
> implemented my
> own squeeze() function and used sizeof in the memmove calls.
>
> How can I specify wide strings for the literals?
>
> Thanks,
>
> Ben
>
> module main;
>
> import std.algorithm;
> import std.array;
> import std.c.string;
> import std.string;
>
> import std.stdio;
>
> template regex(CharT)
> {
> struct BasicStringToken
> {
>     bool negated = false;
>     CharT[] charset;
>     enum size_t MAX_CHARS = CharT.max + 1;
>     enum size_t START_CHAR = cast(CharT) 0x80 < 0 ? 0x80 : 0;
>
>     this(const bool negated_, ref CharT[] charset_)
>     {
>         negated = negated_;
>         charset = charset_;
>     }
>
>     void removeDuplicates()
>     {
>         charset.sort;
>         squeeze(charset);
>     }
>
>     void normalise()
>     {
>         if (charset.length == MAX_CHARS)
>         {
>             negated = !negated;
>             charset.clear();
>         }
>         else if (charset.length > MAX_CHARS / 2)
>         {
>             negate();
>         }
>     }
>
>     void negate()
>     {
>         CharT curr_char = START_CHAR;
>         CharT[] temp;
>         CharT *ptr = cast(CharT *) 0;
>         CharT *curr = charset.ptr;
>         CharT *end = curr + charset.length;
>         size_t i = 0;
>
>         negated = !negated;
>         temp.length = MAX_CHARS - charset.length;
>         ptr = temp.ptr;
>
>         while (curr < end)
>         {
>             while (*curr > curr_char)
>             {
>                 *ptr = curr_char;
>                 ++ptr;
>                 ++curr_char;
>                 ++i;
>             }
>
>             ++curr_char;
>             ++curr;
>             ++i;
>         }
>
>         for (; i < MAX_CHARS; ++i)
>         {
>             *ptr = curr_char;
>             ++ptr;
>             ++curr_char;
>         }
>
>         charset = temp;
>     }
>
>     bool empty()
>     {
>         return charset.length == 0 && !negated;
>     }
>
>     bool any()
>     {
>         return charset.length == 0 && negated;
>     }
>
>     void clear()
>     {
>         negated = false;
>         charset.length = 0;
>     }
>
>     void intersect(ref BasicStringToken rhs,
>         ref BasicStringToken overlap)
>     {
>         if ((any() && rhs.any()) || (negated == rhs.negated &&
>             !any() && !rhs.any()))
>         {
>             intersectSameTypes(rhs, overlap);
>         }
>         else
>         {
>             intersectDiffTypes(rhs, overlap);
>         }
>     }
>
> private:
>     void intersectSameTypes(ref BasicStringToken rhs,
>         ref BasicStringToken overlap)
>     {
>         if (any())
>         {
>             clear();
>             overlap.negated = true;
>             rhs.clear();
>         }
>         else
>         {
>             CharT *iter = charset.ptr;
>             CharT *end = iter + charset.length;
>             CharT *rhs_iter = rhs.charset.ptr;
>             CharT *rhs_end = rhs_iter + rhs.charset.length;
>
>             overlap.negated = negated;
>
>             while (iter != end && rhs_iter != rhs_end)
>             {
>                 if (*iter < *rhs_iter)
>                 {
>                     ++iter;
>                 }
>                 else if (*iter > *rhs_iter)
>                 {
>                     ++rhs_iter;
>                 }
>                 else
>                 {
>                     overlap.charset ~= *iter;
>                     memmove(iter, iter + 1, (charset.ptr +
>                         charset.length - iter) * CharT.sizeof);
>                     --end;
>                     charset.length -= 1;
>                     memmove(rhs_iter, rhs_iter + 1, (rhs.charset.ptr +
>                         rhs.charset.length - rhs_iter) * CharT.sizeof);
>                     --rhs_end;
>                     rhs.charset.length -= 1;
>                 }
>             }
>
>             if (negated)
>             {
>                 // duplicates already merged
>                 // src, dest
>                 merge(charset, overlap.charset);
>                 // duplicates already merged
>                 // src, dest
>                 merge(rhs.charset, overlap.charset);
>                 negated = false;
>                 rhs.negated = false;
>                 swap(charset, rhs.charset);
>                 normalise();
>                 overlap.normalise();
>                 rhs.normalise();
>             }
>             else if (!overlap.charset.length == 0)
>             {
>                 normalise();
>                 overlap.normalise();
>                 rhs.normalise();
>             }
>         }
>     }
>
>     void intersectDiffTypes(ref BasicStringToken rhs,
>         ref BasicStringToken overlap)
>     {
>         if (any())
>         {
>             intersectAny(rhs, overlap);
>         }
>         else if (negated)
>         {
>             intersectNegated(rhs, overlap);
>         }
>         else // negated == false
>         {
>             intersectCharset(rhs, overlap);
>         }
>     }
>
>     void intersectAny(ref BasicStringToken rhs, ref BasicStringToken  
> overlap)
>     {
>         if (rhs.negated)
>         {
>             rhs.intersectNegated(this, overlap);
>         }
>         else // rhs.negated == false
>         {
>             rhs.intersectCharset(this, overlap);
>         }
>     }
>
>     void intersectNegated(ref BasicStringToken rhs,
>         ref BasicStringToken overlap)
>     {
>         if (rhs.any())
>         {
>             overlap.negated = true;
>             overlap.charset = charset;
>             rhs.negated = false;
>             rhs.charset = charset;
>             clear();
>         }
>         else // rhs.negated == false
>         {
>             rhs.intersectCharset(this, overlap);
>         }
>     }
>
>     void intersectCharset(ref BasicStringToken rhs,
>         ref BasicStringToken overlap)
>     {
>         if (rhs.any())
>         {
>             overlap.charset = charset;
>             rhs.negated = true;
>             rhs.charset = charset;
>             clear();
>         }
>         else // rhs.negated == true
>         {
>             CharT *iter = charset.ptr;
>             CharT *end = iter + charset.length;
>             CharT *rhs_iter = rhs.charset.ptr;
>             CharT *rhs_end = rhs_iter + rhs.charset.length;
>
>             while (iter != end && rhs_iter != rhs_end)
>             {
>                 if (*iter < *rhs_iter)
>                 {
>                     overlap.charset ~= *iter;
>                     rhs.charset.length += 1;
>                     rhs_iter = rhs.charset.ptr;
>                     rhs_end = rhs_iter + rhs.charset.length;
>                     memmove(rhs_iter + 1, rhs_iter, (rhs.charset.length -
>                         (rhs_end - rhs_iter - 1)) * CharT.sizeof);
>                     ++rhs_iter;
>                     memmove(iter, iter + 1, (charset.ptr +
>                         charset.length - iter) * CharT.sizeof);
>                     charset.length -= 1;
>                     --end;
>                 }
>                 else if (*iter > *rhs_iter)
>                 {
>                     ++rhs_iter;
>                 }
>                 else
>                 {
>                     ++iter;
>                     ++rhs_iter;
>                 }
>             }
>
>             if (iter != end)
>             {
>                 CharT[] temp;
>
>                 temp.length = end - iter;
>                 memmove(temp.ptr, iter, temp.length * CharT.sizeof);
>
>                 // nothing bigger in rhs than iter
>                 // src, dest
>                 merge(temp, overlap.charset);
>                 memmove(iter, iter + 1, (charset.ptr +
>                     charset.length - iter) * CharT.sizeof);
>                 charset.length -= 1;
>             }
>
>             if (!overlap.charset.empty())
>             {
>                 merge(overlap.charset, rhs.charset);
>                 // possible duplicates, so check for any and erase.
>                 squeeze(rhs.charset);
>                 normalise();
>                 overlap.normalise();
>                 rhs.normalise();
>             }
>         }
>     }
>
>     void squeeze(ref CharT[] str)
>     {
>         if (str.length > 1)
>         {
>             CharT *write = str.ptr;
>             CharT *end = write + str.length;
>             CharT *read = write + 1;
>
>             while (read != end)
>             {
>                 while (read != end && *read == *write)
>                 {
>                     ++read;
>                 }
>
>                 if (read == end) break;
>
>                 ++write;
>
>                 if (read > write)
>                 {
>                     *write = *read;
>                 }
>
>                 ++read;
>             }
>
>             str.length = write + 1 - str.ptr;
>         }
>     }
>
>     void merge(ref CharT[] src, ref CharT[] dest)
>     {
>         CharT[] temp;
>         CharT *ptr;
>         CharT *iter = src.ptr;
>         CharT *end = iter + src.length;
>         CharT *dest_iter = dest.ptr;
>         CharT *dest_end = dest_iter + dest.length;
>
>         temp.length = src.length + dest.length;
>         ptr = temp.ptr;
>
>         while (iter != end && dest_iter != dest_end)
>         {
>             if (*iter < *dest_iter)
>             {
>                 *ptr++ = *iter++;
>             }
>             else
>             {
>                 *ptr++ = *dest_iter++;
>             }
>         }
>
>         while (iter != end)
>         {
>             *ptr++ = *iter++;
>         }
>
>         while (dest_iter != dest_end)
>         {
>             *ptr++ = *dest_iter++;
>         }
>
>         dest = temp;
>     }
> };
> }
>
> int main(char[][]argv)
> {
>     regex!(char).BasicStringToken lhs;
>     regex!(char).BasicStringToken rhs;
>     regex!(char).BasicStringToken intersect;
>
>     lhs.charset = "aaabbc".dup;
>     lhs.negated = true;
>     lhs.removeDuplicates();
>     rhs.charset = "bccddd".dup;
>     rhs.negated = true;
>     rhs.removeDuplicates();
>     writeln(lhs.charset, '(', lhs.negated, ") intersect ",
>         rhs.charset, '(', rhs.negated, ") =");
>     lhs.intersect(rhs, intersect);
>     writeln(lhs.charset, '(', lhs.negated, "), ",
>         rhs.charset, '(', rhs.negated, "), ",
>         intersect.charset, '(', intersect.negated, ')');
>     return 0;
> }


More information about the Digitalmars-d mailing list