Latest string_token Code
    Ben Hanson 
    Ben.Hanson at tfbplc.co.uk
       
    Tue Jun 22 06:13:06 PDT 2010
    
    
  
Here's the latest with naming convention (hopefully) followed. I've implemented my
own squeeze() function and used sizeof in the memmove calls.
How can I specify wide strings for the literals?
Thanks,
Ben
module main;
import std.algorithm;
import std.array;
import std.c.string;
import std.string;
import std.stdio;
template regex(CharT)
{
struct BasicStringToken
{
    bool negated = false;
    CharT[] charset;
    enum size_t MAX_CHARS = CharT.max + 1;
    enum size_t START_CHAR = cast(CharT) 0x80 < 0 ? 0x80 : 0;
    this(const bool negated_, ref CharT[] charset_)
    {
        negated = negated_;
        charset = charset_;
    }
    void removeDuplicates()
    {
        charset.sort;
        squeeze(charset);
    }
    void normalise()
    {
        if (charset.length == MAX_CHARS)
        {
            negated = !negated;
            charset.clear();
        }
        else if (charset.length > MAX_CHARS / 2)
        {
            negate();
        }
    }
    void negate()
    {
        CharT curr_char = START_CHAR;
        CharT[] temp;
        CharT *ptr = cast(CharT *) 0;
        CharT *curr = charset.ptr;
        CharT *end = curr + charset.length;
        size_t i = 0;
        negated = !negated;
        temp.length = MAX_CHARS - charset.length;
        ptr = temp.ptr;
        while (curr < end)
        {
            while (*curr > curr_char)
            {
                *ptr = curr_char;
                ++ptr;
                ++curr_char;
                ++i;
            }
            ++curr_char;
            ++curr;
            ++i;
        }
        for (; i < MAX_CHARS; ++i)
        {
            *ptr = curr_char;
            ++ptr;
            ++curr_char;
        }
        charset = temp;
    }
    bool empty()
    {
        return charset.length == 0 && !negated;
    }
    bool any()
    {
        return charset.length == 0 && negated;
    }
    void clear()
    {
        negated = false;
        charset.length = 0;
    }
    void intersect(ref BasicStringToken rhs,
        ref BasicStringToken overlap)
    {
        if ((any() && rhs.any()) || (negated == rhs.negated &&
            !any() && !rhs.any()))
        {
            intersectSameTypes(rhs, overlap);
        }
        else
        {
            intersectDiffTypes(rhs, overlap);
        }
    }
private:
    void intersectSameTypes(ref BasicStringToken rhs,
        ref BasicStringToken overlap)
    {
        if (any())
        {
            clear();
            overlap.negated = true;
            rhs.clear();
        }
        else
        {
            CharT *iter = charset.ptr;
            CharT *end = iter + charset.length;
            CharT *rhs_iter = rhs.charset.ptr;
            CharT *rhs_end = rhs_iter + rhs.charset.length;
            overlap.negated = negated;
            while (iter != end && rhs_iter != rhs_end)
            {
                if (*iter < *rhs_iter)
                {
                    ++iter;
                }
                else if (*iter > *rhs_iter)
                {
                    ++rhs_iter;
                }
                else
                {
                    overlap.charset ~= *iter;
                    memmove(iter, iter + 1, (charset.ptr +
                        charset.length - iter) * CharT.sizeof);
                    --end;
                    charset.length -= 1;
                    memmove(rhs_iter, rhs_iter + 1, (rhs.charset.ptr +
                        rhs.charset.length - rhs_iter) * CharT.sizeof);
                    --rhs_end;
                    rhs.charset.length -= 1;
                }
            }
            if (negated)
            {
                // duplicates already merged
                // src, dest
                merge(charset, overlap.charset);
                // duplicates already merged
                // src, dest
                merge(rhs.charset, overlap.charset);
                negated = false;
                rhs.negated = false;
                swap(charset, rhs.charset);
                normalise();
                overlap.normalise();
                rhs.normalise();
            }
            else if (!overlap.charset.length == 0)
            {
                normalise();
                overlap.normalise();
                rhs.normalise();
            }
        }
    }
    void intersectDiffTypes(ref BasicStringToken rhs,
        ref BasicStringToken overlap)
    {
        if (any())
        {
            intersectAny(rhs, overlap);
        }
        else if (negated)
        {
            intersectNegated(rhs, overlap);
        }
        else // negated == false
        {
            intersectCharset(rhs, overlap);
        }
    }
    void intersectAny(ref BasicStringToken rhs, ref BasicStringToken overlap)
    {
        if (rhs.negated)
        {
            rhs.intersectNegated(this, overlap);
        }
        else // rhs.negated == false
        {
            rhs.intersectCharset(this, overlap);
        }
    }
    void intersectNegated(ref BasicStringToken rhs,
        ref BasicStringToken overlap)
    {
        if (rhs.any())
        {
            overlap.negated = true;
            overlap.charset = charset;
            rhs.negated = false;
            rhs.charset = charset;
            clear();
        }
        else // rhs.negated == false
        {
            rhs.intersectCharset(this, overlap);
        }
    }
    void intersectCharset(ref BasicStringToken rhs,
        ref BasicStringToken overlap)
    {
        if (rhs.any())
        {
            overlap.charset = charset;
            rhs.negated = true;
            rhs.charset = charset;
            clear();
        }
        else // rhs.negated == true
        {
            CharT *iter = charset.ptr;
            CharT *end = iter + charset.length;
            CharT *rhs_iter = rhs.charset.ptr;
            CharT *rhs_end = rhs_iter + rhs.charset.length;
            while (iter != end && rhs_iter != rhs_end)
            {
                if (*iter < *rhs_iter)
                {
                    overlap.charset ~= *iter;
                    rhs.charset.length += 1;
                    rhs_iter = rhs.charset.ptr;
                    rhs_end = rhs_iter + rhs.charset.length;
                    memmove(rhs_iter + 1, rhs_iter, (rhs.charset.length -
                        (rhs_end - rhs_iter - 1)) * CharT.sizeof);
                    ++rhs_iter;
                    memmove(iter, iter + 1, (charset.ptr +
                        charset.length - iter) * CharT.sizeof);
                    charset.length -= 1;
                    --end;
                }
                else if (*iter > *rhs_iter)
                {
                    ++rhs_iter;
                }
                else
                {
                    ++iter;
                    ++rhs_iter;
                }
            }
            if (iter != end)
            {
                CharT[] temp;
                temp.length = end - iter;
                memmove(temp.ptr, iter, temp.length * CharT.sizeof);
                // nothing bigger in rhs than iter
                // src, dest
                merge(temp, overlap.charset);
                memmove(iter, iter + 1, (charset.ptr +
                    charset.length - iter) * CharT.sizeof);
                charset.length -= 1;
            }
            if (!overlap.charset.empty())
            {
                merge(overlap.charset, rhs.charset);
                // possible duplicates, so check for any and erase.
                squeeze(rhs.charset);
                normalise();
                overlap.normalise();
                rhs.normalise();
            }
        }
    }
    void squeeze(ref CharT[] str)
    {
        if (str.length > 1)
        {
            CharT *write = str.ptr;
            CharT *end = write + str.length;
            CharT *read = write + 1;
            while (read != end)
            {
                while (read != end && *read == *write)
                {
                    ++read;
                }
                if (read == end) break;
                ++write;
                if (read > write)
                {
                    *write = *read;
                }
                ++read;
            }
            str.length = write + 1 - str.ptr;
        }
    }
    void merge(ref CharT[] src, ref CharT[] dest)
    {
        CharT[] temp;
        CharT *ptr;
        CharT *iter = src.ptr;
        CharT *end = iter + src.length;
        CharT *dest_iter = dest.ptr;
        CharT *dest_end = dest_iter + dest.length;
        temp.length = src.length + dest.length;
        ptr = temp.ptr;
        while (iter != end && dest_iter != dest_end)
        {
            if (*iter < *dest_iter)
            {
                *ptr++ = *iter++;
            }
            else
            {
                *ptr++ = *dest_iter++;
            }
        }
        while (iter != end)
        {
            *ptr++ = *iter++;
        }
        while (dest_iter != dest_end)
        {
            *ptr++ = *dest_iter++;
        }
        dest = temp;
    }
};
}
int main(char[][]argv)
{
    regex!(char).BasicStringToken lhs;
    regex!(char).BasicStringToken rhs;
    regex!(char).BasicStringToken intersect;
    lhs.charset = "aaabbc".dup;
    lhs.negated = true;
    lhs.removeDuplicates();
    rhs.charset = "bccddd".dup;
    rhs.negated = true;
    rhs.removeDuplicates();
    writeln(lhs.charset, '(', lhs.negated, ") intersect ",
        rhs.charset, '(', rhs.negated, ") =");
    lhs.intersect(rhs, intersect);
    writeln(lhs.charset, '(', lhs.negated, "), ",
        rhs.charset, '(', rhs.negated, "), ",
        intersect.charset, '(', intersect.negated, ')');
    return 0;
}
    
    
More information about the Digitalmars-d
mailing list