Latest string_token Code
Rory McGuire
rmcguire at neonova.co.za
Tue Jun 22 03:29:01 PDT 2010
I think sizeof is a property, e.g. char.sizeof or:
struct A {
int a;
char[10] b;
}
A.sizeof
A a;
a.sizeof
you get the picture. (Have I got it?)
-Rory
On Tue, 22 Jun 2010 12:07:37 +0200, Ben Hanson <Ben.Hanson at tfbplc.co.uk>
wrote:
> Hi there,
>
> I've basically got the string_token class working now. Thanks to
> everyone who
> helped. It still needs some work as memmove works with bytes so I need
> the
> equivalent of 'sizeof' in D for this.
>
> 'squeeze' doesn't work with wide chars, so I will write my own version.
>
> When I shrink or grow char arrays, I'd like to know if I should re-set my
> pointers into them accordingly.
>
> If anyone can point out any other obvious issues, bad style etc. that
> would be
> appreciated. Please bear in mind that I'd like the code to be as fast as
> possible.
>
> Here's the source:
>
> Regards,
>
> Ben
>
> module main;
>
> import std.algorithm;
> import std.array;
> import std.c.string;
> import std.string;
>
> import std.stdio;
>
> template regex(CharT)
> {
> struct basic_string_token
> {
> bool _negated = false;
> CharT[] _charset;
> enum size_t MAX_CHARS = CharT.max + 1;
> enum size_t START_CHAR = cast(CharT) 0x80 < 0 ? 0x80 : 0;
>
> this(const bool negated_, ref CharT[] charset_)
> {
> _negated = negated_;
> _charset = charset_;
> }
>
> void remove_duplicates()
> {
> _charset.sort;
> _charset = squeeze(_charset.idup).dup;
> }
>
> void normalise()
> {
> if (_charset.length == MAX_CHARS)
> {
> _negated = !_negated;
> _charset.clear();
> }
> else if (_charset.length > MAX_CHARS / 2)
> {
> negate();
> }
> }
>
> void negate()
> {
> CharT curr_char_ = START_CHAR;
> CharT[] temp_;
> CharT *ptr_;
> CharT *curr_ = _charset.ptr;
> CharT *end_ = curr_ + _charset.length;
> size_t i_ = 0;
>
> _negated = !_negated;
> temp_.length = MAX_CHARS - _charset.length;
> ptr_ = temp_.ptr;
>
> while (curr_ < end_)
> {
> while (*curr_ > curr_char_)
> {
> *ptr_ = curr_char_;
> ++ptr_;
> ++curr_char_;
> ++i_;
> }
>
> ++curr_char_;
> ++curr_;
> ++i_;
> }
>
> for (; i_ < MAX_CHARS; ++i_)
> {
> *ptr_ = curr_char_;
> ++ptr_;
> ++curr_char_;
> }
>
> _charset = temp_;
> }
>
> bool empty()
> {
> return _charset.length == 0 && !_negated;
> }
>
> bool any()
> {
> return _charset.length == 0 && _negated;
> }
>
> void clear()
> {
> _negated = false;
> _charset.length = 0;
> }
>
> void intersect(ref basic_string_token rhs_,
> ref basic_string_token overlap_)
> {
> if ((any() && rhs_.any()) || (_negated == rhs_._negated &&
> !any() && !rhs_.any()))
> {
> intersect_same_types(rhs_, overlap_);
> }
> else
> {
> intersect_diff_types(rhs_, overlap_);
> }
> }
>
> private:
> void intersect_same_types(ref basic_string_token rhs_,
> ref basic_string_token overlap_)
> {
> if (any())
> {
> clear();
> overlap_._negated = true;
> rhs_.clear();
> }
> else
> {
> CharT *iter_ = _charset.ptr;
> CharT *end_ = iter_ + _charset.length;
> CharT *rhs_iter_ = rhs_._charset.ptr;
> CharT *rhs_end_ = rhs_iter_ + rhs_._charset.length;
>
> overlap_._negated = _negated;
>
> while (iter_ != end_ && rhs_iter_ != rhs_end_)
> {
> if (*iter_ < *rhs_iter_)
> {
> ++iter_;
> }
> else if (*iter_ > *rhs_iter_)
> {
> ++rhs_iter_;
> }
> else
> {
> overlap_._charset ~= *iter_;
> memmove(iter_, iter_ + 1, _charset.ptr + _charset.length - iter_);
> --end_;
> _charset.length -= 1;
> memmove(rhs_iter_, rhs_iter_ + 1, rhs_._charset.ptr +
> rhs_._charset.length - rhs_iter_);
> --rhs_end_;
> rhs_._charset.length -= 1;
> }
> }
>
> if (_negated)
> {
> // duplicates already merged
> // src, dest
> merge(_charset, overlap_._charset);
> // duplicates already merged
> // src, dest
> merge(rhs_._charset, overlap_._charset);
> _negated = false;
> rhs_._negated = false;
> swap(_charset, rhs_._charset);
> normalise();
> overlap_.normalise();
> rhs_.normalise();
> }
> else if (!overlap_._charset.length == 0)
> {
> normalise();
> overlap_.normalise();
> rhs_.normalise();
> }
> }
> }
>
> void intersect_diff_types(ref basic_string_token rhs_,
> ref basic_string_token overlap_)
> {
> if (any())
> {
> intersect_any(rhs_, overlap_);
> }
> else if (_negated)
> {
> intersect_negated(rhs_, overlap_);
> }
> else // _negated == false
> {
> intersect_charset(rhs_, overlap_);
> }
> }
>
> void intersect_any(ref basic_string_token rhs_, ref basic_string_token
> overlap_)
> {
> if (rhs_._negated)
> {
> rhs_.intersect_negated(this, overlap_);
> }
> else // rhs._negated == false
> {
> rhs_.intersect_charset(this, overlap_);
> }
> }
>
> void intersect_negated(ref basic_string_token rhs_,
> ref basic_string_token overlap_)
> {
> if (rhs_.any())
> {
> overlap_._negated = true;
> overlap_._charset = _charset;
> rhs_._negated = false;
> rhs_._charset = _charset;
> clear();
> }
> else // rhs._negated == false
> {
> rhs_.intersect_charset(this, overlap_);
> }
> }
>
> void intersect_charset(ref basic_string_token rhs_,
> ref basic_string_token overlap_)
> {
> if (rhs_.any())
> {
> overlap_._charset = _charset;
> rhs_._negated = true;
> rhs_._charset = _charset;
> clear();
> }
> else // rhs_._negated == true
> {
> CharT *iter_ = _charset.ptr;
> CharT *end_ = iter_ + _charset.length;
> CharT *rhs_iter_ = rhs_._charset.ptr;
> CharT *rhs_end_ = rhs_iter_ + rhs_._charset.length;
>
> while (iter_ != end_ && rhs_iter_ != rhs_end_)
> {
> if (*iter_ < *rhs_iter_)
> {
> overlap_._charset ~= *iter_;
> rhs_._charset.length += 1;
> rhs_iter_ = rhs_._charset.ptr;
> rhs_end_ = rhs_iter_ + rhs_._charset.length;
> memmove(rhs_iter_ + 1, rhs_iter_, rhs_._charset.length -
> (rhs_end_ - rhs_iter_ - 1));
> ++rhs_iter_;
> memmove(iter_, iter_ + 1, _charset.ptr + _charset.length - iter_);
> _charset.length -= 1;
> --end_;
> }
> else if (*iter_ > *rhs_iter_)
> {
> ++rhs_iter_;
> }
> else
> {
> ++iter_;
> ++rhs_iter_;
> }
> }
>
> if (iter_ != end_)
> {
> CharT[] temp_;
>
> temp_.length = end_ - iter_;
> memmove(temp_.ptr, iter_, temp_.length);
>
> // nothing bigger in rhs_ than iter_
> // src, dest
> merge(temp_, overlap_._charset);
> memmove(iter_, iter_ + 1, _charset.ptr + _charset.length - iter_);
> _charset.length -= 1;
> }
>
> if (!overlap_._charset.empty())
> {
> merge(overlap_._charset, rhs_._charset);
> // possible duplicates, so check for any and erase.
> rhs_._charset = squeeze(rhs_._charset.idup).dup;
> normalise();
> overlap_.normalise();
> rhs_.normalise();
> }
> }
> }
>
> void merge(ref CharT[] src_, ref CharT[] dest_)
> {
> CharT[] temp_;
> CharT *ptr_;
> CharT *iter_ = src_.ptr;
> CharT *end_ = iter_ + src_.length;
> CharT *dest_iter_ = dest_.ptr;
> CharT *dest_end_ = dest_iter_ + dest_.length;
>
> temp_.length = src_.length + dest_.length;
> ptr_ = temp_.ptr;
>
> while (iter_ != end_ && dest_iter_ != dest_end_)
> {
> if (*iter_ < *dest_iter_)
> {
> *ptr_++ = *iter_++;
> }
> else
> {
> *ptr_++ = *dest_iter_++;
> }
> }
>
> while (iter_ != end_)
> {
> *ptr_++ = *iter_++;
> }
>
> while (dest_iter_ != dest_end_)
> {
> *ptr_++ = *dest_iter_++;
> }
>
> dest_ = temp_;
> }
> };
> }
>
> int main(char[][]argv)
> {
> regex!(char).basic_string_token lhs_;
> regex!(char).basic_string_token rhs_;
> regex!(char).basic_string_token intersect_;
>
> lhs_._charset = "abc".dup;
> lhs_._negated = true;
> rhs_._charset = "bcd".dup;
> rhs_._negated = true;
> writeln(lhs_._charset, '(', lhs_._negated, ") intersect ",
> rhs_._charset, '(', rhs_._negated, ") =");
> lhs_.intersect(rhs_, intersect_);
> writeln(lhs_._charset, '(', lhs_._negated, "), ",
> rhs_._charset, '(', rhs_._negated, "), ",
> intersect_._charset, '(', intersect_._negated, ')');
> return 0;
> }
More information about the Digitalmars-d
mailing list