Latest string_token Code

Rory McGuire rmcguire at neonova.co.za
Tue Jun 22 03:29:01 PDT 2010


I think sizeof is a property, e.g. char.sizeof or:

struct A {
	int a;
	char[10] b;
}


A.sizeof

A a;
a.sizeof

you get the picture. (Have I got it?)

-Rory



On Tue, 22 Jun 2010 12:07:37 +0200, Ben Hanson <Ben.Hanson at tfbplc.co.uk>  
wrote:

> Hi there,
>
> I've basically got the string_token class working now. Thanks to  
> everyone who
> helped. It still needs some work as memmove works with bytes so I need  
> the
> equivalent of 'sizeof' in D for this.
>
> 'squeeze' doesn't work with wide chars, so I will write my own version.
>
> When I shrink or grow char arrays, I'd like to know if I should re-set my
> pointers into them accordingly.
>
> If anyone can point out any other obvious issues, bad style etc. that  
> would be
> appreciated. Please bear in mind that I'd like the code to be as fast as  
> possible.
>
> Here's the source:
>
> Regards,
>
> Ben
>
> module main;
>
> import std.algorithm;
> import std.array;
> import std.c.string;
> import std.string;
>
> import std.stdio;
>
> template regex(CharT)
> {
> struct basic_string_token
> {
> 	bool _negated = false;
> 	CharT[] _charset;
> 	enum size_t MAX_CHARS = CharT.max + 1;
> 	enum size_t START_CHAR = cast(CharT) 0x80 < 0 ? 0x80 : 0;
>
> 	this(const bool negated_, ref CharT[] charset_)
> 	{
> 		_negated = negated_;
> 		_charset = charset_;
> 	}
>
> 	void remove_duplicates()
> 	{
> 		_charset.sort;
> 		_charset = squeeze(_charset.idup).dup;
> 	}
>
> 	void normalise()
> 	{
> 		if (_charset.length == MAX_CHARS)
> 		{
> 			_negated = !_negated;
> 			_charset.clear();
> 		}
> 		else if (_charset.length > MAX_CHARS / 2)
> 		{
> 			negate();
> 		}
> 	}
>
> 	void negate()
> 	{
> 		CharT curr_char_ = START_CHAR;
> 		CharT[] temp_;
> 		CharT *ptr_;
> 		CharT *curr_ = _charset.ptr;
> 		CharT *end_ = curr_ + _charset.length;
> 		size_t i_ = 0;
>
> 		_negated = !_negated;
> 		temp_.length = MAX_CHARS - _charset.length;
> 		ptr_ = temp_.ptr;
>
> 		while (curr_ < end_)
> 		{
> 			while (*curr_ > curr_char_)
> 			{
> 				*ptr_ = curr_char_;
> 				++ptr_;
> 				++curr_char_;
> 				++i_;
> 			}
>
> 			++curr_char_;
> 			++curr_;
> 			++i_;
> 		}
>
> 		for (; i_ < MAX_CHARS; ++i_)
> 		{
> 			*ptr_ = curr_char_;
> 			++ptr_;
> 			++curr_char_;
> 		}
>
> 		_charset = temp_;
> 	}
>
> 	bool empty()
> 	{
> 		return _charset.length == 0 && !_negated;
> 	}
>
> 	bool any()
> 	{
> 		return _charset.length == 0 && _negated;
> 	}
>
> 	void clear()
> 	{
> 		_negated = false;
> 		_charset.length = 0;
> 	}
>
> 	void intersect(ref basic_string_token rhs_,
> 		ref basic_string_token overlap_)
> 	{
> 		if ((any() && rhs_.any()) || (_negated == rhs_._negated &&
> 			!any() && !rhs_.any()))
> 		{
> 			intersect_same_types(rhs_, overlap_);
> 		}
> 		else
> 		{
> 			intersect_diff_types(rhs_, overlap_);
> 		}
> 	}
>
> private:
> 	void intersect_same_types(ref basic_string_token rhs_,
> 		ref basic_string_token overlap_)
> 	{
> 		if (any())
> 		{
> 			clear();
> 			overlap_._negated = true;
> 			rhs_.clear();
> 		}
> 		else
> 		{
> 			CharT *iter_ = _charset.ptr;
> 			CharT *end_ = iter_ + _charset.length;
> 			CharT *rhs_iter_ = rhs_._charset.ptr;
> 			CharT *rhs_end_ = rhs_iter_ + rhs_._charset.length;
>
> 			overlap_._negated = _negated;
>
> 			while (iter_ != end_ && rhs_iter_ != rhs_end_)
> 			{
> 				if (*iter_ < *rhs_iter_)
> 				{
> 					++iter_;
> 				}
> 				else if (*iter_ > *rhs_iter_)
> 				{
> 					++rhs_iter_;
> 				}
> 				else
> 				{
> 					overlap_._charset ~= *iter_;
> 					memmove(iter_, iter_ + 1, _charset.ptr + _charset.length - iter_);
> 					--end_;
> 					_charset.length -= 1;
> 					memmove(rhs_iter_, rhs_iter_ + 1, rhs_._charset.ptr +
> 						rhs_._charset.length - rhs_iter_);
> 					--rhs_end_;
> 					rhs_._charset.length -= 1;
> 				}
> 			}
>
> 			if (_negated)
> 			{
> 				// duplicates already merged
> 				// src, dest
> 				merge(_charset, overlap_._charset);
> 				// duplicates already merged
> 				// src, dest
> 				merge(rhs_._charset, overlap_._charset);
> 				_negated = false;
> 				rhs_._negated = false;
> 				swap(_charset, rhs_._charset);
> 				normalise();
> 				overlap_.normalise();
> 				rhs_.normalise();
> 			}
> 			else if (!overlap_._charset.length == 0)
> 			{
> 				normalise();
> 				overlap_.normalise();
> 				rhs_.normalise();
> 			}
> 		}
> 	}
>
> 	void intersect_diff_types(ref basic_string_token rhs_,
> 		ref basic_string_token overlap_)
> 	{
> 		if (any())
> 		{
> 			intersect_any(rhs_, overlap_);
> 		}
> 		else if (_negated)
> 		{
> 			intersect_negated(rhs_, overlap_);
> 		}
> 		else // _negated == false
> 		{
> 			intersect_charset(rhs_, overlap_);
> 		}
> 	}
>
> 	void intersect_any(ref basic_string_token rhs_, ref basic_string_token  
> overlap_)
> 	{
> 		if (rhs_._negated)
> 		{
> 			rhs_.intersect_negated(this, overlap_);
> 		}
> 		else // rhs._negated == false
> 		{
> 			rhs_.intersect_charset(this, overlap_);
> 		}
> 	}
>
> 	void intersect_negated(ref basic_string_token rhs_,
> 		ref basic_string_token overlap_)
> 	{
> 		if (rhs_.any())
> 		{
> 			overlap_._negated = true;
> 			overlap_._charset = _charset;
> 			rhs_._negated = false;
> 			rhs_._charset = _charset;
> 			clear();
> 		}
> 		else // rhs._negated == false
> 		{
> 			rhs_.intersect_charset(this, overlap_);
> 		}
> 	}
>
> 	void intersect_charset(ref basic_string_token rhs_,
> 		ref basic_string_token overlap_)
> 	{
> 		if (rhs_.any())
> 		{
> 			overlap_._charset = _charset;
> 			rhs_._negated = true;
> 			rhs_._charset = _charset;
> 			clear();
> 		}
> 		else // rhs_._negated == true
> 		{
> 			CharT *iter_ = _charset.ptr;
> 			CharT *end_ = iter_ + _charset.length;
> 			CharT *rhs_iter_ = rhs_._charset.ptr;
> 			CharT *rhs_end_ = rhs_iter_ + rhs_._charset.length;
>
> 			while (iter_ != end_ && rhs_iter_ != rhs_end_)
> 			{
> 				if (*iter_ < *rhs_iter_)
> 				{
> 					overlap_._charset ~= *iter_;
> 					rhs_._charset.length += 1;
> 					rhs_iter_ = rhs_._charset.ptr;
> 					rhs_end_ = rhs_iter_ + rhs_._charset.length;
> 					memmove(rhs_iter_ + 1, rhs_iter_, rhs_._charset.length -
> 						(rhs_end_ - rhs_iter_ - 1));
> 					++rhs_iter_;
> 					memmove(iter_, iter_ + 1, _charset.ptr + _charset.length - iter_);
> 					_charset.length -= 1;
> 					--end_;
> 				}
> 				else if (*iter_ > *rhs_iter_)
> 				{
> 					++rhs_iter_;
> 				}
> 				else
> 				{
> 					++iter_;
> 					++rhs_iter_;
> 				}
> 			}
>
> 			if (iter_ != end_)
> 			{
> 				CharT[] temp_;
>
> 				temp_.length = end_ - iter_;
> 				memmove(temp_.ptr, iter_, temp_.length);
>
> 				// nothing bigger in rhs_ than iter_
> 				// src, dest
> 				merge(temp_, overlap_._charset);
> 				memmove(iter_, iter_ + 1, _charset.ptr + _charset.length - iter_);
> 				_charset.length -= 1;
> 			}
>
> 			if (!overlap_._charset.empty())
> 			{
> 				merge(overlap_._charset, rhs_._charset);
> 				// possible duplicates, so check for any and erase.
> 				rhs_._charset = squeeze(rhs_._charset.idup).dup;
> 				normalise();
> 				overlap_.normalise();
> 				rhs_.normalise();
> 			}
> 		}
> 	}
>
> 	void merge(ref CharT[] src_, ref CharT[] dest_)
> 	{
> 		CharT[] temp_;
> 		CharT *ptr_;
> 		CharT *iter_ = src_.ptr;
> 		CharT *end_ = iter_ + src_.length;
> 		CharT *dest_iter_ = dest_.ptr;
> 		CharT *dest_end_ = dest_iter_ + dest_.length;
>
> 		temp_.length = src_.length + dest_.length;
> 		ptr_ = temp_.ptr;
>
> 		while (iter_ != end_ && dest_iter_ != dest_end_)
> 		{
> 			if (*iter_ < *dest_iter_)
> 			{
> 				*ptr_++ = *iter_++;
> 			}
> 			else
> 			{
> 				*ptr_++ = *dest_iter_++;
> 			}
> 		}
>
> 		while (iter_ != end_)
> 		{
> 			*ptr_++ = *iter_++;
> 		}
>
> 		while (dest_iter_ != dest_end_)
> 		{
> 			*ptr_++ = *dest_iter_++;
> 		}
>
> 		dest_ = temp_;
> 	}
> };
> }
>
> int main(char[][]argv)
> {
> 	regex!(char).basic_string_token lhs_;
> 	regex!(char).basic_string_token rhs_;
> 	regex!(char).basic_string_token intersect_;
>
> 	lhs_._charset = "abc".dup;
> 	lhs_._negated = true;
> 	rhs_._charset = "bcd".dup;
> 	rhs_._negated = true;
> 	writeln(lhs_._charset, '(', lhs_._negated, ") intersect ",
> 		rhs_._charset, '(', rhs_._negated, ") =");
> 	lhs_.intersect(rhs_, intersect_);
> 	writeln(lhs_._charset, '(', lhs_._negated, "), ",
> 		rhs_._charset, '(', rhs_._negated, "), ",
> 		intersect_._charset, '(', intersect_._negated, ')');
> 	return 0;
> }


More information about the Digitalmars-d mailing list