Latest string_token Code

Ben Hanson Ben.Hanson at tfbplc.co.uk
Tue Jun 22 03:07:37 PDT 2010


Hi there,

I've basically got the string_token class working now. Thanks to everyone who
helped. It still needs some work as memmove works with bytes so I need the
equivalent of 'sizeof' in D for this.

'squeeze' doesn't work with wide chars, so I will write my own version.

When I shrink or grow char arrays, I'd like to know if I should re-set my
pointers into them accordingly.

If anyone can point out any other obvious issues, bad style etc. that would be
appreciated. Please bear in mind that I'd like the code to be as fast as possible.

Here's the source:

Regards,

Ben

module main;

import std.algorithm;
import std.array;
import std.c.string;
import std.string;

import std.stdio;

template regex(CharT)
{
struct basic_string_token
{
	bool _negated = false;
	CharT[] _charset;
	enum size_t MAX_CHARS = CharT.max + 1;
	enum size_t START_CHAR = cast(CharT) 0x80 < 0 ? 0x80 : 0;

	this(const bool negated_, ref CharT[] charset_)
	{
		_negated = negated_;
		_charset = charset_;
	}

	void remove_duplicates()
	{
		_charset.sort;
		_charset = squeeze(_charset.idup).dup;
	}

	void normalise()
	{
		if (_charset.length == MAX_CHARS)
		{
			_negated = !_negated;
			_charset.clear();
		}
		else if (_charset.length > MAX_CHARS / 2)
		{
			negate();
		}
	}

	void negate()
	{
		CharT curr_char_ = START_CHAR;
		CharT[] temp_;
		CharT *ptr_;
		CharT *curr_ = _charset.ptr;
		CharT *end_ = curr_ + _charset.length;
		size_t i_ = 0;

		_negated = !_negated;
		temp_.length = MAX_CHARS - _charset.length;
		ptr_ = temp_.ptr;

		while (curr_ < end_)
		{
			while (*curr_ > curr_char_)
			{
				*ptr_ = curr_char_;
				++ptr_;
				++curr_char_;
				++i_;
			}

			++curr_char_;
			++curr_;
			++i_;
		}

		for (; i_ < MAX_CHARS; ++i_)
		{
			*ptr_ = curr_char_;
			++ptr_;
			++curr_char_;
		}

		_charset = temp_;
	}

	bool empty()
	{
		return _charset.length == 0 && !_negated;
	}

	bool any()
	{
		return _charset.length == 0 && _negated;
	}

	void clear()
	{
		_negated = false;
		_charset.length = 0;
	}

	void intersect(ref basic_string_token rhs_,
		ref basic_string_token overlap_)
	{
		if ((any() && rhs_.any()) || (_negated == rhs_._negated &&
			!any() && !rhs_.any()))
		{
			intersect_same_types(rhs_, overlap_);
		}
		else
		{
			intersect_diff_types(rhs_, overlap_);
		}
	}

private:
	void intersect_same_types(ref basic_string_token rhs_,
		ref basic_string_token overlap_)
	{
		if (any())
		{
			clear();
			overlap_._negated = true;
			rhs_.clear();
		}
		else
		{
			CharT *iter_ = _charset.ptr;
			CharT *end_ = iter_ + _charset.length;
			CharT *rhs_iter_ = rhs_._charset.ptr;
			CharT *rhs_end_ = rhs_iter_ + rhs_._charset.length;

			overlap_._negated = _negated;

			while (iter_ != end_ && rhs_iter_ != rhs_end_)
			{
				if (*iter_ < *rhs_iter_)
				{
					++iter_;
				}
				else if (*iter_ > *rhs_iter_)
				{
					++rhs_iter_;
				}
				else
				{
					overlap_._charset ~= *iter_;
					memmove(iter_, iter_ + 1, _charset.ptr + _charset.length - iter_);
					--end_;
					_charset.length -= 1;
					memmove(rhs_iter_, rhs_iter_ + 1, rhs_._charset.ptr +
						rhs_._charset.length - rhs_iter_);
					--rhs_end_;
					rhs_._charset.length -= 1;
				}
			}

			if (_negated)
			{
				// duplicates already merged
				// src, dest
				merge(_charset, overlap_._charset);
				// duplicates already merged
				// src, dest
				merge(rhs_._charset, overlap_._charset);
				_negated = false;
				rhs_._negated = false;
				swap(_charset, rhs_._charset);
				normalise();
				overlap_.normalise();
				rhs_.normalise();
			}
			else if (!overlap_._charset.length == 0)
			{
				normalise();
				overlap_.normalise();
				rhs_.normalise();
			}
		}
	}

	void intersect_diff_types(ref basic_string_token rhs_,
		ref basic_string_token overlap_)
	{
		if (any())
		{
			intersect_any(rhs_, overlap_);
		}
		else if (_negated)
		{
			intersect_negated(rhs_, overlap_);
		}
		else // _negated == false
		{
			intersect_charset(rhs_, overlap_);
		}
	}

	void intersect_any(ref basic_string_token rhs_, ref basic_string_token overlap_)
	{
		if (rhs_._negated)
		{
			rhs_.intersect_negated(this, overlap_);
		}
		else // rhs._negated == false
		{
			rhs_.intersect_charset(this, overlap_);
		}
	}

	void intersect_negated(ref basic_string_token rhs_,
		ref basic_string_token overlap_)
	{
		if (rhs_.any())
		{
			overlap_._negated = true;
			overlap_._charset = _charset;
			rhs_._negated = false;
			rhs_._charset = _charset;
			clear();
		}
		else // rhs._negated == false
		{
			rhs_.intersect_charset(this, overlap_);
		}
	}

	void intersect_charset(ref basic_string_token rhs_,
		ref basic_string_token overlap_)
	{
		if (rhs_.any())
		{
			overlap_._charset = _charset;
			rhs_._negated = true;
			rhs_._charset = _charset;
			clear();
		}
		else // rhs_._negated == true
		{
			CharT *iter_ = _charset.ptr;
			CharT *end_ = iter_ + _charset.length;
			CharT *rhs_iter_ = rhs_._charset.ptr;
			CharT *rhs_end_ = rhs_iter_ + rhs_._charset.length;

			while (iter_ != end_ && rhs_iter_ != rhs_end_)
			{
				if (*iter_ < *rhs_iter_)
				{
					overlap_._charset ~= *iter_;
					rhs_._charset.length += 1;
					rhs_iter_ = rhs_._charset.ptr;
					rhs_end_ = rhs_iter_ + rhs_._charset.length;
					memmove(rhs_iter_ + 1, rhs_iter_, rhs_._charset.length -
						(rhs_end_ - rhs_iter_ - 1));
					++rhs_iter_;
					memmove(iter_, iter_ + 1, _charset.ptr + _charset.length - iter_);
					_charset.length -= 1;
					--end_;
				}
				else if (*iter_ > *rhs_iter_)
				{
					++rhs_iter_;
				}
				else
				{
					++iter_;
					++rhs_iter_;
				}
			}

			if (iter_ != end_)
			{
				CharT[] temp_;

				temp_.length = end_ - iter_;
				memmove(temp_.ptr, iter_, temp_.length);

				// nothing bigger in rhs_ than iter_
				// src, dest
				merge(temp_, overlap_._charset);
				memmove(iter_, iter_ + 1, _charset.ptr + _charset.length - iter_);
				_charset.length -= 1;
			}

			if (!overlap_._charset.empty())
			{
				merge(overlap_._charset, rhs_._charset);
				// possible duplicates, so check for any and erase.
				rhs_._charset = squeeze(rhs_._charset.idup).dup;
				normalise();
				overlap_.normalise();
				rhs_.normalise();
			}
		}
	}

	void merge(ref CharT[] src_, ref CharT[] dest_)
	{
		CharT[] temp_;
		CharT *ptr_;
		CharT *iter_ = src_.ptr;
		CharT *end_ = iter_ + src_.length;
		CharT *dest_iter_ = dest_.ptr;
		CharT *dest_end_ = dest_iter_ + dest_.length;

		temp_.length = src_.length + dest_.length;
		ptr_ = temp_.ptr;

		while (iter_ != end_ && dest_iter_ != dest_end_)
		{
			if (*iter_ < *dest_iter_)
			{
				*ptr_++ = *iter_++;
			}
			else
			{
				*ptr_++ = *dest_iter_++;
			}
		}

		while (iter_ != end_)
		{
			*ptr_++ = *iter_++;
		}

		while (dest_iter_ != dest_end_)
		{
			*ptr_++ = *dest_iter_++;
		}

		dest_ = temp_;
	}
};
}

int main(char[][]argv)
{
	regex!(char).basic_string_token lhs_;
	regex!(char).basic_string_token rhs_;
	regex!(char).basic_string_token intersect_;

	lhs_._charset = "abc".dup;
	lhs_._negated = true;
	rhs_._charset = "bcd".dup;
	rhs_._negated = true;
	writeln(lhs_._charset, '(', lhs_._negated, ") intersect ",
		rhs_._charset, '(', rhs_._negated, ") =");
	lhs_.intersect(rhs_, intersect_);
	writeln(lhs_._charset, '(', lhs_._negated, "), ",
		rhs_._charset, '(', rhs_._negated, "), ",
		intersect_._charset, '(', intersect_._negated, ')');
	return 0;
}


More information about the Digitalmars-d mailing list