[review] new string type (take 2)
Steven Schveighoffer
schveiguy at yahoo.com
Thu Jan 13 08:24:37 PST 2011
Based on the suggestions of others, I have updated my string type.
changes:
* opApply for iterating with foreach w/ index
* indexing the string at an invalid location (i.e. not the start of a code
point) throws a RangeError (does that make sense, or should it be an
exception?).
* charStart is now public so you can use it to ensure you are accessing
the start of a code point
* validIdx new function that tells you if your index is at the start of a
code point.
* data property which gets the underlying T[]
* Added free functions for string_t!dchar to make it have the same
properties as the other string types.
* Added an ability to assign to a T[] array for ease of use.
* Added a ptr property so it works seamlessly with code that currently
uses strings (but we still need $, however it appears this isn't
implemented yet in dmd).
* fully documented
Here it is:
// Written in the D programming language.
/**
Copyright: Copyright Andrei Alexandrescu and Steven Schveighoffer 2008-2010
License: <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License
1.0</a>.
Authors: $(WEB erdani.org, Andrei Alexandrescu, Steven Schveighoffer)
Copyright Andrei Alexandrescu and Steven Schveighoffer 2008-2010.
Distributed under the Boost Software License, Version 1.0.
(See accompanying file LICENSE_1_0.txt or copy at
http://www.boost.org/LICENSE_1_0.txt)
*/
import std.utf;
import std.traits;
import core.exception;
// BUG
version = norangeopapplyoverload;
/**
* Narrow string type. This string implements utf-8 or utf-16 depending
on the
* character type.
*
* The string type is a bi-directional range of dchars, with a monotonic
* indexing scheme. Essentially, because dchars are encoded at variable
* widths, indexing and slicing based on dchars would be an O(n) operation.
* Therefore, we allow indexing and slicing, but based on the 'code-unit'
or
* unit of encoding (wchar or char). This means some indexes are valid and
* some are not (those which do not point to the start of an encoded dchar
are
* not).
*
* While this might seem confusing, it is very rare that one needs
arbitrary
* index access. In order to achieve this you can either ensure to
yourself
* that the string's code-unit to dchar ratio is 1 to 1 (never more than
one
* code-unit to encode a dchar), or you can use the charStart funciton to
* ensure a valid index.
*/
struct string_t(T) if (is(Unqual!T == char) || is(Unqual!T == wchar))
{
private T[] _data;
/**
* Constructor, builds a string based on given array data.
*/
this(T[] data)
{
this._data = data;
}
/**
* Provide access to underlying array data.
*/
@property T[] data()
{
return _data;
}
/**
* forward access to the ptr part of the array. This allows the most
* efficient operations when one knows what he is doing.
*/
@property T* ptr()
{
return _data.ptr;
}
/**
* Finds the largest valid index in the string that is <= idx.
* Essentially, this can be used to convert arbitrary indexes into
valid
* indexes.
*/
size_t charStart(size_t idx) const
{
static if(is(Unqual!T == wchar))
{
immutable c = _data.ptr[idx];
if(c >= 0xDC00 && c <= 0xDFFF)
{
// surrogate pair detected, verify we have at least 2
wchars,
// and that both wchars are properly encoded.
assert(idx > 0, "Invalid UTF character at beginning of
string");
return idx-1;
}
else
return idx;
}
else
{
const p = _data.ptr + idx;
if ((p[0] & 0b1100_0000) != 0b1000_0000)
{
return idx;
}
else if (idx >= 1 && (p[-1] & 0b1100_0000) != 0b1000_0000)
{
return idx - 1;
}
else if (idx >= 2 && (p[-2] & 0b1100_0000) != 0b1000_0000)
{
return idx - 2;
}
else if (idx >= 3 && (p[-3] & 0b1100_0000) != 0b1000_0000)
{
return idx - 3;
}
else
{
assert(false, "Invalid UTF character in string");
}
}
}
/**
* Returns true if the given index starts an encoded dchar.
*/
bool validIdx(size_t idx)
{
if(idx >= _data.length)
{
if(idx is _data.length)
return true; // index one beyond the string is valid.
return false;
}
immutable c = _data[idx];
static if(is(Unqual!T == wchar))
{
// make sure this isn't the second character of a surrogate
pair
return (c < 0xDC00 || c > 0xDFFF);
}
else // char
{
return ((c & 0b1100_0000) != 0b1000_0000);
}
}
/**
* remove the first code-point from the string.
*/
void popFront()
{
auto nc = std.utf.stride(_data, 0);
assert(nc <= _data.length && nc != 0xFF, "Invalid sequence at
beginning of string");
_data = _data[nc .. $];
}
/**
* Remove the last code-point from the string.
*/
void popBack()
{
immutable n = _data.length;
assert(n, "Attempting to pop back of an empty string");
_data = _data.ptr[0..charStart(n-1)];
}
/**
* Get the first code-point in the string
*/
@property dchar front() const
{
assert(_data.length, "Attempting to fetch the front of an empty
string");
size_t i = 0;
return decode(_data, i);
}
/**
* Get the last code-point in the string
*/
@property dchar back() const
{
immutable n = _data.length;
assert(n, "Attempting to fetch the back of an empty string");
auto idx = charStart(n-1);
return std.utf.decode(_data, idx);
}
/**
* Does the string contain any data?
*/
@property bool empty() const
{
return !_data.length;
}
/**
* Copy the string (trivial function, needed for range definitions)
*/
@property typeof(this) save()
{
return this;
}
/**
* support read-only random access via code unit index.
*
* Note that an invalid idx (one that does not start a code point)
results
* in an exception
*/
dchar opIndex(size_t idx)
{
if(idx is _data.length || !validIdx(idx))
throw new RangeError(__FILE__, __LINE__);
return std.utf.decode(_data, idx);
}
/**
* slice the whole string
*/
string_t opSlice()
{
return this;
}
/**
* Slice based on valid start and end indexes.
*
* Throws RangeError if start or end are not valid indexes
*/
string_t opSlice(size_t start, size_t end)
{
if(end < start || !validIdx(start) || !validIdx(end))
throw new RangeError(__FILE__, __LINE__);
return string_t(_data[start..end]);
}
/**
* Get the number of code units in the string.
*
* Note that this is specifically not called length because length
* generally implies the number of elements in a range. Since dchar
is our
* element type, and the number of dchars cannot be determined in O(1)
* time, using the name length would be incorrect.
*/
@property size_t codeUnits() const
{
return _data.length;
}
/**
* Append a string to this string.
*
* TODO: support appending any string type to this string type.
*/
ref string_t opOpAssign(string op, U)(U data) if (op == "~" && is(U ==
string_t))
{
_data ~= data._data;
return this;
}
/**
* Support appending any types that the underlying array can support.
*/
ref string_t opOpAssign(string op, U)(U data) if (op == "~" && !is(U
== string_t) && is(typeof(_data ~= U.init)))
{
_data ~= data;
return this;
}
/**
* Concatenation between two strings
*/
string_t opBinary(string op, U)(U data) if (op == "~" && is(U ==
string_t))
{
return string_t(_data ~ data._data);
}
/**
* Support any concatenation that is supported by the underlying data
* array.
*/
string_t opBinary(string op, U)(U data) if (op == "~" && !is(U ==
string_t) && is(typeof(_data ~ U.init)))
{
return string_t(_data ~ data);
}
// note, this should not be required, it should use the range
interface but
// the compiler doesn't allow both to coexist for foreach.
version(norangeopapplyoverload)
{
/**
* Foreach with just dchars. Note, you cannot actually change the
* data, despite the argument being ref. opApply requires ref.
*/
int opApply(scope int delegate(ref dchar d) dg)
{
dchar d;
size_t idx = 0;
immutable len = _data.length;
int result = 0;
while(result == 0 && idx < len)
{
d = std.utf.decode(_data, idx);
result = dg(d);
}
return result;
}
}
/**
* Foreach over the string with accompanied index.
*
* Note, the refs are required for foreach, you cannot change the
index or
* the data in the string.
*/
int opApply(scope int delegate(ref size_t idx, ref dchar d) dg)
{
dchar d;
size_t idx = 0;
immutable len = _data.length;
int result = 0;
while(result == 0 && idx < len)
{
size_t tmpidx = idx;
d = std.utf.decode(_data, idx);
result = dg(tmpidx, d);
}
return result;
}
/**
* Assign a string
*/
string_t opAssign(U)(U u) if(is(U == string_t))
{
this._data = u._data;
return this;
}
/**
* Assign a string from another type
*/
string_t opAssign(U)(U u) if (!is(U == string_t) && is(typeof(_data =
u)))
{
_data = u;
return this;
}
}
/**
* String type for dchar
*/
template string_t(T) if (is(Unqual!T == dchar))
{
alias T[] string_t;
}
// support string functions for dchar that aren't already defined.
// TODO: do we need this one?
// TODO: should be inout instead of a template
@property T[] data(T)(T[] t) if (is(Unqual!T == dchar))
{
return t;
}
/**
* Finds the largest valid index in the string that is <= idx.
* Essentially, this can be used to convert arbitrary indexes into valid
* indexes.
*/
size_t charStart(const(dchar)[] t, size_t idx)
{
return idx;
}
/**
* Returns true if the given index starts an encoded dchar.
*/
bool validIdx(const(dchar)[] t, size_t idx)
{
return idx <= t.length;
}
// TODO: do we need this one?
@property size_t codeUnits(const(dchar)[] t)
{
return t.length;
}
/** begin test code **/
import std.stdio;
alias string_t!(immutable char) mystring;
alias string_t!(immutable wchar) mywstring;
alias string_t!(immutable dchar) mydstring;
void main()
{
auto str = mystring("hello");
foreach(dchar d; str) { }
str ~= " world";
str ~= mystring("!!!");
writeln(str.data);
mystring str2 = "blah blah";
str2 = str;
str2 = "blah blah";
}
More information about the Digitalmars-d
mailing list