[review] new string type

Wed Jan 12 01:49:26 PST 2011

I like the direction you're taking but have some quibbles about the details.
Specifically, I'd go for a more complete separation into random-access
code-unit ranges and bidirectional code-point ranges:

On 01/12/10 02:18, Steven Schveighoffer wrote:
>
> // Written in the D programming language.
>
> /**
> Copyright: Copyright Andrei Alexandrescu and Steven Schveighoffer 2008-2010
> License: <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License
> 1.0</a>.
> Authors: $(WEB erdani.org, Andrei Alexandrescu, Steven Schveighoffer)
>
> Copyright Andrei Alexandrescu and Steven Schveighoffer 2008-2010.
> Distributed under the Boost Software License, Version 1.0.
> (See accompanying file LICENSE_1_0.txt or copy at
> http://www.boost.org/LICENSE_1_0.txt)
> */
> import std.utf;
> import std.traits;
>
> struct string_t(T) if (is(Unqual!T == char) || is(Unqual!T == wchar))

Is there a reason not to include is(Unqual!T == dchar)?

> {
> private T[] _data;
> this(T[] data)
> {
> this._data = data;
> }

An opAssign from a T[] could facilitate conversion back and forth 
between code-point and code-unit ranges.

> // note, this assumes that idx is a valid index
> private size_t _charStart(size_t idx) const
> {
> static if(is(Unqual!T == wchar))
> {
> immutable c = _data.ptr[idx];
> if(c >= 0xDC00 && c <= 0xDFFF)
> {
> // surrogate pair detected, verify we have at least 2 wchars,
> // and that both wchars are properly encoded.
> assert(idx > 0, "Invalid UTF character at beginning of string");
> return idx-1;
> }
> else
> return idx;
> }
> else
> {
> const p = _data.ptr + idx;
> if ((p[0] & 0b1100_0000) != 0b1000_0000)
> {
> return idx;
> }
> else if (idx >= 1 && (p[-1] & 0b1100_0000) != 0b1000_0000)
> {
> return idx - 1;
> }
> else if (idx >= 2 && (p[-2] & 0b1100_0000) != 0b1000_0000)
> {
> return idx - 2;
> }
> else if (idx >= 3 && (p[-3] & 0b1100_0000) != 0b1000_0000)
> {
> return idx - 3;
> }
> else
> {
> assert(false, "Invalid UTF character in string");
> }
> }
> }
>
> void popFront()
> {
> auto nc = std.utf.stride(_data, 0);
> assert(nc <= _data.length && nc != 0xFF, "Invalid sequence at beginning
> of string");
> _data = _data[nc .. $];
> }
>
> void popBack()
> {
> immutable n = _data.length;
> assert(n, "Attempting to pop back of an empty string");
> _data = _data.ptr[0.._charStart(n-1)];
> }
>
> @property dchar front() const
> {
> assert(_data.length, "Attempting to fetch the front of an empty string");
> size_t i = 0;
> return decode(_data, i);
> }
>
> @property dchar back() const
> {
> immutable n = _data.length;
> assert(n, "Attempting to fetch the back of an empty string");
> auto idx = _charStart(n-1);
> return std.utf.decode(_data, idx);
> }

There is the alternative of deferring decoding to the user and returning
T[] slices that each hold exactly one code point, instead of dchars.  I'm
not sure which is better, but I'd be interested in seeing a case for
choosing one or the other.

> @property bool empty() const
> {
> return !_data.length;
> }
>
> @property typeof(this) save()
> {
> return this;
> }
>
> // support read-only random access via code unit index.
> dchar opIndex(size_t idx)
> {
> idx = _charStart(idx);
> return std.utf.decode(_data, idx);
> }
>
> string_t opSlice()
> {
> return this;
> }
>
> string_t opSlice(size_t start, size_t end)
> {
> if(start != _data.length)
> start = _charStart(start);
> if(end != _data.length)
> end = _charStart(end);
> return string_t(_data[start..end]);
> }
>
> // note we don't call this length because length can be assumed to be the
> // number of elements, which this isn't.
> @property size_t codeUnits() const
> {
> return _data.length;
> }

I don't see a need for _charStart, opIndex, opSlice and codeUnits.  If
the underlying T[] can be returned by a property, then the operations they
provide can be performed directly on that code-unit array, which is
random-access.

> // support append and concat
> // TODO: need to support appending various types of strings to eachother
> // (right now only same-type strings can be appended, or raw arrays)
> ref string_t opOpAssign(string op, U)(U data) if (op == "~" && is(U ==
> string_t))
> {
> _data ~= data._data;
> return this;
> }
>
> ref string_t opOpAssign(string op, U)(U data) if (op == "~" && !is(U ==
> string_t) && is(typeof(_data ~= U.init)))
> {
> _data ~= data;
> return this;
> }
>
> string_t opBinary(string op, U)(U data) if (op == "~" && is(U == string_t))
> {
> return string_t(_data ~ data._data);
> }
>
> string_t opBinary(string op, U)(U data) if (op == "~" && !is(U ==
> string_t) && is(typeof(_data ~ U.init)))
> {
> return string_t(_data ~ data);
> }
> }
>
> template string_t(T) if (is(Unqual!T == dchar))
> {
> alias T[] string_t;
> }
>
> /** begin test code **/
> import std.stdio;
>
> alias string_t!(immutable char) mystring;
> alias string_t!(immutable wchar) mywstring;
> alias string_t!(immutable dchar) mydstring;
>
> void main()
> {
> auto str = mystring("hello");
> str ~= " world";
> str ~= mystring("!!!");
> writeln(str._data);
> }