[review] new string type (take 2)

Steven Schveighoffer schveiguy at yahoo.com
Thu Jan 13 08:24:37 PST 2011

Based on the suggestions of others, I have updated my string type.


* opApply for iterating with foreach w/ index
* indexing the string at an invalid location (i.e. not the start of a code  
point) throws a RangeError (does that make sense, or should it be an  
assert?)
* charStart is now public so you can use it to ensure you are accessing  
the start of a code point
* validIdx new function that tells you if your index is at the start of a  
code point.
* data property which gets the underlying T[]
* Added free functions for string_t!dchar to make it have the same  
properties as the other string types.
* Added an ability to assign to a T[] array for ease of use.
* Added a ptr property so it works seamlessly with code that currently  
uses strings (but we still need $, however it appears this isn't  
implemented yet in dmd).
* fully documented

Here it is:

// Written in the D programming language.

/**
Copyright: Copyright Andrei Alexandrescu and Steven Schveighoffer 2008-2010
License:   <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>.
Authors:   $(WEB erdani.org, Andrei Alexandrescu, Steven Schveighoffer)

Copyright Andrei Alexandrescu and Steven Schveighoffer 2008-2010.
Distributed under the Boost Software License, Version 1.0.
    (See accompanying file LICENSE_1_0.txt or copy at
    http://www.boost.org/LICENSE_1_0.txt)
*/
import std.utf;
import std.traits;
import core.exception;

// BUG
version = norangeopapplyoverload;

/**
 * Narrow string type.  This string implements utf-8 or utf-16 depending on the
 * character type.
 * The string type is a bi-directional range of dchars, with a monotonic
 * indexing scheme.  Essentially, because dchars are encoded at variable
 * widths, indexing and slicing based on dchars would be an O(n) operation.
 * Therefore, we allow indexing and slicing, but based on the 'code-unit', the
 * unit of encoding (wchar or char).  This means some indexes are valid and
 * some are not (those which do not point to the start of an encoded dchar are
 * not).
 * While this might seem confusing, it is very rare that one needs random
 * index access.  In order to achieve this you can either ensure
 * that the string's code-unit to dchar ratio is 1 to 1 (never more than one
 * code-unit to encode a dchar), or you can use the charStart function to
 * ensure a valid index.
 */
struct string_t(T) if (is(Unqual!T == char) || is(Unqual!T == wchar))
{
    // Wrapper around a T[] of UTF-8 (char) or UTF-16 (wchar) code units that
    // presents itself as a bidirectional range of dchar.  Indexes and slice
    // bounds are code-unit offsets and must land on the start of a code point.

    private T[] _data;

    /**
     * Constructor, builds a string based on given array data.
     *
     * Params:
     *   data = the code units to wrap; the array is stored, not copied.
     */
    this(T[] data)
    {
        this._data = data;
    }

    /**
     * Provide access to underlying array data.
     */
    @property T[] data()
    {
        return _data;
    }

    /**
     * Forward access to the ptr part of the array.  This allows the most
     * efficient operations when one knows what he is doing.
     */
    @property T* ptr()
    {
        return _data.ptr;
    }

    /**
     * Finds the largest valid index in the string that is <= idx.
     * Essentially, this can be used to convert arbitrary indexes into valid
     * indexes.
     *
     * Params:
     *   idx = a code-unit index, at most codeUnits.
     *
     * Returns: idx itself when it already starts a code point (or is the
     * one-past-the-end index), otherwise the index of the code unit that
     * starts the code point containing idx.
     */
    size_t charStart(size_t idx) const
    {
        // validIdx treats the one-past-the-end index as valid, so return it
        // as-is instead of reading beyond the array through the raw pointer.
        if (idx >= _data.length)
        {
            assert(idx == _data.length, "Index beyond the end of the string");
            return idx;
        }

        static if (is(Unqual!T == wchar))
        {
            immutable c = _data.ptr[idx];
            if (c >= 0xDC00 && c <= 0xDFFF)
            {
                // Low (trailing) surrogate: the code point starts one unit
                // earlier, so there must be a unit before this one.
                assert(idx > 0, "Invalid UTF character at beginning of string");
                return idx - 1;
            }
            else
                return idx;
        }
        else
        {
            // UTF-8: bytes of the form 10xxxxxx are continuation bytes.  Walk
            // back at most three positions to find the lead byte.
            const p = _data.ptr + idx;
            if ((p[0] & 0b1100_0000) != 0b1000_0000)
                return idx;
            else if (idx >= 1 && (p[-1] & 0b1100_0000) != 0b1000_0000)
                return idx - 1;
            else if (idx >= 2 && (p[-2] & 0b1100_0000) != 0b1000_0000)
                return idx - 2;
            else if (idx >= 3 && (p[-3] & 0b1100_0000) != 0b1000_0000)
                return idx - 3;
            else
                assert(false, "Invalid UTF character in string");
        }
    }

    /**
     * Returns true if the given index starts an encoded dchar.
     *
     * The index equal to codeUnits (one beyond the last code unit) is
     * considered valid; anything larger is not.
     */
    bool validIdx(size_t idx) const
    {
        if (idx >= _data.length)
        {
            // index one beyond the string is valid.
            return idx == _data.length;
        }

        immutable c = _data[idx];
        static if (is(Unqual!T == wchar))
        {
            // make sure this isn't the second character of a surrogate pair
            return (c < 0xDC00 || c > 0xDFFF);
        }
        else // char
        {
            // anything except a 10xxxxxx continuation byte starts a sequence
            return ((c & 0b1100_0000) != 0b1000_0000);
        }
    }

    /**
     * Remove the first code-point from the string.
     */
    void popFront()
    {
        auto nc = std.utf.stride(_data, 0);
        // NOTE(review): 0xFF is presumably the invalid-sequence sentinel of
        // std.utf.stride at the time of writing -- confirm against Phobos.
        assert(nc <= _data.length && nc != 0xFF, "Invalid sequence at beginning of string");
        _data = _data[nc .. $];
    }

    /**
     * Remove the last code-point from the string.
     */
    void popBack()
    {
        immutable n = _data.length;
        assert(n, "Attempting to pop back of an empty string");
        // Keep everything before the start of the final code point.
        _data = _data.ptr[0 .. charStart(n - 1)];
    }

    /**
     * Get the first code-point in the string.
     */
    @property dchar front() const
    {
        assert(_data.length, "Attempting to fetch the front of an empty string");
        size_t i = 0;
        return std.utf.decode(_data, i);
    }

    /**
     * Get the last code-point in the string.
     */
    @property dchar back() const
    {
        immutable n = _data.length;
        assert(n, "Attempting to fetch the back of an empty string");
        auto idx = charStart(n - 1);
        return std.utf.decode(_data, idx);
    }

    /**
     * Does the string contain any data?
     */
    @property bool empty() const
    {
        return !_data.length;
    }

    /**
     * Copy the string (trivial function, needed for range definitions).
     */
    @property typeof(this) save()
    {
        return this;
    }

    /**
     * Support read-only random access via code unit index.
     *
     * Throws: RangeError if idx is the one-past-the-end index or does not
     * start a code point.
     */
    dchar opIndex(size_t idx)
    {
        if (idx == _data.length || !validIdx(idx))
            throw new RangeError(__FILE__, __LINE__);

        // idx is a by-value parameter, so decode advancing it is harmless.
        return std.utf.decode(_data, idx);
    }

    /**
     * Slice the whole string.
     */
    string_t opSlice()
    {
        return this;
    }

    /**
     * Slice based on valid start and end indexes.
     *
     * Throws: RangeError if end < start or if start or end are not valid
     * indexes.
     */
    string_t opSlice(size_t start, size_t end)
    {
        if (end < start || !validIdx(start) || !validIdx(end))
            throw new RangeError(__FILE__, __LINE__);
        return string_t(_data[start .. end]);
    }

    /**
     * Get the number of code units in the string.
     *
     * Note that this is specifically not called length because length
     * generally implies the number of elements in a range.  Since dchar is our
     * element type, and the number of dchars cannot be determined in O(1)
     * time, using the name length would be incorrect.
     */
    @property size_t codeUnits() const
    {
        return _data.length;
    }

    /**
     * Append a string to this string.
     *
     * TODO: support appending any string type to this string type.
     */
    ref string_t opOpAssign(string op, U)(U data) if (op == "~" && is(U == string_t))
    {
        _data ~= data._data;
        return this;
    }

    /**
     * Support appending any types that the underlying array can support.
     */
    ref string_t opOpAssign(string op, U)(U data) if (op == "~" && !is(U == string_t) && is(typeof(_data ~= U.init)))
    {
        _data ~= data;
        return this;
    }

    /**
     * Concatenation between two strings.
     */
    string_t opBinary(string op, U)(U data) if (op == "~" && is(U == string_t))
    {
        return string_t(_data ~ data._data);
    }

    /**
     * Support any concatenation that is supported by the underlying data
     * array.
     */
    string_t opBinary(string op, U)(U data) if (op == "~" && !is(U == string_t) && is(typeof(_data ~ U.init)))
    {
        return string_t(_data ~ data);
    }

    // note, this should not be required, it should use the range interface but
    // the compiler doesn't allow both to coexist for foreach.
    // NOTE(review): the version guard below is inferred from the module-level
    // `version = norangeopapplyoverload;` -- confirm against the original.
    version(norangeopapplyoverload)
    {
        /**
         * Foreach with just dchars.  Note, you cannot actually change the
         * data, despite the argument being ref.  opApply requires ref.
         */
        int opApply(scope int delegate(ref dchar d) dg)
        {
            dchar d;
            size_t idx = 0;
            immutable len = _data.length;
            int result = 0;
            while (result == 0 && idx < len)
            {
                // decode advances idx past the code point it returns
                d = std.utf.decode(_data, idx);
                result = dg(d);
            }
            return result;
        }
    }

    /**
     * Foreach over the string with accompanied index.
     *
     * Note, the refs are required for foreach, you cannot change the index or
     * the data in the string.
     */
    int opApply(scope int delegate(ref size_t idx, ref dchar d) dg)
    {
        dchar d;
        size_t idx = 0;
        immutable len = _data.length;
        int result = 0;
        while (result == 0 && idx < len)
        {
            // Hand the delegate a copy of the starting index so that writes
            // through its ref parameter cannot disturb the iteration.
            size_t tmpidx = idx;
            d = std.utf.decode(_data, idx);
            result = dg(tmpidx, d);
        }
        return result;
    }

    /**
     * Assign a string.
     */
    string_t opAssign(U)(U u) if (is(U == string_t))
    {
        this._data = u._data;
        return this;
    }

    /**
     * Assign a string from another type (anything assignable to the
     * underlying array).
     */
    string_t opAssign(U)(U u) if (!is(U == string_t) && is(typeof(_data = U.init)))
    {
        _data = u;
        return this;
    }
}

/**
 * String type for dchar
 */
template string_t(T) if (is(Unqual!T == dchar))
{
    // For dchar the string type is simply an alias for the plain array T[].
    alias T[] string_t;
}

// support string functions for dchar that aren't already defined.

// TODO: do we need this one?
// TODO: should be inout instead of a template
@property T[] data(T)(T[] t) if (is(Unqual!T == dchar))
{
    // A dchar array is its own underlying data; return it unchanged.
    return t;
}

/**
 * Finds the largest valid index in the string that is <= idx.
 * Essentially, this can be used to convert arbitrary indexes into valid
 * indexes.
 */
size_t charStart(const(dchar)[] t, size_t idx)
{
    // This overload returns idx unchanged; t is not inspected.
    return idx;
}

/**
 * Returns true if the given index starts an encoded dchar.
 */
bool validIdx(const(dchar)[] t, size_t idx)
{
    // Any index up to and including t.length (one past the end) is valid.
    return idx <= t.length;
}

// TODO: do we need this one?
@property size_t codeUnits(const(dchar)[] t)
{
    // Returns the array length of t.
    return t.length;
}

/** begin test code **/
import std.stdio;

alias string_t!(immutable char) mystring;
alias string_t!(immutable wchar) mywstring;
alias string_t!(immutable dchar) mydstring;

void main()
{
    // Smoke test: construction, foreach, appending and assignment.
    auto str = mystring("hello");
    foreach(dchar d; str) { }
    str ~= " world";          // append a plain array
    str ~= mystring("!!!");   // append another string_t
    mystring str2 = "blah blah";
    str2 = str;               // assign from string_t
    str2 = "blah blah";       // assign from a plain array
}

More information about the Digitalmars-d mailing list