[review] new string type (take 2)

Thu Jan 13 08:24:37 PST 2011

Based on the suggestions of others, I have updated my string type.

changes:

* opApply for iterating with foreach w/ index
* indexing the string at an invalid location (i.e. not the start of a code  
point) throws a RangeError (does that make sense, or should it be an  
exception?).
* charStart is now public so you can use it to ensure you are accessing  
the start of a code point
* validIdx is a new function that tells you whether your index is at the  
start of a code point.
* data property which gets the underlying T[]
* Added free functions for string_t!dchar to make it have the same  
properties as the other string types.
* Added an ability to assign to a T[] array for ease of use.
* Added a ptr property so it works seamlessly with code that currently  
uses strings (we still need $ to work as well, but it appears this isn't  
implemented yet in dmd).
* fully documented

Here it is:

// Written in the D programming language.

/**
Copyright: Copyright Andrei Alexandrescu and Steven Schveighoffer 2008-2010
License:   <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License  
1.0</a>.
Authors:   $(WEB erdani.org, Andrei Alexandrescu, Steven Schveighoffer)

Copyright Andrei Alexandrescu and Steven Schveighoffer 2008-2010.
Distributed under the Boost Software License, Version 1.0.
    (See accompanying file LICENSE_1_0.txt or copy at
          http://www.boost.org/LICENSE_1_0.txt)
*/
import std.utf;
import std.traits;
import core.exception;

// BUG workaround: this version flag enables the dchar-only opApply overload
// in string_t.  Per the author's note on that overload, foreach should be
// able to use the range interface instead, but the compiler does not allow
// opApply and the range interface to coexist for foreach.
version = norangeopapplyoverload;

/**
  * Narrow string type.  This string implements utf-8 or utf-16 depending on
  * the character type.
  *
  * The string type is a bi-directional range of dchars, with a monotonic
  * indexing scheme.  Essentially, because dchars are encoded at variable
  * widths, indexing and slicing based on dchars would be an O(n) operation.
  * Therefore, we allow indexing and slicing, but based on the 'code-unit' or
  * unit of encoding (wchar or char).  This means some indexes are valid and
  * some are not (those which do not point to the start of an encoded dchar
  * are not).
  *
  * While this might seem confusing, it is very rare that one needs arbitrary
  * index access.  In order to achieve this you can either ensure to yourself
  * that the string's code-unit to dchar ratio is 1 to 1 (never more than one
  * code-unit to encode a dchar), or you can use the charStart function to
  * ensure a valid index.
  */
struct string_t(T) if (is(Unqual!T == char) || is(Unqual!T == wchar))
{
     private T[] _data;

     /**
      * Constructor, builds a string based on given array data.
      *
      * The array is stored as-is (no copy is made here).
      */
     this(T[] data)
     {
         this._data = data;
     }

     /**
      * Provide access to underlying array data.
      */
     @property T[] data()
     {
         return _data;
     }

     /**
      * Forward access to the ptr part of the array.  This allows the most
      * efficient operations when one knows what he is doing.
      */
     @property T* ptr()
     {
         return _data.ptr;
     }

     /**
      * Finds the largest valid index in the string that is <= idx.
      * Essentially, this can be used to convert arbitrary indexes into valid
      * indexes.
      *
      * Note: idx itself is not bounds-checked; the data is read through the
      * raw pointer, so the caller must pass an index inside the string.
      */
     size_t charStart(size_t idx) const
     {
         static if(is(Unqual!T == wchar))
         {
             immutable c = _data.ptr[idx];
             if(c >= 0xDC00 && c <= 0xDFFF)
             {
                 // Trailing (low) surrogate detected: the code point starts
                 // one unit earlier, so there must be a preceding wchar.
                 assert(idx > 0, "Invalid UTF character at beginning of string");
                 return idx-1;
             }
             else
                 return idx;
         }
         else
         {
             // A UTF-8 continuation byte has the bit pattern 10xx_xxxx.
             // Step back (at most 3 bytes) until a non-continuation byte
             // is found.
             const p = _data.ptr + idx;
             if ((p[0] & 0b1100_0000) != 0b1000_0000)
             {
                 return idx;
             }
             else if (idx >= 1 && (p[-1] & 0b1100_0000) != 0b1000_0000)
             {
                 return idx - 1;
             }
             else if (idx >= 2 && (p[-2] & 0b1100_0000) != 0b1000_0000)
             {
                 return idx - 2;
             }
             else if (idx >= 3 && (p[-3] & 0b1100_0000) != 0b1000_0000)
             {
                 return idx - 3;
             }
             else
             {
                 assert(false, "Invalid UTF character in string");
             }
         }
     }

     /**
      * Returns true if the given index starts an encoded dchar.
      *
      * The index one past the end of the string is also considered valid
      * (it is a legal slice end); anything beyond that is not.
      */
     bool validIdx(size_t idx) const
     {
         if(idx >= _data.length)
         {
             if(idx is _data.length)
                 return true; // index one beyond the string is valid.
             return false;
         }
         immutable c = _data[idx];
         static if(is(Unqual!T == wchar))
         {
             // make sure this isn't the second character of a surrogate pair
             return (c < 0xDC00 || c > 0xDFFF);
         }
         else // char
         {
             // anything but a continuation byte (10xx_xxxx) starts a sequence
             return ((c & 0b1100_0000) != 0b1000_0000);
         }
     }

     /**
      * Remove the first code-point from the string.
      */
     void popFront()
     {
         assert(_data.length, "Attempting to pop front of an empty string");
         auto nc = std.utf.stride(_data, 0);
         assert(nc <= _data.length && nc != 0xFF, "Invalid sequence at beginning of string");
         _data = _data[nc .. $];
     }

     /**
      * Remove the last code-point from the string.
      */
     void popBack()
     {
         immutable n = _data.length;
         assert(n, "Attempting to pop back of an empty string");
         // charStart(n-1) is the index where the last code point begins.
         _data = _data.ptr[0..charStart(n-1)];
     }

     /**
      * Get the first code-point in the string.
      */
     @property dchar front() const
     {
         assert(_data.length, "Attempting to fetch the front of an empty string");
         size_t i = 0;
         return std.utf.decode(_data, i);
     }

     /**
      * Get the last code-point in the string.
      */
     @property dchar back() const
     {
         immutable n = _data.length;
         assert(n, "Attempting to fetch the back of an empty string");
         auto idx = charStart(n-1);
         return std.utf.decode(_data, idx);
     }

     /**
      * Does the string contain any data?  Returns true when it does not.
      */
     @property bool empty() const
     {
         return !_data.length;
     }

     /**
      * Copy the string (trivial function, needed for range definitions).
      */
     @property typeof(this) save()
     {
         return this;
     }

     /**
      * Support read-only random access via code unit index.
      *
      * Throws: RangeError if idx is the end of the string, is past the end,
      * or does not start a code point.
      */
     dchar opIndex(size_t idx) const
     {
         // validIdx accepts idx == length (a legal slice end), but there is
         // nothing to decode there, so reject it explicitly.
         if(idx is _data.length || !validIdx(idx))
             throw new RangeError(__FILE__, __LINE__);

         return std.utf.decode(_data, idx);
     }

     /**
      * Slice the whole string.
      */
     string_t opSlice()
     {
         return this;
     }

     /**
      * Slice based on valid start and end indexes.
      *
      * Throws: RangeError if end < start, or if start or end are not valid
      * indexes.
      */
     string_t opSlice(size_t start, size_t end)
     {
         if(end < start || !validIdx(start) || !validIdx(end))
             throw new RangeError(__FILE__, __LINE__);
         return string_t(_data[start..end]);
     }

     /**
      * Get the number of code units in the string.
      *
      * Note that this is specifically not called length because length
      * generally implies the number of elements in a range.  Since dchar is
      * our element type, and the number of dchars cannot be determined in
      * O(1) time, using the name length would be incorrect.
      */
     @property size_t codeUnits() const
     {
         return _data.length;
     }

     /**
      * Append a string to this string.
      *
      * TODO: support appending any string type to this string type.
      */
     ref string_t opOpAssign(string op, U)(U data)
         if (op == "~" && is(U == string_t))
     {
         _data ~= data._data;
         return this;
     }

     /**
      * Support appending any types that the underlying array can support.
      */
     ref string_t opOpAssign(string op, U)(U data)
         if (op == "~" && !is(U == string_t) && is(typeof(_data ~= U.init)))
     {
         _data ~= data;
         return this;
     }

     /**
      * Concatenation between two strings.
      */
     string_t opBinary(string op, U)(U data)
         if (op == "~" && is(U == string_t))
     {
         return string_t(_data ~ data._data);
     }

     /**
      * Support any concatenation that is supported by the underlying data
      * array.
      */
     string_t opBinary(string op, U)(U data)
         if (op == "~" && !is(U == string_t) && is(typeof(_data ~ U.init)))
     {
         return string_t(_data ~ data);
     }

     // note, this should not be required, it should use the range interface
     // but the compiler doesn't allow both to coexist for foreach.
     version(norangeopapplyoverload)
     {
         /**
          * Foreach with just dchars.  Note, you cannot actually change the
          * data, despite the argument being ref.  opApply requires ref.
          */
         int opApply(scope int delegate(ref dchar d) dg)
         {
             dchar d;
             size_t idx = 0;
             immutable len = _data.length;
             int result = 0;
             while(result == 0 && idx < len)
             {
                 // decode advances idx past the code point it returns
                 d = std.utf.decode(_data, idx);
                 result = dg(d);
             }
             return result;
         }
     }

     /**
      * Foreach over the string with accompanied index.
      *
      * The index passed to the delegate is the code-unit index at which the
      * code point starts.
      *
      * Note, the refs are required for foreach, you cannot change the index
      * or the data in the string.
      */
     int opApply(scope int delegate(ref size_t idx, ref dchar d) dg)
     {
         dchar d;
         size_t idx = 0;
         immutable len = _data.length;
         int result = 0;
         while(result == 0 && idx < len)
         {
             // hand the delegate a copy so it cannot disturb the iteration
             size_t tmpidx = idx;
             d = std.utf.decode(_data, idx);
             result = dg(tmpidx, d);
         }
         return result;
     }

     /**
      * Assign a string.
      */
     string_t opAssign(U)(U u) if(is(U == string_t))
     {
         this._data = u._data;
         return this;
     }

     /**
      * Assign a string from another type (anything assignable to the
      * underlying array).
      */
     string_t opAssign(U)(U u)
         if (!is(U == string_t) && is(typeof(_data = u)))
     {
         _data = u;
         return this;
     }
}

/**
  * String type for dchar.
  *
  * Every dchar code unit is a whole code point, so no wrapper struct is
  * needed: for dchar, string_t is simply an alias of the plain array type.
  */
template string_t(T)
     if (is(Unqual!(T) == dchar))
{
     alias T[] string_t;
}

// support string functions for dchar that aren't already defined.

// TODO: do we need this one?
// TODO: should be inout instead of a template
/**
  * Counterpart of string_t.data for dchar arrays: hands back the array it
  * was given, unchanged.
  */
@property T[] data(T)(T[] t)
     if (is(Unqual!(T) == dchar))
{
     return t;
}

/**
  * Counterpart of string_t.charStart for dchar arrays.
  *
  * Returns idx unchanged; the array argument is not inspected and idx is not
  * bounds-checked.
  */
size_t charStart(const(dchar)[] t, size_t idx)
{
     // The requested index is handed straight back.
     return idx;
}

/**
  * Counterpart of string_t.validIdx for dchar arrays.
  *
  * Returns true for every index from 0 up to and including t.length (the
  * one-past-the-end position), and false for anything larger.
  */
bool validIdx(const(dchar)[] t, size_t idx)
{
     immutable pastEnd = t.length < idx;
     return !pastEnd;
}

// TODO: do we need this one?
/**
  * Counterpart of string_t.codeUnits for dchar arrays: the number of
  * elements in the array.
  */
@property size_t codeUnits(const(dchar)[] t)
{
     immutable count = t.length;
     return count;
}

/** begin test code **/
import std.stdio;

// Convenience names for the three immutable instantiations of string_t
// used by the test code below.
alias string_t!(immutable(char))  mystring;
alias string_t!(immutable(wchar)) mywstring;
alias string_t!(immutable(dchar)) mydstring;

// Smoke test: exercises construction, foreach, appending, data access and
// assignment on mystring.  Nothing is asserted; the only output is the
// concatenated string.
void main()
{
     mystring greeting = mystring("hello");

     // iterate by code point (body intentionally empty)
     foreach (dchar ch; greeting)
     {
     }

     // append a plain array, then another mystring
     greeting ~= " world";
     greeting ~= mystring("!!!");
     writeln(greeting.data);

     // initialise from an array, assign from a mystring, then from an array
     mystring other = "blah blah";
     other = greeting;
     other = "blah blah";
}