[review] new string type

Steven Schveighoffer schveiguy at yahoo.com
Tue Nov 30 07:48:03 PST 2010


In a prior thread, I promised to create a narrow string type that would  
enforce the requirements of narrow strings.  That is, the new string type  
should respect the encoding of narrow strings.

Here is a rough draft, tested minimally, but it does compile and pass  
simple tests.  It's pretty simple, which is what I would expect.  I copied  
a lot of stuff from std.array to get this to work.

The point of this type is -- if we replace what the compiler considers  
"strings" with this type, then we get both the compiler *and* phobos  
agreeing as to what this type is:  A bi-directional range of dchar.

As a bonus, char[] and wchar[] would now become ordinary arrays and be
manipulated consistently with other arrays.  If not done carefully this
could cause problems, but it may also provide more flexibility and
opportunity for performance, instead of the library fighting you on it.

Anyways, here it is, released under the boost license, commence attack ;)


// Written in the D programming language.

/**
Copyright: Copyright Andrei Alexandrescu and Steven Schveighoffer 2008-2010
License:   <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License  
1.0</a>.
Authors:   $(WEB erdani.org, Andrei Alexandrescu), Steven Schveighoffer

Copyright Andrei Alexandrescu and Steven Schveighoffer 2008-2010.
Distributed under the Boost Software License, Version 1.0.
    (See accompanying file LICENSE_1_0.txt or copy at
          http://www.boost.org/LICENSE_1_0.txt)
*/
import std.utf;
import std.traits;

/**
 * Narrow string type: wraps an array of char or wchar code units and exposes
 * it as a range of dchar (front/back/popFront/popBack/empty/save), decoding
 * code points on access instead of handing out raw code units.
 *
 * Indexing and slicing take code unit indexes; an index that falls inside a
 * multi-unit sequence is moved back to the first unit of that sequence.
 *
 * Params:
 *     T = the (possibly qualified) code unit type, char or wchar.
 */
struct string_t(T) if (is(Unqual!T == char) || is(Unqual!T == wchar))
{
     // the underlying code units; never copied by the range primitives
     private T[] _data;

     /// Wraps data as-is; the array is neither copied nor validated here.
     this(T[] data)
     {
         this._data = data;
     }

     // Returns the index of the first code unit of the character that the
     // code unit at idx belongs to.
     // note, this assumes that idx is a valid index (the reads below go
     // through .ptr and are therefore not bounds checked)
     private size_t _charStart(size_t idx) const
     {
         static if(is(Unqual!T == wchar))
         {
             immutable c = _data.ptr[idx];
             if(c >= 0xDC00 && c <= 0xDFFF)
             {
                 // low (trailing) surrogate detected, verify we have at
                 // least 2 wchars, and that the preceding wchar is the high
                 // (leading) surrogate of the pair.
                 assert(idx > 0, "Invalid UTF character at beginning of string");
                 assert(_data.ptr[idx-1] >= 0xD800 && _data.ptr[idx-1] <= 0xDBFF,
                         "Invalid UTF character in string");
                 return idx-1;
             }
             else
                 return idx;
         }
         else
         {
             // A UTF-8 continuation byte has the bit pattern 10xx_xxxx.  A
             // sequence is at most 4 bytes long, so the lead byte is at most
             // 3 positions before idx.
             const p = _data.ptr + idx;
             if ((p[0] & 0b1100_0000) != 0b1000_0000)
             {
                 return idx;
             }
             else if (idx >= 1 && (p[-1] & 0b1100_0000) != 0b1000_0000)
             {
                 return idx - 1;
             }
             else if (idx >= 2 && (p[-2] & 0b1100_0000) != 0b1000_0000)
             {
                 return idx - 2;
             }
             else if (idx >= 3 && (p[-3] & 0b1100_0000) != 0b1000_0000)
             {
                 return idx - 3;
             }
             else
             {
                 assert(false, "Invalid UTF character in string");
             }
         }
     }

     /// Drops the first character (all of its code units) from the range.
     void popFront()
     {
         assert(_data.length, "Attempting to pop front of an empty string");
         auto nc = std.utf.stride(_data, 0);
         assert(nc <= _data.length && nc != 0xFF, "Invalid sequence at beginning of string");
         _data = _data[nc .. $];
     }

     /// Drops the last character (all of its code units) from the range.
     void popBack()
     {
         immutable n = _data.length;
         assert(n, "Attempting to pop back of an empty string");
         // _charStart(n-1) <= n-1, so the unchecked .ptr slice stays in bounds
         _data = _data.ptr[0.._charStart(n-1)];
     }

     /// Decodes and returns the first character.
     @property dchar front() const
     {
         assert(_data.length, "Attempting to fetch the front of an empty string");
         size_t i = 0;
         return decode(_data, i);
     }

     /// Decodes and returns the last character.
     @property dchar back() const
     {
         immutable n = _data.length;
         assert(n, "Attempting to fetch the back of an empty string");
         auto idx = _charStart(n-1);
         return std.utf.decode(_data, idx);
     }

     /// True when no code units remain.
     @property bool empty() const
     {
         return !_data.length;
     }

     /// Returns a copy of this range; the underlying array is shared.
     @property typeof(this) save()
     {
         return this;
     }

     // support read-only random access via code unit index.
     /// Decodes the character that the code unit at idx belongs to.
     dchar opIndex(size_t idx) const
     {
         // _charStart does unchecked reads, so validate the index first
         assert(idx < _data.length, "Attempting to index past the end of a string");
         idx = _charStart(idx);
         return std.utf.decode(_data, idx);
     }

     /// Whole-range slice; returns a copy of this range.
     string_t opSlice()
     {
         return this;
     }

     /// Slices by code unit index.  Either bound that falls inside a
     /// multi-unit sequence is moved back to the start of that sequence.
     string_t opSlice(size_t start, size_t end)
     {
         // _charStart does unchecked reads, so validate both bounds first;
         // a bound equal to the length is legal but has no unit to inspect.
         assert(start <= _data.length && end <= _data.length,
                 "Attempting to slice past the end of a string");
         if(start != _data.length)
             start = _charStart(start);
         if(end != _data.length)
             end = _charStart(end);
         return string_t(_data[start..end]);
     }

     // note we don't call this length because length can be assumed to be
     // the number of elements, which this isn't.
     /// Number of code units (not characters) in the string.
     @property size_t codeUnits() const
     {
         return _data.length;
     }

     // support append and concat
     // TODO: need to support appending various types of strings to each
     // other (right now only same-type strings can be appended, or raw
     // arrays)

     /// Appends another string_t of the same type in place.
     ref string_t opOpAssign(string op, U)(U data)
         if (op == "~" && is(U == string_t))
     {
         _data ~= data._data;
         return this;
     }

     /// Appends anything the underlying array itself accepts with ~=.
     ref string_t opOpAssign(string op, U)(U data)
         if (op == "~" && !is(U == string_t) && is(typeof(_data ~= U.init)))
     {
         _data ~= data;
         return this;
     }

     /// Concatenates with another string_t of the same type.
     string_t opBinary(string op, U)(U data)
         if (op == "~" && is(U == string_t))
     {
         return string_t(_data ~ data._data);
     }

     /// Concatenates with anything the underlying array accepts with ~.
     string_t opBinary(string op, U)(U data)
         if (op == "~" && !is(U == string_t) && is(typeof(_data ~ U.init)))
     {
         return string_t(_data ~ data);
     }
}

// For dchar (in any qualification) no wrapper struct is defined:
// string_t!T is simply an alias for the plain array T[].
template string_t(T) if (is(Unqual!T == dchar))
{
     alias T[] string_t;
}

/** begin test code **/
import std.stdio;

// Shorthand names for the three immutable instantiations of string_t used
// by the test code below.
alias string_t!(immutable char) mystring;
alias string_t!(immutable wchar) mywstring;
alias string_t!(immutable dchar) mydstring;

// Smoke test: builds a mystring, appends a raw string literal and then
// another mystring, and prints the resulting code units.
void main()
{
     mystring greeting = mystring("hello");

     // append a plain array of code units
     greeting ~= " world";

     // append another string of the same type
     greeting ~= mystring("!!!");

     writeln(greeting._data);
}


More information about the Digitalmars-d mailing list