[Issue 12768] @nogc algorithm.splitter

via Digitalmars-d-bugs digitalmars-d-bugs at puremagic.com
Tue May 20 03:37:43 PDT 2014


https://issues.dlang.org/show_bug.cgi?id=12768

--- Comment #1 from bearophile_hugs at eml.cc ---
The problem is not straightforward to solve.


You can fix splitter by allocating the RangeError statically:


/**
 * Returns a forward range over the whitespace-separated words of `s`.
 *
 * The input is stripped once on construction and left-stripped after every
 * `popFront`, so each `front` is a slice of `s` running up to the first
 * character matched by `isWhite`.
 *
 * In builds where `version(assert)` is active, calling `front` or `popFront`
 * on an empty range throws a statically allocated `RangeError`.
 */
auto splitter(C)(C[] s) if (isSomeChar!C) {
    static struct Result {
    private:
        import core.exception;
        C[] _s;               // Unconsumed input; begins at the current word.
        size_t _frontLength;  // Length of the current word at the head of _s.
        // Created as a static initializer so the throw sites below never
        // need to allocate a new error object.
        static err = new immutable(RangeError)();

        // Measures the current word: everything before the first isWhite match.
        void getFirst() pure @safe @nogc {
            immutable tailLength = find!(isWhite)(_s).length;
            _frontLength = _s.length - tailLength;
        }

    public:
        // Strips surrounding whitespace and locates the first word.
        this(C[] s) pure @safe @nogc {
            import std.string: strip;
            _s = s.strip();
            getFirst();
        }

        // The current word, as a slice of the remaining input.
        @property C[] front() pure @safe @nogc {
            version(assert) {
                if (empty) throw err;
            }
            return _s[0 .. _frontLength];
        }

        // Drops the current word and the whitespace after it, then measures
        // the next word.
        void popFront() pure @safe @nogc {
            import std.string: stripLeft;
            version(assert) {
                if (empty) throw err;
            }
            auto rest = _s[_frontLength .. $];
            _s = rest.stripLeft();
            getFirst();
        }

        // True once no input remains.
        @property bool empty() const @safe pure nothrow @nogc {
            return _s.length == 0;
        }

        // The range holds only a slice and a length, so a copy is a valid save.
        @property inout(Result) save() inout @safe pure nothrow @nogc {
            return this;
        }
    }

    return Result(s);
}



But std.uni.isWhite is not @nogc. You can fix it:

/**
 * Tests whether `ch` is one of the whitespace code points recognised by this
 * table: U+0009..U+000D, U+0020, U+0085, U+00A0, U+1680, U+2000..U+200A,
 * U+2028, U+2029, U+202F, U+205F and U+3000.
 *
 * Params:
 *   ch = code point to classify
 * Returns: `true` if `ch` is in the set above, `false` otherwise.
 */
bool isWhiteGen(dchar ch) @safe pure nothrow @nogc
{
    // The guards run in ascending order, so each test only has to tell apart
    // the values that the previous guards let through.
    if (ch < 0x0009) return false;
    if (ch < 0x000E) return true;                           // U+0009..U+000D
    if (ch < 0x0086) return ch == 0x0020 || ch == 0x0085;   // space, U+0085
    if (ch < 0x1681) return ch == 0x00A0 || ch == 0x1680;   // U+00A0, U+1680
    if (ch < 0x2000) return false;
    if (ch < 0x200B) return true;                           // U+2000..U+200A
    if (ch < 0x202A) return ch >= 0x2028;                   // U+2028, U+2029
    if (ch < 0x2060) return ch == 0x202F || ch == 0x205F;   // U+202F, U+205F
    return ch == 0x3000;                                    // U+3000
}



/// Reports whether `c` is whitespace by delegating to `isWhiteGen`.
public bool isWhite(dchar c) @safe pure @nogc nothrow {
    // Delegates to the pregenerated binary search.
    immutable bool white = isWhiteGen(c);
    return white;
}


But now it's std.algorithm.find that is not @nogc. With a few experiments you can see
that std.algorithm.find fails to be @nogc only with strings, so the culprit is
std.utf.decode:


/**
 * Decodes the code point that starts at `str[index]` and advances `index`
 * past it.
 *
 * Code units below `codeUnitLimit!S` are returned directly; anything else is
 * handed to `decodeImpl`.
 *
 * Params:
 *   str   = string to decode from
 *   index = position of the first code unit; updated to the position after
 *           the decoded code point
 * Returns: the decoded code point.
 */
dchar decode(S)(auto ref S str, ref size_t index) @trusted pure @nogc
    if (isSomeString!S)
in
{
    assert(index < str.length, "Attempted to decode past the end of a string");
}
out (result)
{
    assert(isValidDchar(result));
}
body
{
    // Fast path: a code unit below the limit is returned as-is.
    immutable unit = str[index];
    if (unit < codeUnitLimit!S)
    {
        ++index;
        return unit;
    }

    // Slow path: everything else goes through the general decoder.
    return decodeImpl!true(str, index);
}



codeUnitLimit is OK, and fixing isValidDchar is easy, just add @nogc:

/**
 * Checks that `c` is at most 0x10FFFF and lies outside the range
 * 0xD800..0xDFFF.
 *
 * Params:
 *   c = code point to validate
 * Returns: `true` if `c` passes both checks.
 */
bool isValidDchar(dchar c) pure nothrow @safe @nogc {
    /* Note: FFFE and FFFF are specifically permitted by the
     * Unicode standard for application internal use, but are not
     * allowed for interchange, so they are deliberately accepted here.
     * (thanks to Arcane Jill)
     */

    // Everything below 0xD800 is valid.
    if (c < 0xD800)
        return true;

    // Above 0xDFFF, only values up to 0x10FFFF are valid.
    return c > 0xDFFF && c <= 0x10FFFF;
}



But fixing decodeImpl is less easy.


It contains three functions. The cases where canIndex is false are easy to
handle: just allocate the UTFExceptions as static immutables. When canIndex is
true, the function exception() is harder to make @nogc:


    static if (canIndex)
    {
        /**
         * Builds a UTFException for `msg` and attaches the offending code
         * units: the first unit of `str` plus the units that follow it while
         * their top two bits are `10`, up to four units in total.
         */
        static UTFException exception(S)(S str, string msg)
        {
            uint[4] sequence = void;

            // The first unit is always recorded.
            sequence[0] = str[0];
            size_t count = 1;

            // Keep collecting while the next unit has its top two bits set to 10.
            while (count < str.length && count < 4 && (str[count] & 0xC0) == 0x80)
            {
                sequence[count] = str[count];
                ++count;
            }

            return new UTFException(msg, count).setSequence(sequence[0 .. count]);
        }
    }

    /**
     * Builds the exception reported for an invalid UTF-8 sequence.
     *
     * When `canIndex` is true the offending code units are attached through
     * `exception()`; otherwise only the message is available.
     * NOTE(review): `pstr` and `length` come from the enclosing scope, which
     * is not part of this excerpt.
     */
    UTFException invalidUTF()
    {
        static if (canIndex)
            return exception(pstr[0 .. length], "Invalid UTF-8 sequence");
        else
        {
            // We can't include the invalid sequence with input strings
            // without saving each of the code units along the way, and we
            // can't do it with forward ranges without saving the entire
            // range. Both would incur a cost for the decoding of every
            // character just to provide a better error message for the
            // (hopefully) rare case when an invalid UTF-8 sequence is
            // encountered, so we don't bother trying to include the invalid
            // sequence here, unlike with strings and sliceable ranges.
            return new UTFException("Invalid UTF-8 sequence");
        }
    }

    /**
     * Builds the exception reported when decoding would run past the end of
     * the input.
     *
     * When `canIndex` is true the code units seen so far are attached through
     * `exception()`; otherwise only the message is available.
     * NOTE(review): `pstr` and `length` come from the enclosing scope, which
     * is not part of this excerpt.
     */
    UTFException outOfBounds()
    {
        // Each message is kept on a single line: a D string literal that
        // spans lines would embed the newline in the error text.
        static if (canIndex)
            return exception(pstr[0 .. length], "Attempted to decode past the end of a string");
        else
            return new UTFException("Attempted to decode past the end of a string");
    }



UTFException.setSequence is easy to annotate as @nogc, this allows invalidUTF
and outOfBounds to be tagged @nogc as long as exception() is @nogc:

    /**
     * Records the code units that make up the offending sequence.
     *
     * Params:
     *   data = the code units; at most four are expected (asserted)
     * Returns: `this`, so the call can be chained onto a constructor call.
     */
    @safe pure nothrow @nogc
    UTFException setSequence(uint[] data...) {
        assert(data.length <= 4);

        // Clamp to four so the copy stays in bounds even when the assert
        // above is compiled out.
        len = data.length < 4 ? data.length : 4;
        sequence[0 .. len] = data[0 .. len];

        return this;
    }


But exception() contains this allocation:

return new UTFException(msg, i).setSequence(sequence[0 .. i]);


And UTFException contains:

class UTFException : Exception
{
    uint[4] sequence;
    size_t  len;
...

    /**
     * Creates the exception with `msg` extended by the index at which
     * decoding failed, e.g. "... (at index 2)".
     *
     * Params:
     *   msg   = description of the failure
     *   index = position in the input that the message reports
     *   file  = source file, defaults to the caller's `__FILE__`
     *   line  = source line, defaults to the caller's `__LINE__`
     *   next  = passed through to the base-class constructor
     */
    @safe pure
    this(string msg, size_t index, string file = __FILE__,
         size_t line = __LINE__, Throwable next = null)
    {
        import std.string;
        // Formatting and concatenation build the message before it is
        // passed to the base-class constructor.
        immutable suffix = format(" (at index %s)", index);
        super(msg ~ suffix, file, line, next);
    }



So decodeImpl (and consequently std.utf.decode and then find and then splitter)
can't be @nogc because it is able to give handy error messages with index like
this:

temp3.UTFException at temp3.d(120): Invalid UTF-8 sequence (at index 2)


The error message can be created by filling a local fixed-size array:


/**
 * Writes the decimal representation of `n` to the start of `buf` using only
 * a fixed-size local array as scratch space.
 *
 * Params:
 *   buf = destination; it must have room for every digit (at most 20 for a
 *         ulong), otherwise the slice assignment below is out of range
 *   n   = value to format
 * Returns: the number of characters written to `buf`.
 */
uint writeNumber(char[] buf, ulong n) nothrow pure @safe @nogc {
    char[30] aux; // A ulong can't be more than 20 digits long.
    size_t pos = aux.length;

    // Digits are produced least-significant first, so fill aux from the back.
    // The do-while always emits one digit, so n == 0 yields "0" without a
    // special case.
    do {
        --pos;
        aux[pos] = cast(char)('0' + n % 10);
        n /= 10;
    } while (n);

    // The count is at most 20, so narrowing to uint loses nothing. The cast
    // is explicit because size_t is ulong on 64-bit targets and D rejects the
    // implicit conversion there.
    immutable len = cast(uint)(aux.length - pos);
    buf[0 .. len] = aux[pos .. $];
    return len;
}
/// Demonstrates building the "(at index N)" error message in a stack buffer,
/// using the largest possible index as the worst case.
void main() nothrow pure @safe @nogc {
    static immutable prefix = "Invalid UTF-8 sequence (at index ";
    enum digitRoom = 30; // Space reserved after the prefix for the number and ')'.

    char[prefix.length + digitRoom] text;
    text[0 .. prefix.length] = prefix[];

    immutable ulong worstCase = ulong.max; // Biggest index.
    immutable digits = writeNumber(text[prefix.length .. $], worstCase);

    // Close the parenthesis right after the last digit.
    immutable end = prefix.length + digits;
    text[end] = ')';

    auto message = text[0 .. end + 1];
    assert(message == "Invalid UTF-8 sequence (at index 18446744073709551615)");
}


The remaining problem is the call to super(), i.e. the Exception constructor, which is
not @nogc; and since Exception isn't a template, @nogc can't be inferred for it.

--


More information about the Digitalmars-d-bugs mailing list