mixin template's alias parameter ... ignored ?

someone someone at somewhere.com
Mon Jul 12 01:37:37 UTC 2021


On Sunday, 11 July 2021 at 05:54:48 UTC, Ali Çehreli wrote:

> Ali

Primarily to Ali & Steve for their help: be advised, this post 
will be somewhat ... long.

Some background to begin with: a week or so ago I posted 
asking for advice on code safety, and I still haven't replied to the 
ones that kindly answered. Seeing some replies, and encountering 
a code issue regarding string manipulation, I pretty soon figured 
out that I still did not have solid knowledge on many basic 
things regarding D, so I put the brakes on, and went to square 
one and started reading and researching some things a bit more 
... slowly.

One of the things that struck me this week is that UniCode string 
manipulation in many cases is more complex than I previously 
thought, because there is no precise concept of what a 
character is in UniCode, at least, not the way we are used to with 
plain-old-ASCII. After reading a lot about it (this was good: 
https://manishearth.github.io/blog/2017/01/14/stop-ascribing-meaning-to-unicode-code-points/) I learned of code-units, code-points, abstract-graphemes, grapheme-clusters, and the like.

And I learned the inner details of the UTF encodings and that 
UTF-32 is best (almost required) for string processing (easier, 
faster, etc) and of course UTF-8 for definitive storage, and 
UTF-16 to the trashcan unless you need to interface with Windows 
(I was previously using UTF-8 within all my code for processing).

So, in order to manipulate a string, say, left(n), right(n), 
substr(n,m), ie: the usual stuff for many languages/libraries, I 
need to operate on grapheme-clusters and not on code-points and 
never ever on code-units, at least, for unexpected text, ie: 
incoming text, user-input, etc, the things that we can not 
control beforehand.

Both primary D books, Andrei's and Ali's, as well as the D 
documentation, have plenty of examples but they are mainly 
focused on simple things like strings having 
nothing-out-of-the-ordinary. They perform string manipulation 
mainly slicing the source string (ie: the char array) with the 
functions of std.range like take, takeOne, etc.

I needed to settle these things once-and-for-all for my code and thus 
I decided to build a grapheme-aware UDT that once instantiated 
with any given string will provide the usual string manipulation 
functions so I can forget the minutiae about them. The unittest 
at the bottom has many usage examples.

The whole UDT needed to be templated for the three string types 
(string, dstring, wstring -and nothing else) and this was what 
produced this post to begin with. This issue was solved, not the 
way I would have liked, but solved. The code works, except for something 
that smells like a phobos bug (# 20483) when using foreach with 
grapheme arrays (foreach always misses the last one).

I ended up with the following (as usual advice/suggestions 
welcomed):

```d
/// testing D on 2021-06~07

import std.algorithm : map, joiner;
import std.array : array;
import std.conv : to;
import std.range : walkLength, take, tail, drop, dropBack;
import std.stdio;
import std.uni : Grapheme, byGrapheme;

/// naming scheme: UGC = UniCode grapheme cluster, UTF## = UTF‐encoded string type;
/// stringUGC## are the grapheme‐aware wrappers instantiated for each string type
alias stringUGC = Grapheme;
alias stringUGC08 = gudtUGC!(stringUTF08);
alias stringUGC16 = gudtUGC!(stringUTF16);
alias stringUGC32 = gudtUGC!(stringUTF32);
alias stringUTF08 = string;  /// same as immutable(char )[];
/// NOTE(review): in D dstring (dchar) holds UTF‐32 code units and wstring (wchar) holds UTF‐16
/// code units, so the two aliases below look swapped with respect to their names; the unittest
/// in this file pairs stringUTF16 with "…"d literals and stringUTF32 with "…"w literals, so any
/// renaming has to be done in both places at once
alias stringUTF16 = dstring; /// same as immutable(dchar)[];
alias stringUTF32 = wstring; /// same as immutable(wchar)[];

/// intentionally empty: the file is exercised through its unittest block
void main() {}

//mixin templateUGC!(stringUTF08, r"gudtUGC08"w); /// if these were possible
// there would be no need for the stringUGC## aliases in main()
//mixin templateUGC!(stringUTF16, r"gudtUGC16"w);
//mixin templateUGC!(stringUTF32, r"gudtUGC32"w);

//template templateUGC (
//   typeStringUTF,
//   alias lstrStructureID
//   ) {

public struct gudtUGC(typeStringUTF) { /// UniCode grapheme cluster‐aware string manipulation

    /// the source string is decoded once into an array of grapheme clusters; every
    /// toUTF…() function then addresses whole clusters (1‐based positions) and re‐encodes
    /// the selected clusters to typeStringUTF; invalid requests yield null instead of throwing

    /// input‐range primitives (so the object can be used with foreach): the cursor is 1‐based,
    /// hence the range is exhausted only once the cursor has moved PAST the last cluster;
    /// comparing with == against the count would drop the last cluster and would never
    /// terminate for an empty (but non‐null) string, whose count is 0 and whose cursor is 1
    void popFront() { ++pintSequenceCurrent; }
    bool empty() {
       return pintSequenceCurrent < cast(size_t) 1
          || pintSequenceCurrent > pintSequenceCount;
    }
    typeStringUTF front() { return toUTFtake(pintSequenceCurrent); }

    private Grapheme[] pugcSequence; /// decoded clusters (Grapheme is what the module‐level stringUGC alias names)
    private size_t pintSequenceCount = cast(size_t) 0;   /// number of clusters held
    private size_t pintSequenceCurrent = cast(size_t) 0; /// 1‐based range cursor; 0 = nothing decoded

    /// number of grapheme clusters held
    @property public size_t count() { return pintSequenceCount; }

    /// builds the object by decoding the given string; see decode()
    this(scope const typeStringUTF lstrSequence) {

       decode(lstrSequence);

    }

    /// (re)loads the object from a UTF‐encoded string and resets the range cursor
    /// returns: the number of grapheme clusters found (0 for a null string)
    @safe public size_t decode(
       scope const typeStringUTF lstrSequence
       ) {

       if (lstrSequence is null) {

          pugcSequence = null;
          pintSequenceCount = cast(size_t) 0;
          pintSequenceCurrent = cast(size_t) 0;

          return cast(size_t) 0;

       }

       pugcSequence = lstrSequence.byGrapheme.array;
       pintSequenceCount = pugcSequence.length; /// it is an array: no need to walk it
       pintSequenceCurrent = cast(size_t) 1;

       return pintSequenceCount;

    }

    /// re‐encodes a run of clusters to a UTF‐encoded string; shared by all the functions below
    /// (the lambda takes its argument by ref so that g[] refers to the stored cluster, not to a copy)
    @safe private typeStringUTF toUTF(
       Grapheme[] lugcSequence
       ) {

       return lugcSequence
          .map!((ref g) => g[])
          .joiner
          .to!(typeStringUTF)
          ;

    }

    /// whole sequence; returns null when the object holds no clusters
    @safe public typeStringUTF encode() { /// UniCode grapheme cluster to UniCode UTF‐encoded string

       if (pintSequenceCount < cast(size_t) 1) { return null; }

       return toUTF(pugcSequence);

    }

    /// lintCount clusters starting at the 1‐based position lintStart (like substr);
    /// returns null when lintStart is 0 or the requested run does not fit in the sequence
    @safe public typeStringUTF toUTFtake( /// UniCode grapheme cluster to UniCode UTF‐encoded string
       scope const size_t lintStart,
       scope const size_t lintCount = cast(size_t) 1
       ) {

       /// eg#1: toUTFtake(1,3) → range#1=start-1=1-1=0 and range#2=range#1+count=0+3=3 → 0..3
       /// eg#1: toUTFtake(6,3) → range#1=start-1=6-1=5 and range#2=range#1+count=5+3=8 → 5..8

       /// eg#2: toUTFtake(01,1) → range#1=start-1=01-1=00 and range#2=range#1+count=00+1=01 → 00..01
       /// eg#2: toUTFtake(50,1) → range#1=start-1=50-1=49 and range#2=range#1+count=49+1=50 → 49..50

       /// size_t is unsigned: start-1 would wrap around for start=0, so reject it up‐front
       if (lintStart < cast(size_t) 1) { return null; }

       const size_t lintRange1 = lintStart - cast(size_t) 1;

       /// written as subtractions so that neither comparison can overflow
       if (lintRange1 > pintSequenceCount) { return null; }
       if (lintCount > pintSequenceCount - lintRange1) { return null; }

       return toUTF(pugcSequence[lintRange1 .. lintRange1 + lintCount]);

    }

    /// leftmost lintCount clusters (like left); null when lintCount exceeds the sequence
    @safe public typeStringUTF toUTFtakeL( /// UniCode grapheme cluster to UniCode UTF‐encoded string
       scope const size_t lintCount
       ) {

       if (lintCount > pintSequenceCount) { return null; }

       return toUTF(pugcSequence[0 .. lintCount]);

    }

    /// rightmost lintCount clusters (like right); null when lintCount exceeds the sequence
    @safe public typeStringUTF toUTFtakeR( /// UniCode grapheme cluster to UniCode UTF‐encoded string
       scope const size_t lintCount
       ) {

       if (lintCount > pintSequenceCount) { return null; }

       return toUTF(pugcSequence[$ - lintCount .. $]);

    }

    /// everything but the leftmost lintCount clusters; null when lintCount exceeds the sequence
    @safe public typeStringUTF toUTFchopL( /// UniCode grapheme cluster to UniCode UTF‐encoded string
       scope const size_t lintCount
       ) {

       if (lintCount > pintSequenceCount) { return null; }

       return toUTF(pugcSequence[lintCount .. $]);

    }

    /// everything but the rightmost lintCount clusters; null when lintCount exceeds the sequence
    @safe public typeStringUTF toUTFchopR( /// UniCode grapheme cluster to UniCode UTF‐encoded string
       scope const size_t lintCount
       ) {

       if (lintCount > pintSequenceCount) { return null; }

       return toUTF(pugcSequence[0 .. $ - lintCount]);

    }

    /// pending: left padding is not implemented yet, null is returned for every input
    @safe public typeStringUTF toUTFpadL( /// UniCode grapheme cluster to UniCode UTF‐encoded string
       scope const size_t lintCount,
       scope const typeStringUTF lstrPadding = cast(typeStringUTF) r" "
       ) {

       return null; /// pending

    }

    /// pending: right padding is not implemented yet, null is returned for every input
    @safe public typeStringUTF toUTFpadR( /// UniCode grapheme cluster to UniCode UTF‐encoded string
       scope const size_t lintCount,
       scope const typeStringUTF lstrPadding = cast(typeStringUTF) r" "
       ) {

       return null; /// pending

    }

    /* pending (untested sketch): a set of functions returning a new object instead of a
       string, to enable one‐liner constructions such as:
       assert(lugcSequence3.take(35, 3).take(1, 2).take(1, 1).encode() == cast(stringUTF) r"日");

       the first attempt failed with "function declaration without return type" and
       "no identifier for declarator `@safe gudtUGC(typeStringUTF)`" because
       gudtUGC(typeStringUTF) is not a type: a template is instantiated with `!`,
       ie: gudtUGC!(typeStringUTF), and inside the struct body typeof(this) already
       names the current instantiation:

    @safe public typeof(this) take(
       scope const size_t lintStart,
       scope const size_t lintCount = cast(size_t) 1
       ) {

       /// an out‐of‐range request gives a null string and thus an empty object
       return typeof(this)(toUTFtake(lintStart, lintCount));

    }*/

}

//}

unittest {

    /// select the encoding under test with -version=useUTF08 | useUTF16 | useUTF32;
    /// without any of these flags the test falls back to UTF‐8 so that it always compiles
    version (useUTF16) { alias stringUTF = stringUTF16; alias typeUGC = stringUGC16; }
    else version (useUTF32) { alias stringUTF = stringUTF32; alias typeUGC = stringUGC32; }
    else { alias stringUTF = stringUTF08; alias typeUGC = stringUGC08; }

    /// unsuffixed literals convert implicitly to whichever string type was selected above;
    /// each literal must stay on a single line: a line break inside the quotes becomes part of the string
    scope stringUTF lstrSequence1 = r"12345678901234567890123456789012345678901234567890";
    scope stringUTF lstrSequence2 = r"1234567890АВГДЕЗИЙКЛABCDEFGHIJabcdefghijQRSTUVWXYZ";
    scope stringUTF lstrSequence3 = "äëåčñœß … russian = русский 🇷🇺 ≠ 🇯🇵 日本語 = japanese 😎";

    /// expected sizes in code‐units: these depend on the actual character type and not on
    /// the name of the version flag, so they are keyed on the type itself
    static if (is(stringUTF == string)) { /// char: UTF‐8 code‐units
       enum size_t lintExpected1sizeUTF = 50;
       enum size_t lintExpected2sizeUTF = 60;
       enum size_t lintExpected3sizeUTF = 91;
    } else static if (is(stringUTF == wstring)) { /// wchar: UTF‐16 code‐units
       enum size_t lintExpected1sizeUTF = 50;
       enum size_t lintExpected2sizeUTF = 50;
       enum size_t lintExpected3sizeUTF = 57;
    } else { /// dchar: UTF‐32 code‐units, ie: one per code‐point
       enum size_t lintExpected1sizeUTF = 50;
       enum size_t lintExpected2sizeUTF = 50;
       enum size_t lintExpected3sizeUTF = 52;
    }

    /// sizeUTF = code‐units, sizeUGA = code‐points (walkLength decodes), sizeUGC = grapheme clusters

    scope size_t lintSequence1sizeUTF = lstrSequence1.length;
    scope size_t lintSequence2sizeUTF = lstrSequence2.length;
    scope size_t lintSequence3sizeUTF = lstrSequence3.length;

    scope size_t lintSequence1sizeUGA = lstrSequence1.walkLength;
    scope size_t lintSequence2sizeUGA = lstrSequence2.walkLength;
    scope size_t lintSequence3sizeUGA = lstrSequence3.walkLength;

    scope size_t lintSequence1sizeUGC = lstrSequence1.byGrapheme.walkLength;
    scope size_t lintSequence2sizeUGC = lstrSequence2.byGrapheme.walkLength;
    scope size_t lintSequence3sizeUGC = lstrSequence3.byGrapheme.walkLength;

    assert(lintSequence1sizeUGC == cast(size_t) 50);
    assert(lintSequence2sizeUGC == cast(size_t) 50);
    assert(lintSequence3sizeUGC == cast(size_t) 50);

    assert(lintSequence1sizeUGA == cast(size_t) 50);
    assert(lintSequence2sizeUGA == cast(size_t) 50);
    assert(lintSequence3sizeUGA == cast(size_t) 52); /// each flag is two code‐points but one cluster

    assert(lintSequence1sizeUTF == lintExpected1sizeUTF);
    assert(lintSequence2sizeUTF == lintExpected2sizeUTF);
    assert(lintSequence3sizeUTF == lintExpected3sizeUTF);

    /// the following should be the same regardless of the encoding being used
    /// and is the whole point of this UDT being made:

    scope typeUGC lugcSequence3 = typeUGC(lstrSequence3);

    assert(lugcSequence3.encode() == lstrSequence3);

    assert(lugcSequence3.toUTFtake(21) == cast(stringUTF) r"р");
    assert(lugcSequence3.toUTFtake(27) == cast(stringUTF) r"й");
    assert(lugcSequence3.toUTFtake(35) == cast(stringUTF) r"日");
    assert(lugcSequence3.toUTFtake(37) == cast(stringUTF) r"語");
    assert(lugcSequence3.toUTFtake(21, 7) == cast(stringUTF) r"русский");
    assert(lugcSequence3.toUTFtake(35, 3) == cast(stringUTF) r"日本語");

    assert(lugcSequence3.toUTFtakeL(1) == cast(stringUTF) r"ä");
    assert(lugcSequence3.toUTFtakeR(1) == cast(stringUTF) r"😎");
    assert(lugcSequence3.toUTFtakeL(7) == cast(stringUTF) r"äëåčñœß");
    assert(lugcSequence3.toUTFtakeR(16) == cast(stringUTF) r"日本語 = japanese 😎");

    assert(lugcSequence3.toUTFchopL(10) == cast(stringUTF) r"russian = русский 🇷🇺 ≠ 🇯🇵 日本語 = japanese 😎");
    assert(lugcSequence3.toUTFchopR(21) == cast(stringUTF) r"äëåčñœß … russian = русский 🇷🇺");

    /// round‐trip #1: re‐assemble the string cluster by cluster using 1‐based positions

    scope stringUTF lstrSequence3reencoded;

    for (
       size_t lintSequenceUGC = cast(size_t) 1;
       lintSequenceUGC <= lintSequence3sizeUGC;
       ++lintSequenceUGC
       ) {

       lstrSequence3reencoded ~= lugcSequence3.toUTFtake(lintSequenceUGC);

    }

    assert(lstrSequence3reencoded == lstrSequence3);

    /// round‐trip #2: the same through the range primitives (empty/front/popFront)

    lstrSequence3reencoded = null;

    foreach (stringUTF lstrSequence3UGC; lugcSequence3) {

       lstrSequence3reencoded ~= lstrSequence3UGC;

    }

    /// this one holds only if empty() reports exhaustion AFTER the last (1‐based) cluster was
    /// yielded; an empty() that compares the cursor with == against the count drops the last
    /// cluster, which is what first looked like phobos issue # 20483
    assert(lstrSequence3reencoded == lstrSequence3);

}
```


More information about the Digitalmars-d-learn mailing list