dmd foreach loops throw exceptions on invalid UTF sequences, use replacementDchar instead

jfondren julian.fondren at gmail.com
Thu Nov 4 14:52:56 UTC 2021


On Thursday, 4 November 2021 at 02:26:20 UTC, Walter Bright wrote:
> https://issues.dlang.org/show_bug.cgi?id=22473

This doesn't throw, actually:

```d
// Walks the string one element at a time without requesting a decoded type;
// per the post, this form does not throw on the invalid UTF-8 in the literal.
unittest {
     import std.stdio : writeln;
     // The octal escapes embed the raw bytes 0xA7 0x85 0xAF in the middle of the text.
     enum invalid = "hello\247\205\257there";

     // No element type is given, so `c` is a single code unit of the string
     // and no UTF decoding takes place; each one is printed as its byte value.
     foreach (c; invalid)
         writeln(cast(ubyte) c);
}
```

Which is per usual in D: working on `char` code units doesn't validate the UTF-8, while asking for `dchar`s does:

```d
@("std.utf.byUTF 2/3 (throwing)")
@safe unittest {
     import std.algorithm : count;
     import std.exception : assertNotThrown, assertThrown;
     import std.utf : byUTF, UseReplacementDchar, UTFException;

     // Both checks ask byUTF to throw rather than substitute U+FFFD.
     enum strict = UseReplacementDchar.no;

     string partial = "hello\247\205\257there";

     // Asking for char from a char string: the bad UTF-8 goes unnoticed.
     assertNotThrown!UTFException(partial.byUTF!(char, strict).count);

     // Asking for dchar forces decoding, which rejects the bad bytes.
     assertThrown!UTFException(partial.byUTF!(dchar, strict).count);
}
```

This does throw:

```d
// Iterates the string with the loop variable declared as dchar; per the post,
// this is the variant that throws on the invalid UTF-8 in the literal.
unittest {
     import std.stdio : writeln;
     // The octal escapes embed the raw bytes 0xA7 0x85 0xAF in the middle of the text.
     enum invalid = "hello\247\205\257there";

     // Requesting dchar elements from a char string makes foreach decode the
     // UTF-8 as it goes; each decoded value is printed as an integer.
     foreach (dchar c; invalid)
         writeln(cast(int) c);
}
```

But by asking for `dchar`s from an `immutable(char)[]` you're asking 
for some Unicode work to happen, so throwing is a reasonable 
default IMO. Emitting the replacement character is also a 
reasonable default, and objections to it in the thread can be answered 
the same way that objections to throwing can be: if you don't 
like it, iterate some other way:

```d
// throw on invalid UTF
unittest {
     import std.utf : byUTF, UseReplacementDchar, UTFException;

     enum invalid = "hello\247\205\257there";
     enum strict = UseReplacementDchar.no;

     int total;
     try {
         foreach (dchar ch; invalid.byUTF!(dchar, strict)) {
             total += cast(int) ch;
         }
         // Only reached if iteration runs to completion: 197667 is the total
         // when each of the three bad bytes comes out as U+FFFD
         // (532 for "hello" + 3 * 65533 + 536 for "there").
         assert(total == 197667);
     } catch (UTFException e) {
         // Iteration was cut short at the first bad byte: 532 is the sum of
         // the code points of "hello".
         assert(total == 532);
     }
}

// AssertError on invalid UTF
// (release behavior: "\247\205\257" is three dchars!)
unittest {
     import std.encoding : codePoints;
     import std.stdio : writeln;

     enum invalid = "hello\247\205\257there";

     // Obtain the code-point view first, then print every code point it
     // yields as an integer.
     auto points = invalid.codePoints;
     foreach (dchar point; points) {
         writeln(cast(int) point);
     }
}

// stop iterating on invalid UTF
unittest {
     import std.encoding : validLength;

     enum invalid = "hello\247\205\257there";

     // Leading part of the string, up to the length validLength reports.
     const validPrefix = invalid[0 .. invalid.validLength];

     char[] collected;
     foreach (dchar codePoint; validPrefix) {
         collected ~= codePoint;
     }

     // Only the text before the first bad byte was iterated.
     assert(collected == "hello");
}
```


More information about the Digitalmars-d mailing list