how to localize console and GUI apps in Windows
H. S. Teoh
hsteoh at quickfur.ath.cx
Thu Dec 28 18:45:39 UTC 2017
On Thu, Dec 28, 2017 at 05:56:32PM +0000, Andrei via Digitalmars-d-learn wrote:
> There is one everlasting problem writing Cyrillic programs in Windows:
> Microsoft consequently invented two much different code pages for
> Russia and other Cyrillic-alphabet countries: first was MSDOS-866 (and
> alike), second Windows-1251. Nowadays MS Windows uses first code page
> for console programs, second for GUI applications, and there always
> are many workarounds to get proper translation between them. Mostly a
> programmer should write program sources either in one code page for
> console and other for GUI, or use .NET, which basically uses UTF8 in
> sources and makes seamless translation depending on back end.
>
> In D language which uses only UTF8 for string encoding I cannot write
> neither MS866 code page program texts, nor Windows-1251 - both cases
> end in a compiler error like "Invalid trailing code unit" or "Outside
> Unicode code space". And writing Cyrillic strings in UTF8 format is
> fatal for both console and GUI Windows targets.
>
> My question is: is there any standard means to translate Cyrillic or
> any other localized UTF8 strings for console and GUI output in D
> libraries. If so - where I can get more information and good example.
> Google would not help.
[...]
The string / wstring / dstring types in D are intended to be Unicode
strings. If you need to use other encodings, you really should be using
ubyte[] or const(ubyte)[] or immutable(ubyte)[], instead of string.
One approach is to use UTF-8 in your code, and only translate to one of
the code pages when you need to produce output. I wrote a small module
for translating to/from KOI8-R when dealing with Russian text; you might
find it helpful:
-------------------------------------------------------------------------------
/**
* Module to convert between UTF and KOI8-R
*/
module koi8r;
import std.string;
import std.range;
/// Lookup table giving the KOI8-R byte for each code point in the contiguous
/// Unicode block U+0410 .. U+044F (А .. я); index with (code point - 0x410).
static immutable ubyte[0x450 - 0x410] utf2koi8r = [
    225, 226, 247, 231, 228, 229, 246, 250, // АБВГДЕЖЗ
    233, 234, 235, 236, 237, 238, 239, 240, // ИЙКЛМНОП
    242, 243, 244, 245, 230, 232, 227, 254, // РСТУФХЦЧ
    251, 253, 255, 249, 248, 252, 224, 241, // ШЩЪЫЬЭЮЯ
    193, 194, 215, 199, 196, 197, 214, 218, // абвгдежз
    201, 202, 203, 204, 205, 206, 207, 208, // ийклмноп
    210, 211, 212, 213, 198, 200, 195, 222, // рстуфхцч
    219, 221, 223, 217, 216, 220, 192, 209  // шщъыьэюя
];

/**
 * Translates a range of UTF characters into KOI8-R characters.
 *
 * The translation is lazy: each element is converted when `front` is read.
 * ASCII, the letters А..я, Ё/ё and the symbols '─', '≤', '≥' are supported;
 * the three symbols are included so that everything fromKOI8r can decode can
 * also be encoded again.
 *
 * Params:
 *   range = input range whose elements are implicitly convertible to dchar
 *
 * Returns: Range of KOI8-R characters (as ubyte). It is a forward range
 * whenever `R` is one.
 *
 * Throws: Exception (from `front`) when the current character has no
 * supported KOI8-R equivalent.
 */
auto toKOI8r(R)(R range)
    if (isInputRange!R && is(ElementType!R : dchar))
{
    static struct Result
    {
        R _range;

        @property bool empty() { return _range.empty; }

        @property ubyte front()
        {
            dchar ch = _range.front;

            // ASCII maps to itself.
            if (ch < 128)
                return cast(ubyte)ch;

            // Primary alphabetic range
            if (ch >= 0x410 && ch < 0x450)
                return utf2koi8r[ch - 0x410];

            // Special case: Ё and ё are outside the usual range.
            if (ch == 0x401) return 179;
            if (ch == 0x451) return 163;

            // Non-alphabetic symbols that fromKOI8r decodes; encode them too
            // so that a decode/encode round trip does not throw.
            if (ch == 0x2500) return 128; // ─
            if (ch == 0x2264) return 152; // ≤
            if (ch == 0x2265) return 153; // ≥

            throw new Exception(
                "Encoding error: unable to convert '%c' to KOI8-R".format(ch));
        }

        void popFront() { _range.popFront(); }

        static if (isForwardRange!R)
        {
            // Only offered when the wrapped range can itself be saved.
            @property Result save()
            {
                return Result(_range.save);
            }
        }
    }
    return Result(range);
}
// KOI8-R stores the lowercase letters in bytes 192..223 and the uppercase
// letters in bytes 224..255; encoding the letters in that order must yield
// exactly those consecutive byte values.
unittest
{
    import std.string;
    import std.algorithm : equal;

    string lowercase = "юабцдефгхийклмнопярстужвьызшэщчъ";
    string uppercase = "ЮАБЦДЕФГХИЙКЛМНОПЯРСТУЖВЬЫЗШЭЩЧЪ";

    assert(equal(lowercase.toKOI8r, iota(192, 224)));
    assert(equal(uppercase.toKOI8r, iota(224, 256)));
}
// Mixed ASCII / Cyrillic input: ASCII passes through unchanged, and the
// result is a forward range because the source string is one.
unittest
{
    import std.algorithm.comparison : equal;

    auto encoded = toKOI8r("abc абв");
    alias Encoded = typeof(encoded);
    static assert(isForwardRange!Encoded);

    auto expected = ['a', 'b', 'c', ' ', 193, 194, 215];
    assert(equal(encoded, expected));
}
/// Lookup table giving the Unicode character for each KOI8-R byte in
/// 192 .. 255; index with (byte - 192). Declared immutable so it cannot be
/// modified by accident and is not duplicated per thread.
static immutable dchar[0x100 - 0xC0] koi8r2utf = [
    'ю', 'а', 'б', 'ц', 'д', 'е', 'ф', 'г', // 192-199
    'х', 'и', 'й', 'к', 'л', 'м', 'н', 'о', // 200-207
    'п', 'я', 'р', 'с', 'т', 'у', 'ж', 'в', // 208-215
    'ь', 'ы', 'з', 'ш', 'э', 'щ', 'ч', 'ъ', // 216-223
    'Ю', 'А', 'Б', 'Ц', 'Д', 'Е', 'Ф', 'Г', // 224-231
    'Х', 'И', 'Й', 'К', 'Л', 'М', 'Н', 'О', // 232-239
    'П', 'Я', 'Р', 'С', 'Т', 'У', 'Ж', 'В', // 240-247
    'Ь', 'Ы', 'З', 'Ш', 'Э', 'Щ', 'Ч', 'Ъ'  // 248-255
];

/**
 * Translates a range of KOI8-R characters to UTF.
 *
 * The translation is lazy: each byte is converted when `front` is read.
 * Bytes below 128 are returned unchanged, bytes 192..255 are looked up in
 * koi8r2utf, and of the remaining bytes only 128, 152, 153, 163 and 179 are
 * handled so far.
 *
 * Params:
 *   range = input range whose elements are implicitly convertible to ubyte
 *
 * Returns: Range of UTF characters (as dchar). It is a forward range
 * whenever `R` is one.
 *
 * Throws: Exception (from `front`) for any byte in 128..191 that is not one
 * of the handled values listed above.
 */
auto fromKOI8r(R)(R range)
    if (isInputRange!R && is(ElementType!R : ubyte))
{
    static struct Result
    {
        R _range;

        @property bool empty() { return _range.empty; }

        @property dchar front()
        {
            ubyte b = _range.front;

            // ASCII maps to itself.
            if (b < 128) return b;

            // Alphabetic range, both cases.
            if (b >= 192)
                return koi8r2utf[b - 192];

            // 128 .. 191: only a handful of entries are implemented.
            switch (b)
            {
                case 128: return '─';
                case 152: return '≤';
                case 153: return '≥';
                case 163: return 'ё';
                case 179: return 'Ё';
                default:
                    import std.string : format;
                    throw new Exception(
                        "KOI8-R character %d not implemented yet".format(b));
            }
        }

        void popFront() { _range.popFront(); }

        static if (isForwardRange!R)
        {
            // Only offered when the wrapped range can itself be saved.
            @property Result save()
            {
                return Result(_range.save);
            }
        }
    }
    return Result(range);
}
// Decoding the KOI8-R bytes for the whole alphabet (including Ё/ё, which sit
// outside the main letter block) must give the letters in alphabetical order.
unittest
{
    import std.algorithm.comparison : equal;

    ubyte[] lowerBytes = [
        193, 194, 215, 199, 196, 197, 163, 214,
        218, 201, 202, 203, 204, 205, 206, 207,
        208, 210, 211, 212, 213, 198, 200, 195,
        222, 219, 221, 223, 217, 216, 220, 192,
        209
    ];
    ubyte[] upperBytes = [
        225, 226, 247, 231, 228, 229, 179, 246,
        250, 233, 234, 235, 236, 237, 238, 239,
        240, 242, 243, 244, 245, 230, 232, 227,
        254, 251, 253, 255, 249, 248, 252, 224,
        241
    ];

    assert(equal(fromKOI8r(lowerBytes), "абвгдеёжзийклмнопрстуфхцчшщъыьэюя"));
    assert(equal(fromKOI8r(upperBytes), "АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ"));
}
-------------------------------------------------------------------------------
As the unittests show, you just call toKOI8r or fromKOI8r to translate
between encodings. All non-Unicode strings are treated as ubyte[], so
that you won't accidentally mix up a Unicode string with a KOI8-R string.
And the code should be straightforward enough to be adapted for other
encodings as well.
Hope this helps.
T
--
For every argument for something, there is always an equal and opposite argument against it. Debates don't give answers, only wounded or inflated egos.
More information about the Digitalmars-d-learn
mailing list