how to localize console and GUI apps in Windows

Thu Dec 28 18:45:39 UTC 2017

On Thu, Dec 28, 2017 at 05:56:32PM +0000, Andrei via Digitalmars-d-learn wrote:
> There is one everlasting problem writing Cyrillic programs in Windows:
> Microsoft successively invented two very different code pages for
> Russia and other Cyrillic-alphabet countries: the first was MSDOS-866
> (and alike), the second Windows-1251. Nowadays MS Windows uses first code page
> for console programs, second for GUI applications, and there always
> are many workarounds to get proper translation between them. Mostly a
> programmer should write program sources either in one code page for
> console and other for GUI, or use .NET, which basically uses UTF8 in
> sources and makes seamless translation depending on back end.
> 
> In the D language, which uses only UTF8 for string encoding, I can write
> neither MS866 code page program texts nor Windows-1251 ones - both cases
> end in a compiler error like "Invalid trailing code unit" or "Outside
> Unicode code space". And writing Cyrillic strings in UTF8 format is
> fatal for both console and GUI Windows targets.
> 
> My question is: is there any standard means to translate Cyrillic or
> any other localized UTF8 strings for console and GUI output in D
> libraries? If so, where can I get more information and a good example?
> Google did not help.
[...]

The string / wstring / dstring types in D are intended to be Unicode
strings.  If you need to use other encodings, you really should be using
ubyte[] or const(ubyte)[] or immutable(ubyte)[], instead of string.

One approach is to use UTF-8 in your code, and only translate to one of
the code pages when you need to produce output.  I wrote a small module
for translating to/from KOI8-R when dealing with Russian text; you might
find it helpful:

-------------------------------------------------------------------------------
/**
 * Module to convert between UTF and KOI8-R
 */
module koi8r;

import std.string;
import std.range;

/**
 * KOI8-R byte for each code point in the contiguous Cyrillic block
 * U+0410 (А) .. U+044F (я), indexed by (code point - 0x410).
 */
static immutable ubyte[0x450 - 0x410] utf2koi8r = [
    225, 226, 247, 231, 228, 229, 246, 250, // АБВГДЕЖЗ
    233, 234, 235, 236, 237, 238, 239, 240, // ИЙКЛМНОП
    242, 243, 244, 245, 230, 232, 227, 254, // РСТУФХЦЧ
    251, 253, 255, 249, 248, 252, 224, 241, // ШЩЪЫЬЭЮЯ
    193, 194, 215, 199, 196, 197, 214, 218, // абвгдежз
    201, 202, 203, 204, 205, 206, 207, 208, // ийклмноп
    210, 211, 212, 213, 198, 200, 195, 222, // рстуфхцч
    219, 221, 223, 217, 216, 220, 192, 209  // шщъыьэюя
];

/**
 * Unicode code points of KOI8-R bytes 0x80 .. 0xBF (box drawing, block
 * elements, a few math/typographic symbols, plus ё and Ё), indexed by
 * (byte - 0x80).  Values follow the KOI8-R definition in RFC 1489.
 *
 * The encoder searches this table for characters that are neither ASCII nor
 * in the contiguous А..я block.
 */
private static immutable dchar[0xC0 - 0x80] koi8rHighCodePoints = [
    '\u2500', '\u2502', '\u250C', '\u2510', '\u2514', '\u2518', '\u251C', '\u2524', // 128-135: ─ │ ┌ ┐ └ ┘ ├ ┤
    '\u252C', '\u2534', '\u253C', '\u2580', '\u2584', '\u2588', '\u258C', '\u2590', // 136-143: ┬ ┴ ┼ ▀ ▄ █ ▌ ▐
    '\u2591', '\u2592', '\u2593', '\u2320', '\u25A0', '\u2219', '\u221A', '\u2248', // 144-151: ░ ▒ ▓ ⌠ ■ ∙ √ ≈
    '\u2264', '\u2265', '\u00A0', '\u2321', '\u00B0', '\u00B2', '\u00B7', '\u00F7', // 152-159: ≤ ≥ NBSP ⌡ ° ² · ÷
    '\u2550', '\u2551', '\u2552', '\u0451', '\u2553', '\u2554', '\u2555', '\u2556', // 160-167: ═ ║ ╒ ё ╓ ╔ ╕ ╖
    '\u2557', '\u2558', '\u2559', '\u255A', '\u255B', '\u255C', '\u255D', '\u255E', // 168-175: ╗ ╘ ╙ ╚ ╛ ╜ ╝ ╞
    '\u255F', '\u2560', '\u2561', '\u0401', '\u2562', '\u2563', '\u2564', '\u2565', // 176-183: ╟ ╠ ╡ Ё ╢ ╣ ╤ ╥
    '\u2566', '\u2567', '\u2568', '\u2569', '\u256A', '\u256B', '\u256C', '\u00A9'  // 184-191: ╦ ╧ ╨ ╩ ╪ ╫ ╬ ©
];

/**
 * Translates a range of UTF characters into KOI8-R characters.
 *
 * The translation is lazy: each element is converted when `front` is read.
 * ASCII, the Cyrillic letters А..я, Ё/ё and every non-letter symbol that
 * KOI8-R defines (box drawing, ≤, ≥, ©, ...) are supported.
 *
 * Params:
 *      range = input range whose elements convert to dchar.
 *
 * Returns: Range of KOI8-R characters (as ubyte).  It is a forward range
 *      whenever `R` is one.
 *
 * Throws: Exception from `front` when the current character has no KOI8-R
 *      representation.
 */
auto toKOI8r(R)(R range)
    if (isInputRange!R && is(ElementType!R : dchar))
{
    static struct Result
    {
        R _range;

        @property bool empty() { return _range.empty; }

        @property ubyte front()
        {
            dchar ch = _range.front;

            // ASCII maps to itself.
            if (ch < 128)
                return cast(ubyte)ch;

            // Primary alphabetic range: direct table lookup.
            if (ch >= 0x410 && ch < 0x450)
                return utf2koi8r[ch - 0x410];

            // Special case: Ё and ё are outside the usual range.  They are
            // also in the symbol table below; checking them first keeps
            // ordinary Russian text off the linear search.
            if (ch == 0x401) return 179;
            if (ch == 0x451) return 163;

            // Remaining KOI8-R symbols (bytes 0x80 .. 0xBF).  The table has
            // only 64 entries, so a linear scan is cheap.
            foreach (i, sym; koi8rHighCodePoints)
            {
                if (sym == ch)
                    return cast(ubyte)(0x80 + i);
            }

            throw new Exception(
                "Encoding error: unable to convert '%c' to KOI8-R".format(ch));
        }

        void popFront() { _range.popFront(); }

        static if (isForwardRange!R)
        {
            // Only offered when the wrapped range can itself be saved.
            @property Result save()
            {
                Result copy;
                copy._range = _range.save;
                return copy;
            }
        }
    }
    return Result(range);
}

unittest
{
    import std.algorithm : equal;

    // Listing the letters in KOI8-R byte order must yield consecutive byte
    // values: 192..223 for lowercase and 224..255 for uppercase.
    string lowercase = "юабцдефгхийклмнопярстужвьызшэщчъ";
    string uppercase = "ЮАБЦДЕФГХИЙКЛМНОПЯРСТУЖВЬЫЗШЭЩЧЪ";

    assert(equal(toKOI8r(lowercase), iota(192, 224)));
    assert(equal(toKOI8r(uppercase), iota(224, 256)));
}

unittest
{
    import std.algorithm.comparison : equal;

    // Mixed input: ASCII passes through unchanged, Cyrillic is remapped.
    auto encoded = toKOI8r("abc абв");

    // A string is a forward range, so the wrapper must offer save() too.
    static assert(isForwardRange!(typeof(encoded)));

    assert(equal(encoded, ['a', 'b', 'c', ' ', 193, 194, 215]));
}

/**
 * Unicode code points of KOI8-R bytes 0xC0 .. 0xFF (the Cyrillic letters
 * except ё/Ё), indexed by (byte - 0xC0).
 *
 * Declared immutable so that the lookup data cannot be modified at run time,
 * matching the `utf2koi8r` table used by the encoder.
 */
static immutable dchar[0x100 - 0xC0] koi8r2utf = [
    'ю', 'а', 'б', 'ц', 'д', 'е', 'ф', 'г', // 192-199
    'х', 'и', 'й', 'к', 'л', 'м', 'н', 'о', // 200-207
    'п', 'я', 'р', 'с', 'т', 'у', 'ж', 'в', // 208-215
    'ь', 'ы', 'з', 'ш', 'э', 'щ', 'ч', 'ъ', // 216-223
    'Ю', 'А', 'Б', 'Ц', 'Д', 'Е', 'Ф', 'Г', // 224-231
    'Х', 'И', 'Й', 'К', 'Л', 'М', 'Н', 'О', // 232-239
    'П', 'Я', 'Р', 'С', 'Т', 'У', 'Ж', 'В', // 240-247
    'Ь', 'Ы', 'З', 'Ш', 'Э', 'Щ', 'Ч', 'Ъ'  // 248-255
];

/**
 * Unicode code points of KOI8-R bytes 0x80 .. 0xBF (box drawing, block
 * elements, a few math/typographic symbols, plus ё and Ё), indexed by
 * (byte - 0x80).  Values follow the KOI8-R definition in RFC 1489.
 */
private static immutable dchar[0xC0 - 0x80] koi8rSymbols2utf = [
    '\u2500', '\u2502', '\u250C', '\u2510', '\u2514', '\u2518', '\u251C', '\u2524', // 128-135: ─ │ ┌ ┐ └ ┘ ├ ┤
    '\u252C', '\u2534', '\u253C', '\u2580', '\u2584', '\u2588', '\u258C', '\u2590', // 136-143: ┬ ┴ ┼ ▀ ▄ █ ▌ ▐
    '\u2591', '\u2592', '\u2593', '\u2320', '\u25A0', '\u2219', '\u221A', '\u2248', // 144-151: ░ ▒ ▓ ⌠ ■ ∙ √ ≈
    '\u2264', '\u2265', '\u00A0', '\u2321', '\u00B0', '\u00B2', '\u00B7', '\u00F7', // 152-159: ≤ ≥ NBSP ⌡ ° ² · ÷
    '\u2550', '\u2551', '\u2552', '\u0451', '\u2553', '\u2554', '\u2555', '\u2556', // 160-167: ═ ║ ╒ ё ╓ ╔ ╕ ╖
    '\u2557', '\u2558', '\u2559', '\u255A', '\u255B', '\u255C', '\u255D', '\u255E', // 168-175: ╗ ╘ ╙ ╚ ╛ ╜ ╝ ╞
    '\u255F', '\u2560', '\u2561', '\u0401', '\u2562', '\u2563', '\u2564', '\u2565', // 176-183: ╟ ╠ ╡ Ё ╢ ╣ ╤ ╥
    '\u2566', '\u2567', '\u2568', '\u2569', '\u256A', '\u256B', '\u256C', '\u00A9'  // 184-191: ╦ ╧ ╨ ╩ ╪ ╫ ╬ ©
];

/**
 * Translates a range of KOI8-R characters to UTF.
 *
 * The translation is lazy: each byte is converted when `front` is read.
 * Every one of the 256 byte values has a mapping, so decoding itself never
 * fails.
 *
 * Params:
 *      range = input range whose elements convert to ubyte.
 *
 * Returns: Range of UTF characters (as dchar).  It is a forward range
 *      whenever `R` is one.
 */
auto fromKOI8r(R)(R range)
    if (isInputRange!R && is(ElementType!R : ubyte))
{
    static struct Result
    {
        R _range;

        @property bool empty() { return _range.empty; }

        @property dchar front()
        {
            ubyte b = _range.front;

            // ASCII maps to itself.
            if (b < 0x80)
                return b;

            // Upper quarter: Cyrillic letters.
            if (b >= 0xC0)
                return koi8r2utf[b - 0xC0];

            // 0x80 .. 0xBF: symbols, ё and Ё.
            return koi8rSymbols2utf[b - 0x80];
        }

        void popFront() { _range.popFront(); }

        static if (isForwardRange!R)
        {
            // Only offered when the wrapped range can itself be saved.
            @property Result save()
            {
                Result copy;
                copy._range = _range.save;
                return copy;
            }
        }
    }
    return Result(range);
}

unittest
{
    import std.algorithm.comparison : equal;

    // KOI8-R bytes for the lowercase alphabet in dictionary order
    // (ё included, right after е).
    ubyte[] lowercaseBytes = [
        193, 194, 215, 199, 196, 197, 163, 214, 218, 201, 202, // абвгдеёжзий
        203, 204, 205, 206, 207, 208, 210, 211, 212, 213, 198, // клмнопрстуф
        200, 195, 222, 219, 221, 223, 217, 216, 220, 192, 209  // хцчшщъыьэюя
    ];
    assert(equal(fromKOI8r(lowercaseBytes),
                 "абвгдеёжзийклмнопрстуфхцчшщъыьэюя"));

    // Same check for the uppercase alphabet.
    ubyte[] uppercaseBytes = [
        225, 226, 247, 231, 228, 229, 179, 246, 250, 233, 234, // АБВГДЕЁЖЗИЙ
        235, 236, 237, 238, 239, 240, 242, 243, 244, 245, 230, // КЛМНОПРСТУФ
        232, 227, 254, 251, 253, 255, 249, 248, 252, 224, 241  // ХЦЧШЩЪЫЬЭЮЯ
    ];
    assert(equal(fromKOI8r(uppercaseBytes),
                 "АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ"));
}
-------------------------------------------------------------------------------

As the unittests show, you just call toKOI8r or fromKOI8r to translate
between encodings.  All non-Unicode strings are treated as ubyte[], so
that you won't accidentally mix up a Unicode string with a KOI8-R string.

And the code should be straightforward enough to be adapted for other
encodings as well.

Hope this helps.

T

-- 
For every argument for something, there is always an equal and opposite argument against it. Debates don't give answers, only wounded or inflated egos.