Accessing non-binary Unicode properties with std.uni

Tue Sep 29 17:04:51 UTC 2020

On Tue, Sep 29, 2020 at 04:22:18PM +0000, Dukc via Digitalmars-d-learn wrote:
> On Monday, 28 September 2020 at 18:23:43 UTC, Chloé Kekoa wrote:
> > The documentation of std.uni [1] says that the unicode struct
> > provides sets for several binary properties. I am looking for a way
> > to query non-binary properties of a character. Is that possible with
> > std.uni or do I need to use a third-party library?
> > 
> > I am specifically interested in the East_Asian_Width property [2]
> > (which has six allowed values). Trying to access
> > std.uni.unicode.East_Asian_Width results in the error message:
> > 
> > > No unicode set by name East_Asian_Width was found.
> > 
> > [1]: https://dlang.org/library/std/uni.html
> > [2]: https://www.unicode.org/reports/tr11/tr11-38.html
> 
> It seems the East Asian width is Unicode standard 13.0, while Phobos
> implements 6.2. So it seems like a case for a third-party library :(.
[...]

OTOH, the relevant Unicode data file that contains East_Asian_Width data
(EastAsianWidth.txt) is relatively straightforward to parse.  In one of
my projects, I wrote a little helper program to parse this file and
generate a function that tells me if a given dchar is wide or narrow.

Here's the generated function (just copy-n-paste this into your code, no
need for yet another external library dependency):

	/**
	 * Reports whether a code point falls inside one of the hard-coded
	 * "wide" intervals listed below.
	 *
	 * Params:
	 *     ch = the code point to classify
	 *
	 * Returns: true if ch lies in any of the half-open [lo, hi) intervals of
	 * the table, false otherwise.
	 */
	bool isWide(dchar ch) @safe pure nothrow @nogc
	{
	    // Half-open [lo, hi) intervals, sorted in ascending order and
	    // non-overlapping.
	    static immutable uint[2][16] wideSpans = [
	        [  4352,   4448],
	        [  9001,   9003],
	        [ 11904,  12351],
	        [ 12353,  12872],
	        [ 12880,  19904],
	        [ 19968,  42183],
	        [ 43360,  43389],
	        [ 44032,  55204],
	        [ 63744,  64256],
	        [ 65040,  65050],
	        [ 65072,  65132],
	        [ 65281,  65377],
	        [ 65504,  65511],
	        [110592, 110594],
	        [127488, 127570],
	        [131072, 262142],
	    ];

	    foreach (ref span; wideSpans)
	    {
	        // The table is sorted, so once ch is below an interval's lower
	        // bound no later interval can contain it.
	        if (ch < span[0])
	            return false;
	        if (ch < span[1])
	            return true;
	    }
	    return false;
	}

Here's the utility that generated this code:

	/**
	 * Simple program to parse EastAsianWidth.txt to extract some useful info.
	 */

	import std.algorithm;
	import std.conv;
	import std.range;
	import std.regex;
	import std.stdio;

	/**
	 * A half-open interval [start, end) of code points: `start` is the first
	 * code point covered and `end` is one past the last (toString prints
	 * `end-1` as the upper bound).
	 */
	struct CodeRange
	{
	    dchar start, end;

	    /**
	     * Tests whether this range shares code points with cr or directly
	     * abuts it, i.e. whether merging the two yields one contiguous range.
	     *
	     * The test is symmetric and also covers the case where one range
	     * completely contains the other.
	     *
	     * Params:
	     *     cr = the range to compare against
	     *
	     * Returns: true if the ranges intersect or touch, false if there is
	     * at least one code point of gap between them.
	     */
	    bool overlaps(CodeRange cr) const
	    {
	        // Two intervals intersect-or-touch exactly when neither one
	        // starts strictly after the other has ended.
	        return start <= cr.end && cr.start <= end;
	    }

	    unittest
	    {
	        assert(CodeRange(1,11).overlaps(CodeRange(11,12)));
	        assert(!CodeRange(1,10).overlaps(CodeRange(11,12)));

	        // Adjacency and separation are detected from either side.
	        assert(CodeRange(11,12).overlaps(CodeRange(1,11)));
	        assert(!CodeRange(11,12).overlaps(CodeRange(1,10)));

	        // Containment in either direction counts as overlap.
	        assert(CodeRange(1,100).overlaps(CodeRange(10,20)));
	        assert(CodeRange(10,20).overlaps(CodeRange(1,100)));
	    }

	    /**
	     * Grows this range in place to the smallest range covering both
	     * itself and cr. Any gap between the two is absorbed.
	     *
	     * Params:
	     *     cr = the range to absorb
	     */
	    void merge(CodeRange cr)
	    {
	        start = min(start, cr.start);
	        end = max(end, cr.end);
	    }

	    unittest
	    {
	        auto cr = CodeRange(10,20);
	        cr.merge(CodeRange(20,30));
	        assert(cr == CodeRange(10,30));
	    }

	    /**
	     * Writes the range to sink as uppercase hex padded to at least four
	     * digits: a single code point as `XXXX`, a longer range as
	     * `XXXX..YYYY` with an inclusive upper bound.
	     */
	    void toString(scope void delegate(const(char)[]) sink)
	    {
	        import std.format : formattedWrite;
	        sink.formattedWrite("%04X", start);
	        if (end > start+1)
	            sink.formattedWrite("..%04X", end-1);
	    }
	}

	/**
	 * One record: a code point range paired with its width label.
	 */
	struct Entry
	{
	    CodeRange range;
	    string width;

	    /// Writes the entry to sink as the range, a ';', then the width.
	    void toString(scope void delegate(const(char)[]) sink)
	    {
	        import std.format : formattedWrite;
	        formattedWrite(sink, "%s;%s", range, width);
	    }
	}

	/**
	 * Lazily parses the lines of an EastAsianWidth.txt-style data file.
	 *
	 * Recognised data lines have the form `XXXX;W` (single code point) or
	 * `XXXX..YYYY;W` (inclusive range), where W is one of N, A, H, W, F, Na.
	 * Whitespace around the `;` separator is accepted, so both the compact
	 * layout and a column-aligned layout parse. Anything after the width
	 * field (such as a trailing `# ...` comment) is ignored. Lines starting
	 * with `#` and blank lines are skipped.
	 *
	 * The width of each entry is collapsed to either "N" or "W" using the
	 * equivalence table at the top of the function body.
	 *
	 * Params:
	 *     input = an input range of text lines
	 *
	 * Returns: An input range of Entry objects.
	 *
	 * Throws: Exception when a line is neither a data line, a `#` comment,
	 * nor blank. Because parsing is lazy, this can happen either in this
	 * call (while locating the first entry) or in a later popFront.
	 */
	auto parse(R)(R input)
	    if (isInputRange!R && is(ElementType!R : const(char)[]))
	{
	    // For our purposes, we don't need to distinguish between explicit/implicit
	    // narrowness, and ambiguous cases can just default to narrow. So we map
	    // the original width to its equivalent using the following equivalence
	    // table.
	    string[string] equivs = [
	        "Na" : "N",
	        "N"  : "N",
	        "H"  : "N",
	        "A"  : "N",
	        "W"  : "W",
	        "F"  : "W"
	    ];

	    auto reEmpty = regex(`^\s*$`);
	    // `\s*;\s*` tolerates padding around the field separator.
	    auto reSingle = regex(`^([0-9A-F]+)\s*;\s*(N|A|H|W|F|Na)\b`);
	    auto reRange = regex(`^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*(N|A|H|W|F|Na)\b`);

	    struct Result
	    {
	        R     range;
	        Entry front;
	        bool  empty;

	        this(R _range)
	        {
	            range = _range;
	            next(); // get things started
	        }

	        // Advances `range` to the next data line (without consuming it)
	        // and decodes it into `front`; sets `empty` once input runs out.
	        void next()
	        {
	            while (!range.empty)
	            {
	                auto line = range.front;

	                if (auto m = line.match(reSingle))
	                {
	                    auto width = equivs[m.captures[2]];
	                    dchar ch = cast(dchar) m.captures[1].to!int(16);
	                    // Stored half-open: [ch, ch+1).
	                    front = Entry(CodeRange(ch, ch+1), width);
	                    empty = false;
	                    return;
	                }
	                else if (auto m = line.match(reRange))
	                {
	                    auto width = equivs[m.captures[3]];
	                    dchar start = cast(dchar) m.captures[1].to!int(16);
	                    // The file's upper bound is inclusive; +1 makes it exclusive.
	                    dchar end = cast(dchar) m.captures[2].to!int(16) + 1;
	                    front = Entry(CodeRange(start, end), width);
	                    empty = false;
	                    return;
	                }
	                else if (!line.startsWith("#") && !line.match(reEmpty))
	                {
	                    import std.string : format;
	                    throw new Exception("Couldn't parse line:\n%s"
	                                        .format(line));
	                }

	                range.popFront();
	            }
	            empty = true;
	        }

	        void popFront()
	        {
	            // Drop the line that produced the current front, then look
	            // for the next data line.
	            range.popFront();
	            next();
	        }
	    }
	    static assert(isInputRange!Result);

	    return Result(input);
	}

	/**
	 * Parses input and prints the code point ranges grouped by width label.
	 *
	 * An entry extends the most recent range stored under its width only
	 * when the immediately preceding entry had the same width; otherwise it
	 * starts a new range under that width. For every width a `# <width>`
	 * header is printed, followed by one `range;width` line per stored range
	 * and a blank line.
	 *
	 * Params:
	 *     input = an input range of text lines, handed to parse()
	 */
	void outputByWidthType(R)(R input)
	    if (isInputRange!R && is(ElementType!R : const(char)[]))
	{
	    CodeRange[][string] rangesByWidth;
	    string previousWidth;

	    foreach (entry; input.parse())
	    {
	        auto bucket = entry.width in rangesByWidth;
	        immutable extendsRun = bucket !is null
	            && (*bucket).length > 0
	            && entry.width == previousWidth;

	        if (extendsRun)
	            (*bucket)[$-1].merge(entry.range);
	        else
	            rangesByWidth[entry.width] ~= entry.range;

	        previousWidth = entry.width;
	    }

	    foreach (width; rangesByWidth.byKey())
	    {
	        writeln("# ", width);
	        foreach (range; rangesByWidth[width])
	            writefln("%s;%s", range, width);
	        writeln();
	    }
	}

	/**
	 * Collapses each run of consecutive entries that share the same width
	 * into a single entry.
	 *
	 * The merged entry's range is the hull of the run as computed by
	 * CodeRange.merge (minimum start, maximum end), so any code points lying
	 * between two entries of the same run are included in the result.
	 *
	 * An entry whose width is the empty string is indistinguishable from
	 * "no run pending", since "" is used internally as that marker.
	 *
	 * Params:
	 *     input = an input range of Entry values
	 *
	 * Returns: An input range of Entry objects.
	 */
	auto mergeConsecutive(R)(R input)
	    if (isInputRange!R && is(ElementType!R : Entry))
	{
	    struct Result
	    {
	        R     range;
	        bool  empty;
	        Entry front;
	        Entry current; // run being accumulated; width == "" means none

	        this(R _range)
	        {
	            range = _range;
	            next();
	        }

	        // Accumulates entries into `current` until the width changes or
	        // the source runs out, then publishes the finished run as `front`.
	        void next()
	        {
	            while (!range.empty)
	            {
	                auto e = range.front;
	                if (current.width != e.width)
	                {
	                    if (current.width != "")
	                    {
	                        // Width changed: publish the finished run and
	                        // start accumulating the next one from e.
	                        empty = false;
	                        front = current;

	                        current = e;
	                        range.popFront();

	                        //writefln("Yielding: %s", front);
	                        return;
	                    }
	                    current = e;
	                }
	                else
	                {
	                    //writefln("Merging: %s with %s", current, e);
	                    current.range.merge(e.range);
	                }

	                range.popFront();
	            }

	            if (current.width != "")
	            {
	                // Source exhausted with a run still pending: publish it
	                // and clear `current` so the following call reports empty
	                // instead of publishing the same run again.
	                empty = false;
	                front = current;
	                current = Entry.init;
	            }
	            else
	                empty = true;
	        }

	        void popFront()
	        {
	            // Always go through next(): even when the source is already
	            // exhausted there may be one last run waiting in `current`
	            // (this happens when the final run is a single entry).
	            next();
	        }
	    }

	    return Result(input);
	}

	/**
	 * Parses input, merges consecutive entries of equal width, and prints
	 * the merged entries to stdout, one per line.
	 *
	 * Params:
	 *     input = an input range of text lines, handed to parse()
	 */
	void outputByCodePoint(R)(R input)
	    if (isInputRange!R && is(ElementType!R : const(char)[]))
	{
	    auto merged = input.parse().mergeConsecutive();
	    writefln("%(%s\n%)", merged);
	}

	/**
	 * Sums the sizes of the merged "W" ranges and of the merged "N" ranges
	 * and prints both totals to stdout.
	 *
	 * Any width other than "W" or "N" trips an assert(0).
	 *
	 * Params:
	 *     input = an input range of text lines, handed to parse()
	 */
	void tally(R)(R input)
	    if (isInputRange!R && is(ElementType!R : const(char)[]))
	{
	    int wideCount = 0;
	    int narrowCount = 0;

	    foreach (entry; input.parse().mergeConsecutive())
	    {
	        switch (entry.width)
	        {
	            case "W":
	                wideCount += (entry.range.end - entry.range.start);
	                break;

	            case "N":
	                narrowCount += (entry.range.end - entry.range.start);
	                break;

	            default:
	                assert(0);
	        }
	    }
	    writefln("Tally: W=%d N=%d\n", wideCount, narrowCount);
	}

	/**
	 * Collects every merged "W" range into a CodepointSet and prints the
	 * source code that std.uni generates for it as a function named
	 * `isWide`.
	 *
	 * Params:
	 *     input = an input range of text lines, handed to parse()
	 */
	void genRecogCode(R)(R input)
	    if (isInputRange!R && is(ElementType!R : const(char)[]))
	{
	    import std.uni;

	    CodepointSet wideSet;
	    foreach (entry; input.parse().mergeConsecutive())
	    {
	        // Only wide ranges go into the set; everything else is skipped.
	        if (entry.width != "W")
	            continue;
	        wideSet.add(entry.range.start, entry.range.end);
	    }

	    auto source = wideSet.toSourceCode("isWide");
	    writeln(source);
	}

	/**
	 * Command-line entry point.
	 *
	 * Usage: `prog (bywidth|bypoint|tally|gencode) [path]`
	 *
	 * Params:
	 *     args = args[0] is the program name, args[1] the command, and the
	 *            optional args[2] the path of the data file to read
	 *            (defaults to "ext/EastAsianWidth.txt")
	 *
	 * Returns: 0 on success, 1 if the command is missing or unknown.
	 */
	int main(string[] args)
	{
	    if (args.length < 2)
	    {
	        assert(args.length > 0);
	        stderr.writefln("Usage: %s (bywidth|bypoint|tally|gencode) [path]", args[0]);
	        return 1;
	    }

	    immutable path = args.length > 2 ? args[2] : "ext/EastAsianWidth.txt";

	    // Open the data file only once a valid command is known, so that an
	    // unknown command is reported as such even when the file is missing.
	    auto lines() { return File(path, "r").byLine(); }

	    auto cmd = args[1];
	    switch (cmd)
	    {
	        case "bywidth":
	            outputByWidthType(lines());
	            break;

	        case "bypoint":
	            outputByCodePoint(lines());
	            break;

	        case "tally":
	            tally(lines());
	            break;

	        case "gencode":
	            genRecogCode(lines());
	            break;

	        default:
	            stderr.writefln("Unknown command: %s", cmd);
	            return 1;
	    }
	    return 0;
	}

T

-- 
People tell me that I'm skeptical, but I don't believe them.