/*
Copyright (c) 2007, Stéphan Kochen <stephan@kochen.nl>

All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright notice,
      this list of conditions and the following disclaimer in the documentation
      and/or other materials provided with the distribution.
    * Neither the name of Mr. Kochen nor the names of the contributors may be
      used to endorse or promote products derived from this software without
      specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

module unitools;

/*
This module needs UnicodeData.txt from the Unicode Character Database. The
latest stable version is available at:
http://www.unicode.org/Public/UNIDATA/

The easiest setup is to put the file alongside this source file and compile
with the current directory as a string-import path (-J):
dmd -J. unitools.d
*/

import std.stdio;

// Strip white space and comments from a line
private string stripCommentAndWhitespace(string s)
{
    uint start = 0;
    uint end = 0;
    
    // Find the first non-whitespace character
    while (start < s.length && 
           (s[start] == '\n' || s[start] == '\t' || s[start] == ' ')
          ) start++;
    // Empty line
    if (start >= s.length)
        return "";
    
    // Look for a comment
    end = start;
    while (end < s.length && s[end] != '#') end++;
    
    // Find the last non-whitespace and non-comment character
    while (end > start &&
           (s[end - 1] == '\n' || s[end - 1] == '\t' || s[end - 1] == ' ')
          ) end--;
    
    return s[start .. end];
}

// Convert a hexadecimal string to an unsigned integer
private uint hexToUInt(string hex)
{
    char c = hex[$-1];
    uint val = 0;
    if (c >= '0' && c <= '9')
        val = c - '0';
    else if (c >= 'a' && c <= 'f')
        val = c - 'a' + 10;
    else if (c >= 'A' && c <= 'F')
        val = c - 'A' + 10;
    if (hex.length == 1)
        return val;
    else
        return val + (hexToUInt(hex[0 .. $-1]) << 4);
}

// Stores the raw line length, a range of code points and their property fields
private struct RowData
{
    uint len = 0;
    
    uint start = 0;
    uint end = 0;
    
    string[] fields = [];
}

// Parses a line of an UCD CSV, where the first column is a code point or range
private RowData parseUnicodeCSV(string csv)
{
    RowData retval = RowData.init;
    
    // Get the next non-empty line
    string line = "";
    uint start = 0;
    foreach (i, c; csv)
    {
        if (c == '\n')
        {
            line = stripCommentAndWhitespace(csv[start .. i]);
            // Line is not empty?
            if (line.length > 0)
            {
                retval.len = i + 1;
                break;
            }
            
            // Continue with the next line
            start = i + 1;
        }
    }
    // Did we run until EOS?
    if (retval.len == 0)
    {
        // See if there's a final line without a line terminator
        if (start >= line.length)
            return retval;
        line = stripCommentAndWhitespace(csv[start .. $]);
        if (line.length > 0)
            // .. there is
            retval.len = csv.length;
        else
            return retval;
    }
    
    // Walk the line scanning for fields
    uint j = 0;
    uint fieldno = 0;
    foreach (i, c; line)
    {
        if (c == ';')
        {
            string field = line[j .. i];
            j = i + 1;
            
            // First field contains the range
            if (fieldno == 0)
            {
                uint sepidx = 0;
                while (sepidx < field.length && field[sepidx] != '.') sepidx++;
                retval.start = hexToUInt(field[0 .. sepidx]);
                
                if (sepidx >= field.length)
                {
                    retval.end = retval.start;
                }
                else
                {
                    sepidx += 2;
                    retval.end = hexToUInt(field[sepidx .. $]);
                }
            }
            // Other fields are properties
            else
            {
                retval.fields ~= [field];
            }
            fieldno++;
        }
    }
    
    return retval;
}


public enum GeneralCategory
{
    Lu, // Letter, uppercase
    Ll, // Letter, lowercase
    Lt, // Letter, titlecase
    Lm, // Letter, modifier
    Lo, // Letter, other
    Mn, // Mark, nonspacing
    Mc, // Mark, spacing combining
    Me, // Mark, enclosing
    Nd, // Number, decimal digit
    Nl, // Number, letter
    No, // Number, other
    Pc, // Punctuation, connector
    Pd, // Punctuation, dash
    Ps, // Punctuation, open
    Pe, // Punctuation, close
    Pi, // Punctuation, initial quote (may behave like Ps or Pe depending on usage)
    Pf, // Punctuation, final quote (may behave like Ps or Pe depending on usage)
    Po, // Punctuation, other
    Sm, // Symbol, math
    Sc, // Symbol, currency
    Sk, // Symbol, modifier
    So, // Symbol, other
    Zs, // Separator, space
    Zl, // Separator, line
    Zp, // Separator, paragraph
    Cc, // Other, control
    Cf, // Other, format
    Cs, // Other, surrogate
    Co, // Other, private use
    Cn  // Other, not assigned (including noncharacters)
}

private string uintToString(uint i)
{
    if (i < 10)
	    return [cast(char) (i + '0')];
    else
    	return uintToString(i / 10) ~ uintToString(i % 10);
}

private string genSingleCategoryIf(string prop, uint start, uint end)
{
    if (start == end)
    {
        return "if (c == " ~ uintToString(start) ~ ")\n" ~
               "    return GeneralCategory." ~ prop ~ ";\n" ~
               "else ";
    }
    else
    {
        return "if (c >= " ~ uintToString(start) ~ " && " ~
               "c <= " ~ uintToString(end) ~ ")\n" ~
               "    return GeneralCategory." ~ prop ~ ";\n" ~
               "else ";
    }
}

private string genCategoryIfstatement(string csv)
{
    RowData row = parseUnicodeCSV(csv);
    if (row.len < 1 || row.fields.length < 2)
        return "";
    
    // Instead of a humongeous if-statement, group consecutive ranges of the
    // same general category.
    
    string retval = "";
    string propcur = row.fields[1];
    uint propstart = row.start;
    uint propend = row.end;
    uint csvpos = row.len;
    
    while ((row = parseUnicodeCSV(csv[csvpos .. $])).len > 0)
    {
        csvpos += row.len;
        
        // Not enough properties, bail
        if (row.fields.length < 2)
            return "";
        
        // Category is the same as the last?
        if (row.fields[1] == propcur)
        {
            // Overlapping or adjacent to the last range?
            if (!(row.end < propstart - 1 ||
                  row.start > propend + 1))
            {
                // Merge with last
                if (row.start < propstart)
                    propstart = row.start;
                if (row.end > propend)
                    propend = row.end;
                
                continue;
            }
        }
        // Didn't 'continue'
        
        // Generate if-statement for last
        retval ~= genSingleCategoryIf(propcur, propstart, propend);
        
        // Start working on the next range
        propcur = row.fields[1];
        propstart = row.start;
        propend = row.end;
    }

    // Generate if-statement for last and return
    return retval ~ genSingleCategoryIf(propcur, propstart, propend) ~
        "\n    return GeneralCategory.Cn;\n";
}

public GeneralCategory getGeneralCategory(dchar c)
{
    mixin(genCategoryIfstatement(import("UnicodeData.txt")));
}

export int main(char[][] args)
{
    if (getGeneralCategory(' ') == GeneralCategory.Zs)
        writefln("Yay!");
    else
        writefln("Boo!");
    return 0;
}

/+
export int main(char[][] args)
{
    writefln("%s", genCategoryIfstatement(import("UnicodeData.txt")));
    return 0;
}
+/