/*
Copyright (c) 2007, Stéphan Kochen
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright notice,
      this list of conditions and the following disclaimer in the documentation
      and/or other materials provided with the distribution.
    * Neither the name of Mr. Kochen nor the names of the contributors may be
      used to endorse or promote products derived from this software without
      specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

module unitools;

/*
This needs UnicodeData.txt from the Unicode Character Database.
Latest stable: http://www.unicode.org/Public/UNIDATA/

Easiest is to just put it alongside this source file and compile as:
  dmd -J. unitools.d
*/

import std.stdio;

// Tell whether a character counts as white space when parsing UCD files.
// '\r' is included so that files with CRLF line endings parse cleanly.
private bool isBlank(char c)
{
    return c == ' ' || c == '\t' || c == '\n' || c == '\r';
}

// Tell whether s ends with the given suffix.
private bool hasSuffix(string s, string suffix)
{
    return s.length >= suffix.length && s[$ - suffix.length .. $] == suffix;
}

// Strip white space and comments from a line.
//
// Returns the slice of s between the first non-blank character and the start
// of a '#' comment (or the end of s), with trailing blanks removed. Returns ""
// for lines that are blank or consist of a comment only.
private string stripCommentAndWhitespace(string s)
{
    // Find the first non-whitespace character
    size_t start = 0;
    while (start < s.length && isBlank(s[start]))
        start++;

    // Empty line
    if (start >= s.length)
        return "";

    // Look for a comment
    size_t end = start;
    while (end < s.length && s[end] != '#')
        end++;

    // Find the last non-whitespace and non-comment character
    while (end > start && isBlank(s[end - 1]))
        end--;

    return s[start .. end];
}

// Convert a hexadecimal string to an unsigned integer.
//
// Both upper and lower case digits are accepted. Characters that are not
// hexadecimal digits count as 0, and the empty string yields 0.
private uint hexToUInt(string hex)
{
    uint val = 0;
    foreach (char c; hex)
    {
        uint digit = 0;
        if (c >= '0' && c <= '9')
            digit = c - '0';
        else if (c >= 'a' && c <= 'f')
            digit = c - 'a' + 10;
        else if (c >= 'A' && c <= 'F')
            digit = c - 'A' + 10;

        val = (val << 4) + digit;
    }
    return val;
}

// Stores the raw line length, a range of code points and their property fields
private struct RowData
{
    // Number of characters consumed from the input, including skipped blank
    // lines and the line terminator; 0 means no row was found
    size_t len = 0;
    // First and last code point of the row (inclusive)
    uint start = 0;
    uint end = 0;
    // All fields following the code point field, in file order
    string[] fields = [];
}

// Parses a line of an UCD CSV, where the first column is a code point or range.
//
// Blank and comment-only lines are skipped. The returned RowData has len == 0
// when csv holds no further row. Fields are trimmed of surrounding white
// space, and the field after the last ';' is included in the result.
private RowData parseUnicodeCSV(string csv)
{
    RowData retval = RowData.init;

    // Get the next non-empty line
    string line = "";
    size_t start = 0;
    foreach (i, c; csv)
    {
        if (c == '\n')
        {
            line = stripCommentAndWhitespace(csv[start .. i]);
            // Line is not empty?
            if (line.length > 0)
            {
                retval.len = i + 1;
                break;
            }
            // Continue with the next line
            start = i + 1;
        }
    }

    // Did we run until EOS?
    if (retval.len == 0)
    {
        // See if there's a final line without a line terminator
        if (start >= csv.length)
            return retval;
        line = stripCommentAndWhitespace(csv[start .. $]);
        if (line.length == 0)
            return retval;
        // .. there is
        retval.len = csv.length;
    }

    // Walk the line scanning for fields. The end of the line terminates the
    // last field just like a ';' does, so the final field is not lost.
    size_t fieldstart = 0;
    uint fieldno = 0;
    for (size_t i = 0; i <= line.length; i++)
    {
        if (i < line.length && line[i] != ';')
            continue;

        // Fields hold no '#' at this point, so this merely trims blanks
        string field = stripCommentAndWhitespace(line[fieldstart .. i]);
        fieldstart = i + 1;

        // First field contains the range
        if (fieldno == 0)
        {
            size_t sepidx = 0;
            while (sepidx < field.length && field[sepidx] != '.')
                sepidx++;
            retval.start = hexToUInt(field[0 .. sepidx]);

            // Skip the ".." separator
            while (sepidx < field.length && field[sepidx] == '.')
                sepidx++;

            // A single code point, or a range with nothing after the dots
            if (sepidx >= field.length)
                retval.end = retval.start;
            else
                retval.end = hexToUInt(field[sepidx .. $]);
        }
        // Other fields are properties
        else
        {
            retval.fields ~= field;
        }
        fieldno++;
    }

    return retval;
}

// Parses the next row of UnicodeData.txt.
//
// Works like parseUnicodeCSV, but additionally folds a "<..., First>" row and
// the "<..., Last>" row following it into a single row covering the whole
// range. UnicodeData.txt uses such pairs for large blocks (CJK ideographs,
// Hangul syllables, surrogates, private use) instead of listing every code
// point.
private RowData parseUnicodeDataRow(string csv)
{
    RowData row = parseUnicodeCSV(csv);
    if (row.len == 0 || row.fields.length < 2
        || !hasSuffix(row.fields[0], ", First>"))
        return row;

    RowData last = parseUnicodeCSV(csv[row.len .. $]);
    if (last.len > 0 && last.fields.length >= 2
        && hasSuffix(last.fields[0], ", Last>")
        && last.fields[1] == row.fields[1]
        && last.end >= row.start)
    {
        row.end = last.end;
        row.len += last.len;
    }
    return row;
}

public enum GeneralCategory
{
    Lu, // Letter, uppercase
    Ll, // Letter, lowercase
    Lt, // Letter, titlecase
    Lm, // Letter, modifier
    Lo, // Letter, other
    Mn, // Mark, nonspacing
    Mc, // Mark, spacing combining
    Me, // Mark, enclosing
    Nd, // Number, decimal digit
    Nl, // Number, letter
    No, // Number, other
    Pc, // Punctuation, connector
    Pd, // Punctuation, dash
    Ps, // Punctuation, open
    Pe, // Punctuation, close
    Pi, // Punctuation, initial quote (may behave like Ps or Pe depending on usage)
    Pf, // Punctuation, final quote (may behave like Ps or Pe depending on usage)
    Po, // Punctuation, other
    Sm, // Symbol, math
    Sc, // Symbol, currency
    Sk, // Symbol, modifier
    So, // Symbol, other
    Zs, // Separator, space
    Zl, // Separator, line
    Zp, // Separator, paragraph
    Cc, // Other, control
    Cf, // Other, format
    Cs, // Other, surrogate
    Co, // Other, private use
    Cn  // Other, not assigned (including noncharacters)
}

// Convert an unsigned integer to its decimal string representation.
private string uintToString(uint i)
{
    string digits = "0123456789";

    // Build the number from the least significant digit upwards
    string result = digits[i % 10 .. i % 10 + 1];
    while (i >= 10)
    {
        i /= 10;
        result = digits[i % 10 .. i % 10 + 1] ~ result;
    }
    return result;
}

// Generate one link of the if-else chain: source code that returns
// GeneralCategory.<prop> when c lies within start .. end (inclusive).
// The result ends in "else " so that the next link can be appended directly.
private string genSingleCategoryIf(string prop, uint start, uint end)
{
    if (start == end)
    {
        return "if (c == " ~ uintToString(start) ~ ")\n" ~
               " return GeneralCategory." ~ prop ~ ";\n" ~
               "else ";
    }
    else
    {
        return "if (c >= " ~ uintToString(start) ~ " && " ~
               "c <= " ~ uintToString(end) ~ ")\n" ~
               " return GeneralCategory." ~ prop ~ ";\n" ~
               "else ";
    }
}

// Generate the body of getGeneralCategory from the contents of
// UnicodeData.txt: an if-else chain on c that returns the general category of
// every listed code point, and GeneralCategory.Cn for everything else.
//
// Returns "" when the data holds no rows, or when a row has fewer than two
// property fields.
private string genCategoryIfstatement(string csv)
{
    RowData row = parseUnicodeDataRow(csv);
    if (row.len < 1 || row.fields.length < 2)
        return "";

    // Instead of a humongous if-statement, group consecutive ranges of the
    // same general category.
    string retval = "";
    string propcur = row.fields[1];
    uint propstart = row.start;
    uint propend = row.end;

    size_t csvpos = row.len;
    while ((row = parseUnicodeDataRow(csv[csvpos .. $])).len > 0)
    {
        csvpos += row.len;

        // Not enough properties, bail
        if (row.fields.length < 2)
            return "";

        // Category is the same as the last?
        if (row.fields[1] == propcur)
        {
            // Overlapping or adjacent to the last range? The sums are done
            // in 64 bits so they cannot wrap around; the subtraction
            // previously used here underflowed for a range starting at 0.
            bool disjoint = cast(ulong) row.end + 1 < propstart
                         || row.start > cast(ulong) propend + 1;
            if (!disjoint)
            {
                // Merge with last
                if (row.start < propstart)
                    propstart = row.start;
                if (row.end > propend)
                    propend = row.end;
                continue;
            }
        }

        // Didn't 'continue'
        // Generate if-statement for last
        retval ~= genSingleCategoryIf(propcur, propstart, propend);

        // Start working on the next range
        propcur = row.fields[1];
        propstart = row.start;
        propend = row.end;
    }

    // Generate if-statement for last and return
    return retval ~ genSingleCategoryIf(propcur, propstart, propend) ~
        "\n return GeneralCategory.Cn;\n";
}

// Look up the general category of a code point.
//
// The function body is generated at compile time from UnicodeData.txt, which
// must be found on the string import path (-J). Code points not listed in the
// data are reported as GeneralCategory.Cn.
public GeneralCategory getGeneralCategory(dchar c)
{
    mixin(genCategoryIfstatement(import("UnicodeData.txt")));
}

// Small self-test: U+0020 SPACE must be reported as a space separator.
export int main(string[] args)
{
    if (getGeneralCategory(' ') == GeneralCategory.Zs)
        writefln("Yay!");
    else
        writefln("Boo!");
    return 0;
}

/+
// Alternative entry point for debugging: prints the generated if-statement.
export int main(string[] args)
{
    writefln("%s", genCategoryIfstatement(import("UnicodeData.txt")));
    return 0;
}
+/