#!/home/aalexandre/dmd2/linux/bin/rdmd
import std.algorithm, std.contracts, std.conv, std.ctype,
    std.file, std.getopt, std.path, std.range, std.stdio,
    std.string, std.traits;

// Selects which string-append strategy writeWords (nested in untag) uses:
// 0 appends character by character, 1 reserves and appends in place,
// any other value rebuilds the string with a concatenation expression.
// main overwrites it from the "bug" command-line option and rejects
// values above 2.
uint bug = 1;

/**
Program entry point. Reads the optional $(D bug) selector from the
command line, checks that it is in range, loads $(D data.html) from the
current working directory and runs $(D untag) over its contents.

Params:
    args = the command-line arguments; options recognized by
           $(D getopt) are processed here

Throws: an exception (via $(D enforce)) when the selector is greater
than 2; anything $(D readText) or $(D untag) throws propagates.
 */
void main(string[] args) {
    // D-style array declarator; the C-style form `string args[]` is deprecated
    enum inputFile = "data.html";

    getopt(args, "bug", &bug);
    enforce(bug <= 2, "bug must be 0, 1 or 2");
    auto txt = readText(inputFile);
    untag(txt, inputFile);
}

/**
Strips the markup from an HTML document: tags are dropped, a small set
of character entities is translated, and the accumulated text is handed
to $(D writeParagraph) one paragraph at a time.

Params:
    txt      = the HTML source; processing starts at the first
               occurrence of the "<!-- start content -->\n" marker
    filename = name of the file $(D txt) came from (not used in this
               body)

Throws: $(D Exception) when an entity is not one of the recognized
codes; the $(D enforce) call fails when a tag has no closing '>'.
 */
void untag(string txt, string filename) {
    // Text accumulated so far for the paragraph being read
    string currentParagraph;

    // Find beginning of content. find leaves txt starting at the marker
    // itself (or empty when the marker is absent); the marker begins
    // with '<', so the loop below skips it like any other tag.
    txt = std.algorithm.find(txt, "<!-- start content -->\n");

    // Ancillary function that commits the current paragraph for
    // writing: the stripped text goes to writeParagraph and the
    // buffer is reset to zero length.
    void commit() {
        writeParagraph(strip(currentParagraph));
        currentParagraph.length = 0;
    }

    // Appends a single character to the paragraph, dropping a space
    // that directly follows another space.
    void writeChar(dchar c) {
        // Last character in the buffer, or dchar.init when it is empty
        immutable lastWritten = currentParagraph.length
            ? currentParagraph.back
            : dchar.init;
        if (lastWritten == ' ' && c == ' ') {
            // Two consecutive spaces fused
        } else {
            // Normal case
            currentParagraph ~= c;
        }
    }
    
    // Appends a run of plain text to the paragraph, using the append
    // strategy selected by the module-level variable bug. Unlike
    // writeChar, no space collapsing is applied here.
    void writeWords(string s) {
        if (bug == 0) {
            // Strategy 0: append one decoded character at a time
            foreach (dchar c; s) {
                currentParagraph ~= c;
            }
        } else if (bug == 1) {
            // Strategy 1: request capacity for the combined length,
            // then append in place
            reserve(currentParagraph, currentParagraph.length + s.length);
            currentParagraph ~= s;
        } else {
            // Any other value: build the result with a concatenation
            // expression and reassign
            currentParagraph = currentParagraph ~ s;
        }
    }
    
    // Parse the content
    while (!txt.empty) {
        // Scan forward to the next tag opener or entity opener
        size_t i = 0;
        while (i < txt.length && txt[i] != '<' && txt[i] != '&') {
            ++i;
        }
        // Everything before the delimiter is plain text
        writeWords(txt[0 .. i]);
        if (i == txt.length) {
            // No more markup: flush what is left and stop
            commit();
            return;
        }
        // Drop the plain text, then take the delimiter itself off the front
        txt = txt[i .. $];
        auto c = txt[0];
        txt = txt[1 .. $];
        if (c == '<') { // This is a tag
            if (startsWithConsume(txt, `/p>`) ||
                    startsWithConsume(txt, `/li>`)) {
                // End of paragraph
                commit();
            } else {
                // This is an uninteresting tag
                enforce(findConsume(txt, '>'),
                        "Could not find closing tag: "~txt);
            }
        } else {
            // This is a character entity. Collect everything up to and
            // including the ';' into code. The return value is not
            // checked: when no ';' exists, code receives the rest of
            // txt and the default branch below throws.
            string code;
            findConsume(txt, ';', appender(&code));
            // The labels carry the trailing ';' because the terminator
            // is part of what was collected.
            switch (code) {
            case "#160;": case "#32;": case "reg;": case "nbsp;":
                // All four are written as a plain space
                writeChar(' ');
                break;
            case "amp;":
                writeChar('&');
                break;
            case "gt;":
                writeChar('>');
                break;
            case "lt;":
                writeChar('<');
                break;
            case "quot;":
                writeChar('"');
                break;
            default:
                throw new Exception(text("Unknown code: &", code));
                // Not reached: the throw above always leaves the switch
                break;
            }
        }
    }
    // NOTE(review): when txt becomes empty right after a tag or entity,
    // the loop ends here without a final commit(), so any text still in
    // currentParagraph is not passed to writeParagraph — confirm this
    // is intended.
}

/**
Breaks $(D sentence) into pieces at every non-alphabetic character and
lower-cases each piece. The lower-cased pieces are not stored or
returned.
 */
void writeParagraph(string sentence) {
    // Anything that is not a letter separates two pieces
    static bool isSeparator(dchar a) {
        if (isalpha(a)) {
            return false;
        }
        return true;
    }

    auto pieces = std.algorithm.splitter!isSeparator(sentence);
    foreach (string piece; pieces) {
        piece = tolower(piece);
    }
}

/**
Searches $(D r1) for the first position at which $(D
startsWithConsume) succeeds against $(D r2). On success $(D r1) is
advanced to wherever that call left the search position (right after
the occurrence of $(D r2)) and $(D true) is returned. When $(D r2) can
not be found, $(D r1) is left unchanged and the result is $(D false).
 */
bool findConsume(R1, R2)(ref R1 r1, R2 r2) if (isForwardRange!R2) {
    // Walk a copy so that r1 is only touched once a match is confirmed
    for (auto cursor = r1; !cursor.empty; cursor.popFront()) {
        if (std.algorithm.startsWithConsume(cursor, r2)) {
            r1 = cursor;
            return true;
        }
    }
    return false;
}

/**
Searches $(D r) for the first element equal to $(D e). When one is
found, $(D r) is advanced to just past that element and $(D true) is
returned. When $(D e) does not occur, $(D r) is left unchanged and the
result is $(D false).
 */
bool findConsume(R, E)(ref R r, E e) if (is(typeof(r.front == e))) {
    auto tail = std.algorithm.find(r, e);
    if (!tail.empty) {
        // tail starts at the match; step over it before publishing
        tail.popFront();
        r = tail;
        return true;
    }
    return false;
}

/**
Walks $(D r1) looking for the first element equal to $(D e), copying
every element it visits (the matching one included) into the output
range $(D r2). When $(D e) is found, $(D r1) is advanced to just past
it and $(D true) is returned. When $(D e) does not occur, $(D r1) is
left unchanged and the result is $(D false); the elements visited have
nevertheless already been written to $(D r2).
 */
bool findConsume(R1, E, R2)(ref R1 r1, E e, R2 r2) if (is(typeof(r1.front == e))) {
    auto cursor = r1;
    for (;;) {
        if (cursor.empty) {
            return false;
        }
        // Record the element first, so the terminator also reaches r2
        r2.put(cursor.front);
        auto matched = cursor.front == e;
        cursor.popFront();
        if (matched) {
            r1 = cursor;
            return true;
        }
    }
}