#!/home/aalexandre/dmd2/linux/bin/rdmd
// Small HTML "untagger" used to exercise different ways of appending to a
// string. It reads data.html, strips tags, decodes a handful of character
// entities and feeds every paragraph to writeParagraph.
//
// NOTE(review): the module names below (std.contracts, std.ctype) and the
// helpers startsWithConsume / tolower / isalpha are left exactly as the
// original used them because the shebang pins a specific toolchain. They are
// not defined in this file; presumably that toolchain's Phobos provides them.
// Confirm the compiler version before porting to newer library names.
import std.algorithm, std.contracts, std.conv, std.ctype, std.file,
    std.getopt, std.path, std.range, std.stdio, std.string, std.traits;

/// Selects the appending strategy used by writeWords inside untag
/// (settable with the `--bug` command line option):
/// 0 = append character by character, 1 = reserve then `~=`,
/// 2 = rebuild with `a = a ~ s`.
uint bug = 1;

/// Parses the command line, validates `bug` and untags data.html.
void main(string[] args)
{
    getopt(args, "bug", &bug);
    enforce(bug <= 2);
    auto txt = readText("data.html");
    untag(txt, "data.html");
}

/**
Strips markup from $(D txt) and passes each paragraph to
$(D writeParagraph).

Everything up to (not including) the first newline is skipped. A paragraph
ends at a closing $(D </p>) or $(D </li>) tag, or at the end of the input.
All other tags are dropped. The entities listed in the switch below are
decoded; any other entity, an unterminated entity, or an unterminated tag
raises an exception.

Params:
    txt      = the HTML text to process
    filename = name of the source of $(D txt); currently unused
*/
void untag(string txt, string filename)
{
    string currentParagraph;

    // Find beginning of content
    txt = std.algorithm.find(txt, "\n");

    // Ancillary function that commits the current paragraph for
    // writing
    void commit()
    {
        writeParagraph(strip(currentParagraph));
        currentParagraph.length = 0;
    }

    // Appends one decoded character, fusing two consecutive spaces into one.
    void writeChar(dchar c)
    {
        immutable lastWritten = currentParagraph.length
            ? currentParagraph.back
            : dchar.init;
        if (lastWritten == ' ' && c == ' ')
        {
            // Two consecutive spaces fused
        }
        else
        {
            // Normal case
            currentParagraph ~= c;
        }
    }

    // Appends a run of plain text using the strategy selected by `bug`.
    // Unlike writeChar, no space fusing is applied here.
    void writeWords(string s)
    {
        if (bug == 0)
        {
            foreach (dchar c; s)
            {
                currentParagraph ~= c;
            }
        }
        else if (bug == 1)
        {
            reserve(currentParagraph, currentParagraph.length + s.length);
            currentParagraph ~= s;
        }
        else
        {
            currentParagraph = currentParagraph ~ s;
        }
    }

    // Parse the content
    while (!txt.empty)
    {
        // Plain text runs up to the next tag or entity.
        size_t i = 0;
        while (i < txt.length && txt[i] != '<' && txt[i] != '&')
        {
            ++i;
        }
        writeWords(txt[0 .. i]);
        if (i == txt.length)
        {
            // Nothing but plain text was left; flushed after the loop.
            break;
        }
        txt = txt[i .. $];
        auto c = txt[0];
        txt = txt[1 .. $];
        if (c == '<')
        {
            // This is a tag
            if (startsWithConsume(txt, `/p>`)
                || startsWithConsume(txt, `/li>`))
            {
                // End of paragraph
                commit();
            }
            else
            {
                // This is an uninteresting tag
                enforce(findConsume(txt, '>'),
                        "Could not find closing tag: "~txt);
            }
        }
        else
        {
            // This is an entity. The sink overload of findConsume also
            // copies the terminating ';', hence the ';' in the labels below.
            string code;
            enforce(findConsume(txt, ';', appender(&code)),
                    "Unterminated character entity: &"~txt);
            switch (code)
            {
            // NOTE(review): "reg;" is mapped to a blank like the space
            // entities - confirm this is intended.
            case "#160;":
            case "#32;":
            case "reg;":
            case "nbsp;":
                writeChar(' ');
                break;
            case "amp;":
                writeChar('&');
                break;
            case "gt;":
                writeChar('>');
                break;
            case "lt;":
                writeChar('<');
                break;
            case "quot;":
                writeChar('"');
                break;
            default:
                throw new Exception(text("Unknown code: &", code));
            }
        }
    }

    // Flush whatever is still pending. Without this, text that follows the
    // last closing paragraph tag was dropped whenever the input ended with
    // an uninteresting tag or an entity.
    if (currentParagraph.length)
    {
        commit();
    }
}

/**
Splits $(D sentence) at every non-alphabetic character and lower-cases each
resulting piece. The lower-cased pieces are only assigned to the loop
variable; nothing is stored, printed or returned.
*/
void writeParagraph(string sentence)
{
    static bool isSeparator(dchar a)
    {
        return !(isalpha(a) /*|| a == '.'*/);
    }

    foreach (string cand; std.algorithm.splitter!isSeparator(sentence))
    {
        cand = tolower(cand);
    }
}

/**
If $(D r2) can not be found in $(D r1), leave $(D r1) unchanged and
return $(D false). Otherwise, consume elements in $(D r1) until $(D
startsWithConsume(r1, r2)), and return $(D true). Effectively
positions $(D r1) right after $(D r2).
*/
bool findConsume(R1, R2)(ref R1 r1, R2 r2) if (isForwardRange!R2)
{
    // Work on a plain copy so r1 is only updated on success; the original
    // author left a call to .save() disabled here.
    auto r = r1;
    while (!r.empty)
    {
        if (std.algorithm.startsWithConsume(r, r2))
        {
            r1 = r;
            return true;
        }
        r.popFront();
    }
    return false;
}

/**
If the element $(D e) can not be found in $(D r), leave $(D r) unchanged
and return $(D false). Otherwise, advance $(D r) to just past the first
occurrence of $(D e) and return $(D true).
*/
bool findConsume(R, E)(ref R r, E e) if (is(typeof(r.front == e)))
{
    auto r1 = std.algorithm.find(r, e);
    if (r1.empty) return false;
    r = r1;
    r.popFront();
    return true;
}

/**
Like the element overload, but additionally copies every visited element,
including the matching $(D e) itself, into the output range $(D r2).

If $(D e) can not be found in $(D r1), $(D r1) is left unchanged and $(D
false) is returned; note that in this case all elements of $(D r1) have
nevertheless already been written to $(D r2). Otherwise $(D r1) is
advanced to just past the first occurrence of $(D e) and $(D true) is
returned.
*/
bool findConsume(R1, E, R2)(ref R1 r1, E e, R2 r2)
if (is(typeof(r1.front == e)))
{
    auto r = r1;
    while (!r.empty)
    {
        r2.put(r.front);
        if (r.front == e)
        {
            r.popFront();
            r1 = r;
            return true;
        }
        r.popFront();
    }
    return false;
}