How to accelerate this program?

Mon Apr 3 07:39:41 PDT 2006

Li Jie wrote:

> At first, thank all my helpers. :)
> 
> 
> I have some email addresses in a text file, about 780000 lines. There are
> some repeated lines, about 8%, and I want to remove them.
> 
> I write a D version:
> =======================================
> import std.stdio;
> import std.string;
> import std.stdio;
> import std.string;
> import std.perf;
> 
> int main(char[][] argv)
> {
> if (argv.length < 3) {
> writefln("Wrong arguments");
> return 1;
> }
> 
> const int READ_SIZE = 1024;
> 
> FILE* fin = fopen(argv[1], "r");
> FILE* fout = fopen(argv[2], "w");
> char buffer[READ_SIZE];
> int[char[]] emails;
> 
> PerformanceCounter counter = new PerformanceCounter();
> counter.start();
> while (!feof(fin)){
> fgets(cast(char*)buffer, READ_SIZE, fin);
> char[] email = toString(cast(char*)buffer);
> if (!(email in emails)){
> emails[toString(buffer)] = 0;
> fputs(cast(char*)email, fout);
> }
> }
> 
> fclose(fout);
> fclose(fin);
> counter.stop();
> 
> writefln(counter.milliseconds());
> return 0;
> }
> ======================================
> 
> It used 1080 ms on my computer.
> 
> A optimized c++ version:
> ======================================
> #include <iostream>
> #include <string>
> #include <fstream>
> #include <iterator>
> #include <sys/time.h>
> #include <vector>
> using namespace std;
> 
> 
> size_t my_hash (const char* str)
> {
> size_t ret = 0;
> while (*str)
> ret = 11 * ret + *str++;
> return ret;
> }
> 
> class Email
> {
> private:
> size_t hash;
> const char* mail;
> friend bool operator < (const Email& lhs, const Email& rhs);
> public:
> Email (const char* mail_)
> : hash(my_hash(mail_)), mail(mail_)
> {
> }
> 
> bool operator == (const Email& rhs)
> {
> if (hash == rhs.hash)
> return strcmp(mail, rhs.mail) == 0;
> return false;
> }
> 
> const char* getEmail()const
> {
> return mail;
> }
> };
> 
> bool operator < (const Email& lhs, const Email& rhs)
> {
> if (lhs.hash == rhs.hash)
> return strcmp(lhs.mail, rhs.mail) < 0;
> return lhs.hash < rhs.hash;
> }
> 
> int main(int argc, char** argv)
> {
> if (argc < 3)
> {
> cout << "Wrong arguments" << endl;
> return 1;
> }
> 
> FILE* fin = fopen(argv[1], "r");
> if (!fin)
> {
> cout << "Invalid input file" << endl;
> return 2;
> }
> FILE* fout = fopen(argv[2], "w");
> if (!fout)
> {
> fclose(fin);
> cout << "Invalid output file" << endl;
> return 3;
> }
> 
> timeval start, end;
> 
> const int BUF_SIZE = 20 * 1024 * 1024;
> char* buffer = new char[BUF_SIZE];
> memset(buffer, 0, BUF_SIZE);
> 
> gettimeofday(&start, 0);
> vector<Email> emails;
> 
> size_t read = fread (buffer, BUF_SIZE, 1, fin);
> char* begin = buffer;
> char* current = buffer;
> 
> while (*current != '\0')
> {
> if (*current == '\n')
> {
> *current = '\0';
> emails.push_back(begin);
> begin = current + 1;
> }
> ++ current;
> }
> fclose(fin);
> sort(emails.begin(), emails.end());
> emails.erase (unique( emails.begin(), emails.end() ), emails.end());
> 
> for (vector<Email>::const_iterator iter = emails.begin();
> iter != emails.end();
> iter ++)
> {
> fputs((*iter).getEmail(), fout);
> fwrite("\n", 1, 1, fout);
> }
> 
> fclose(fout);
> 
> gettimeofday(&end, 0);
> 
> printf("Time used: %d ms\n", ((end.tv_sec - start.tv_sec) * 1000 +
> (end.tv_usec - start.tv_usec) / 1000));
> 
> delete[] buffer;
> 
> return 0;
> }
> =====================================================
> It used 680 ms.
> 
> I modified the D version to:
> ====================================================
> import std.stdio;
> import std.string;
> import std.perf;
> 
> int main(char[][] argv)
> {
> if (argv.length < 3) {
> writefln("Wrong arguments");
> return 1;
> }
> 
> const int BUF_SIZE = 20 * 1024 * 1024;
> char* buffer = new char[BUF_SIZE];
> 
> PerformanceCounter counter = new PerformanceCounter();
> counter.start();
> 
> FILE* fin = fopen(argv[1], "r");
> FILE* fout = fopen(argv[2], "w");
> 
> fread(buffer, BUF_SIZE, 1, fin);
> fclose(fin);
> 
> int[char[]] emails;
> char* begin = buffer;
> char* current = begin;
> char* newline = cast(char*)"\n";
> 
> counter.stop();
> writefln(counter.milliseconds());
> 
> while (*current != '\0')
> {
> if (*current == '\n')
> {
> *current = '\0';
> char[] email = toString(begin);
> if (!(email in emails))
> {
> emails[email] = 0;
> fputs (begin, fout);
> fwrite(newline, 1, 1, fout);
> }
> begin = current + 1;
> }
> ++ current;
> }
> 
> fclose(fout);
> counter.stop();
> 
> writefln(counter.milliseconds());
> 
> delete buffer;
> return 0;
> }
> ====================================================
> 
> But it used 3600 ms... :(
> 
> 
> I don't know how to accelerate it in D. :(
> Can you help me?

I would recommend using the Mango library and it's textual iterators.  See
the example at:
http://www.dsource.org/projects/mango/browser/trunk/example/lineio.d

A few tweaks to that example should get you on your way.

Also, when compiling, make sure to use the -O -inlinc -release flags.  -O
optimizes, -inline allows smaller functions to be inlined, and -release
removes things like bound checking.  I've found that using these flags
improves performance significantly.

~John Demme