How to accelerate this program?

Li Jie cpunion at gmail.com
Mon Apr 3 02:36:35 PDT 2006


At first, thank all my helpers. :)


I have some email addresses in a text file, about 780000 lines. There are some
repeated lines, about 8%, and I want to remove them.

I write a D version:
=======================================
import std.stdio;
import std.string;
import std.stdio;
import std.string;
import std.perf;

int main(char[][] argv)
{
if (argv.length < 3) {
writefln("Wrong arguments");
return 1;
}

const int READ_SIZE = 1024;

FILE* fin = fopen(argv[1], "r");
FILE* fout = fopen(argv[2], "w");
char buffer[READ_SIZE];
int[char[]] emails;

PerformanceCounter counter = new PerformanceCounter();
counter.start();
while (!feof(fin)){
fgets(cast(char*)buffer, READ_SIZE, fin);
char[] email = toString(cast(char*)buffer);
if (!(email in emails)){
emails[toString(buffer)] = 0;
fputs(cast(char*)email, fout);
}
}

fclose(fout);
fclose(fin);
counter.stop();

writefln(counter.milliseconds());
return 0;
}
======================================

It used 1080 ms on my computer.

A optimized c++ version:
======================================
#include <iostream>
#include <string>
#include <fstream>
#include <iterator>
#include <sys/time.h>
#include <vector>
using namespace std;


size_t my_hash (const char* str)
{
size_t ret = 0;
while (*str)
ret = 11 * ret + *str++;
return ret;
}

class Email
{
private:
size_t hash;
const char* mail;
friend bool operator < (const Email& lhs, const Email& rhs);
public:
Email (const char* mail_)
: hash(my_hash(mail_)), mail(mail_)
{
}

bool operator == (const Email& rhs)
{
if (hash == rhs.hash)
return strcmp(mail, rhs.mail) == 0;
return false;
}

const char* getEmail()const
{
return mail;
}
};

bool operator < (const Email& lhs, const Email& rhs)
{
if (lhs.hash == rhs.hash)
return strcmp(lhs.mail, rhs.mail) < 0;
return lhs.hash < rhs.hash;
}

int main(int argc, char** argv)
{
if (argc < 3)
{
cout << "Wrong arguments" << endl;
return 1;
}

FILE* fin = fopen(argv[1], "r");
if (!fin)
{
cout << "Invalid input file" << endl;
return 2;
}
FILE* fout = fopen(argv[2], "w");
if (!fout)
{
fclose(fin);
cout << "Invalid output file" << endl;
return 3;
}

timeval start, end;

const int BUF_SIZE = 20 * 1024 * 1024;
char* buffer = new char[BUF_SIZE];
memset(buffer, 0, BUF_SIZE);

gettimeofday(&start, 0);
vector<Email> emails;

size_t read = fread (buffer, BUF_SIZE, 1, fin);
char* begin = buffer;
char* current = buffer;

while (*current != '\0')
{
if (*current == '\n')
{
*current = '\0';
emails.push_back(begin);
begin = current + 1;
}
++ current;
}
fclose(fin);
sort(emails.begin(), emails.end());
emails.erase (unique( emails.begin(), emails.end() ), emails.end());

for (vector<Email>::const_iterator iter = emails.begin();
iter != emails.end();
iter ++)
{
fputs((*iter).getEmail(), fout);
fwrite("\n", 1, 1, fout);
}

fclose(fout);

gettimeofday(&end, 0);

printf("Time used: %d ms\n", ((end.tv_sec - start.tv_sec) * 1000 + (end.tv_usec
- start.tv_usec) / 1000));

delete[] buffer;

return 0;
}
=====================================================
It used 680 ms.

I modified the D version to:
====================================================
import std.stdio;
import std.string;
import std.perf;

int main(char[][] argv)
{
if (argv.length < 3) {
writefln("Wrong arguments");
return 1;
}

const int BUF_SIZE = 20 * 1024 * 1024;
char* buffer = new char[BUF_SIZE];

PerformanceCounter counter = new PerformanceCounter();
counter.start();

FILE* fin = fopen(argv[1], "r");
FILE* fout = fopen(argv[2], "w");

fread(buffer, BUF_SIZE, 1, fin);
fclose(fin);

int[char[]] emails;
char* begin = buffer;
char* current = begin;
char* newline = cast(char*)"\n";

counter.stop();
writefln(counter.milliseconds());

while (*current != '\0')
{
if (*current == '\n')
{
*current = '\0';
char[] email = toString(begin);
if (!(email in emails))
{
emails[email] = 0;
fputs (begin, fout);
fwrite(newline, 1, 1, fout);
}
begin = current + 1;
}
++ current;
}

fclose(fout);
counter.stop();

writefln(counter.milliseconds());

delete buffer;
return 0;
}
====================================================

But it used 3600 ms... :(


I don't know how to accelerate it in D. :(
Can you help me?





More information about the Digitalmars-d mailing list