How to accelerate this program?
Dave
Dave_member at pathlink.com
Mon Apr 3 08:16:53 PDT 2006
>From what I've seen, the bottleneck is probably in I/O. Use BufferedFile and a
buffer for each readline. On my system with a file made of 440,000 lines of
(just 4 repeated) seperate emails, this version takes ~200 ms vs. ~800 ms for
your C++ version. Buffering I/O is key in D w/ the phobos I/O routines, as the
built-in AA's are pretty fast.
Let us know your results (you'll have a lot more AA inserts and email dups).
Note you have to dup the string when you insert into your AA.
Try this (dmd v0.150):
import std.stdio;
import std.stream;
import std.perf;
int main(char[][] argv)
{
if (argv.length < 3)
{
writefln("Wrong arguments");
return 1;
}
char[8192] bufr;
int[char[]] emails;
char[] email;
PerformanceCounter counter = new PerformanceCounter();
counter.start();
BufferedFile bsi = new BufferedFile(argv[1]);
BufferedFile bso = new BufferedFile(argv[2],FileMode.Out);
while(!bsi.eof)
{
email = bsi.readLine(bufr); // bufr is key to perf.
if (!(email in emails))
{
emails[email.dup] = 0; // Note .dup
}
}
foreach(key; emails.keys.sort)
{
bso.write(cast(ubyte[])key);
bso.write('\n');
}
bso.close;
bsi.close;
counter.stop();
writefln(counter.milliseconds());
return 0;
}
In article <e0qqb3$3gt$1 at digitaldaemon.com>, Li Jie says...
>
>At first, thank all my helpers. :)
>
>
>I have some email addresses in a text file, about 780000 lines. There are some
>repeated lines, about 8%, and I want to remove them.
>
>I write a D version:
>=======================================
>import std.stdio;
>import std.string;
>import std.stdio;
>import std.string;
>import std.perf;
>
>int main(char[][] argv)
>{
>if (argv.length < 3) {
>writefln("Wrong arguments");
>return 1;
>}
>
>const int READ_SIZE = 1024;
>
>FILE* fin = fopen(argv[1], "r");
>FILE* fout = fopen(argv[2], "w");
>char buffer[READ_SIZE];
>int[char[]] emails;
>
>PerformanceCounter counter = new PerformanceCounter();
>counter.start();
>while (!feof(fin)){
>fgets(cast(char*)buffer, READ_SIZE, fin);
>char[] email = toString(cast(char*)buffer);
>if (!(email in emails)){
>emails[toString(buffer)] = 0;
>fputs(cast(char*)email, fout);
>}
>}
>
>fclose(fout);
>fclose(fin);
>counter.stop();
>
>writefln(counter.milliseconds());
>return 0;
>}
>======================================
>
>It used 1080 ms on my computer.
>
>A optimized c++ version:
>======================================
>#include <iostream>
>#include <string>
>#include <fstream>
>#include <iterator>
>#include <sys/time.h>
>#include <vector>
>using namespace std;
>
>
>size_t my_hash (const char* str)
>{
>size_t ret = 0;
>while (*str)
>ret = 11 * ret + *str++;
>return ret;
>}
>
>class Email
>{
>private:
>size_t hash;
>const char* mail;
>friend bool operator < (const Email& lhs, const Email& rhs);
>public:
>Email (const char* mail_)
>: hash(my_hash(mail_)), mail(mail_)
>{
>}
>
>bool operator == (const Email& rhs)
>{
>if (hash == rhs.hash)
>return strcmp(mail, rhs.mail) == 0;
>return false;
>}
>
>const char* getEmail()const
>{
>return mail;
>}
>};
>
>bool operator < (const Email& lhs, const Email& rhs)
>{
>if (lhs.hash == rhs.hash)
>return strcmp(lhs.mail, rhs.mail) < 0;
>return lhs.hash < rhs.hash;
>}
>
>int main(int argc, char** argv)
>{
>if (argc < 3)
>{
>cout << "Wrong arguments" << endl;
>return 1;
>}
>
>FILE* fin = fopen(argv[1], "r");
>if (!fin)
>{
>cout << "Invalid input file" << endl;
>return 2;
>}
>FILE* fout = fopen(argv[2], "w");
>if (!fout)
>{
>fclose(fin);
>cout << "Invalid output file" << endl;
>return 3;
>}
>
>timeval start, end;
>
>const int BUF_SIZE = 20 * 1024 * 1024;
>char* buffer = new char[BUF_SIZE];
>memset(buffer, 0, BUF_SIZE);
>
>gettimeofday(&start, 0);
>vector<Email> emails;
>
>size_t read = fread (buffer, BUF_SIZE, 1, fin);
>char* begin = buffer;
>char* current = buffer;
>
>while (*current != '\0')
>{
>if (*current == '\n')
>{
>*current = '\0';
>emails.push_back(begin);
>begin = current + 1;
>}
>++ current;
>}
>fclose(fin);
>sort(emails.begin(), emails.end());
>emails.erase (unique( emails.begin(), emails.end() ), emails.end());
>
>for (vector<Email>::const_iterator iter = emails.begin();
>iter != emails.end();
>iter ++)
>{
>fputs((*iter).getEmail(), fout);
>fwrite("\n", 1, 1, fout);
>}
>
>fclose(fout);
>
>gettimeofday(&end, 0);
>
>printf("Time used: %d ms\n", ((end.tv_sec - start.tv_sec) * 1000 + (end.tv_usec
>- start.tv_usec) / 1000));
>
>delete[] buffer;
>
>return 0;
>}
>=====================================================
>It used 680 ms.
>
>I modified the D version to:
>====================================================
>import std.stdio;
>import std.string;
>import std.perf;
>
>int main(char[][] argv)
>{
>if (argv.length < 3) {
>writefln("Wrong arguments");
>return 1;
>}
>
>const int BUF_SIZE = 20 * 1024 * 1024;
>char* buffer = new char[BUF_SIZE];
>
>PerformanceCounter counter = new PerformanceCounter();
>counter.start();
>
>FILE* fin = fopen(argv[1], "r");
>FILE* fout = fopen(argv[2], "w");
>
>fread(buffer, BUF_SIZE, 1, fin);
>fclose(fin);
>
>int[char[]] emails;
>char* begin = buffer;
>char* current = begin;
>char* newline = cast(char*)"\n";
>
>counter.stop();
>writefln(counter.milliseconds());
>
>while (*current != '\0')
>{
>if (*current == '\n')
>{
>*current = '\0';
>char[] email = toString(begin);
>if (!(email in emails))
>{
>emails[email] = 0;
>fputs (begin, fout);
>fwrite(newline, 1, 1, fout);
>}
>begin = current + 1;
>}
>++ current;
>}
>
>fclose(fout);
>counter.stop();
>
>writefln(counter.milliseconds());
>
>delete buffer;
>return 0;
>}
>====================================================
>
>But it used 3600 ms... :(
>
>
>I don't know how to accelerate it in D. :(
>Can you help me?
>
>
More information about the Digitalmars-d
mailing list