I am trying to write a reasonably efficient file read-in routine. My data file is a text file with several "frames". Each frame has 2 header-lines and a number of items, as follows
<int "nitems">
<float> <float> <float>
<string1> <float> <float> <float>
<string2> <float> <float> <float>
...
<string-nitems> <float> <float> <float>
My current implementation uses fstream to retrieve the numbers but seems horribly slow. My test file contains about 200 frames of 10,000 lines each (~75 MB), which takes 2.5 seconds to read!
// Reads one frame from _file: the item count (_nat), the three cell values,
// then _nat records of "<label> <x> <y> <z>".
//
// Returns 0 when the whole frame was extracted, 1 when any extraction failed
// (end of file, malformed number, ...).
//
// NOTE(review): assumes _types holds at least _nat entries and _pos at least
// 3*_nat entries; they are indexed without resizing here - confirm with the
// code that sizes them.
int loadframe() {
    // Header: item count followed by the three cell values.
    if (!(_file >> _nat >> _cell[0] >> _cell[1] >> _cell[2])) {
        return 1;
    }
    for (int i = 0, k = 0; i < _nat; ++i, k += 3) {
        // Explicit indices instead of three k++ in one expression: before
        // C++17 those increments are unsequenced relative to each other, so
        // the coordinate order was not guaranteed.
        _file >> _types[i] >> _pos[k] >> _pos[k + 1] >> _pos[k + 2];
    }
    // A failed extraction leaves the stream in a failed state, so one check
    // after the loop covers every record.
    return _file ? 0 : 1;
}
_file is an ifstream (opened elsewhere), _types is a vector of strings, _cell and _pos are vectors of doubles.
Does anyone have suggestions how to speed this up?
Thanks.
Update 1
Rewriting this part with fscanf reduced the time from ~2.5 seconds to ~1.8 seconds: about a 30% gain, not bad. _f is now a FILE* obtained via FILE* _f = fopen(filename,"r"). The assignments after each fscanf call are only for the casting (if needed) and do not take up any significant time, as can be seen by commenting them out.
// fscanf-based variant of the frame reader: reads the item count (_nat), the
// three cell values, then _nat records of "<label> <x> <y> <z>" from _f.
//
// Returns 0 when the whole frame was parsed, 1 as soon as a field is missing
// or malformed.
//
// NOTE(review): assumes _types holds at least _nat entries and _pos at least
// 3*_nat entries; they are indexed without resizing here - confirm with the
// code that sizes them.
int loadxyz() {
    char label[16];
    float x0, x1, x2;

    if (fscanf(_f, "%d", &_nat) != 1) {
        return 1;
    }
    if (fscanf(_f, "%f%f%f", &x0, &x1, &x2) != 3) {
        return 1;
    }
    // The three values just read are x0, x1, x2, in that order.
    _cell[0] = x0; _cell[1] = x1; _cell[2] = x2;

    for (int i = 0, k = 0; i < _nat; i++, k += 3) {
        // %15s bounds the label to the 16-byte buffer (15 chars + '\0');
        // the array decays to char*, so no '&' is needed.
        if (fscanf(_f, "%15s%f%f%f", label, &x0, &x1, &x2) != 4) {
            return 1;
        }
        _types[i] = label;
        _pos[k] = x0; _pos[k + 1] = x1; _pos[k + 2] = x2;
    }
    return 0;
}
Update 2
Based on the suggestions below I wrote a small benchmark program, which shows that Nim's solution is clearly the fastest. Compiler optimization does not have any significant effect in my case. For anyone who wants to try, I added the source below. A recent compiler is needed: g++ -std=c++11 readtest.cpp -o readtest
Thanks! If anyone has yet other suggestions I would be more than happy to add/benchmark them.
The result (testfile is ~32 MB):
$ ./readtest
write : took 1.97 seconds
check = 549755289600.00
read1 (ifstream) : took 1.10 seconds
check = 549755289600.00
read2 (fscanf) : took 0.64 seconds
check = 549755289600.00
read3 (stream+strtod) : took 0.41 seconds
Here is the source of readtest.cpp:
#include <stdio.h> // printf, fopen, fclose, fprintf,
#include <stdlib.h> // strtod
#include <fstream> // ifstream
#include <string> // string
#include <ctime> // clock
#define N 1048576 // 1024*1024: number of data lines in the benchmark file
using namespace std; // lets the code below name string, ifstream, ... without the std:: prefix
void write(string name) {
FILE* f = fopen(name.c_str(),"w");
for(float i=0;i<N;i++)
fprintf(f,"%s %.2f %.2f %.2f\n","x",i,i,i); // write some formatted data
fclose(f);
}
// Benchmark reader 1: parses the file with ifstream's operator>>.
//
// Each record is "<label> <a> <b> <c>"; the third number of every complete
// record is summed and printed as "check = <sum>".
//
// Reading stops at the first record that cannot be extracted completely (end
// of file, missing or malformed field), so a short or unreadable file never
// contributes stale values to the checksum. A file that cannot be opened
// yields "check = 0.00".
void read1(std::string name) {
    double num = 0, check = 0;
    std::string s;
    std::ifstream f(name);
    // The extractions run left to right; after a full record `num` holds the
    // third value of that record.
    while (f >> s >> num >> num >> num) {
        check += num;
    }
    printf("check = %.2f\n", check);
    // f is closed by its destructor.
}
// Benchmark reader 2: parses the file with fscanf.
//
// Each record is "<label> <a> <b> <c>"; the third number of every complete
// record is summed and printed as "check = <sum>".
//
// Reading stops at the first record that does not yield all four fields. The
// label is limited to 15 characters (the size of the local buffer minus the
// terminator); a longer label ends parsing instead of overflowing the buffer.
// If the file cannot be opened, the reason is reported on stderr and nothing
// else is printed.
void read2(std::string name) {
    double num = 0, check = 0;
    char c[16];
    std::string s;
    FILE* f = fopen(name.c_str(), "r");
    if (f == nullptr) {
        perror(name.c_str());
        return;
    }
    // == 4 (rather than != EOF) so a partially parsed record is not counted.
    while (fscanf(f, "%15s%lf%lf%lf", c, &num, &num, &num) == 4) {
        s = c;
        check += num;
    }
    printf("check = %.2f\n", check);
    fclose(f);
}
// Benchmark reader 3: reads the file line by line with getline and converts
// the numbers with strtod.
//
// Each line is "<label> <a> <b> <c>", optionally preceded by spaces or tabs;
// the third number of every complete line is summed and printed as
// "check = <sum>".
//
// Blank lines and lines that do not contain a label followed by three
// parsable numbers are skipped. A file that cannot be opened yields
// "check = 0.00".
void read3(std::string name) {
    std::string line, s;
    double check = 0;
    std::ifstream f(name);
    while (std::getline(f, line)) {
        // Locate the label: first non-blank character up to the next blank.
        const size_t start = line.find_first_not_of(" \t");
        if (start == std::string::npos) {
            continue; // empty or whitespace-only line
        }
        const size_t stop = line.find_first_of(" \t", start);
        if (stop == std::string::npos) {
            continue; // label only, no numbers
        }
        s = line.substr(start, stop - start);

        // Parse the three numbers in place; c_str() is NUL-terminated, so
        // strtod cannot run past the end of the line.
        const char* cursor = line.c_str() + stop;
        char* next = nullptr;
        double num = 0;
        bool complete = true;
        for (int field = 0; field < 3 && complete; ++field) {
            num = strtod(cursor, &next);
            complete = (next != cursor); // no progress means no number here
            cursor = next;
        }
        if (complete) {
            check += num;
        }
    }
    printf("check = %.2f\n", check);
    // f is closed by its destructor.
}
int main() {
clock_t start, end;
string name("testfile.dat");
start = clock();
write(name);
end = clock();
printf("write : took %.2f seconds\n",double(end-start)/CLOCKS_PER_SEC);
start = clock();
read1(name);
end = clock();
printf("read1 (ifstream) : took %.2f seconds\n",double(end-start)/CLOCKS_PER_SEC);
start = clock();
read2(name);
end = clock();
printf("read2 (fscanf) : took %.2f seconds\n",double(end-start)/CLOCKS_PER_SEC);
start = clock();
read3(name);
end = clock();
printf("read3 (stream+strtod) : took %.2f seconds\n",double(end-start)/CLOCKS_PER_SEC);
}