Try scanning through your big file without storing it all in memory, keeping just one record at a time in a local variable:
void csvReader(FILE *f) {
    T_structCDT c;
    int count = 0;

    c.string = malloc(1000);
    while (fscanf(f, "%d,%d,%d,%999[^,],%d\n", &c.a, &c.b, &c.vivienda, c.string, &c.d) == 5) {
        // nothing for now, just count the records
        count++;
    }
    free(c.string);
    printf("%d records parsed\n", count);
}
Measure the time taken by this simplistic parser:
If it is quick enough, perform the selection tests and output the few matching records one at a time as they are found during the parse phase. The extra time for these steps should be fairly small, since only a few records match.
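For illustration, here is a sketch of that combined scan-and-select pass, with coarse CPU timing added via clock(); the struct layout and the test on the a and d fields are placeholders, not part of the original code:
#include <stdio.h>
#include <time.h>

/* Sketch only: selection folded into the scan loop, timed with clock().
 * Substitute your actual record layout and selection criteria. */
void csvSelect(FILE *f) {
    struct { int a, b, c, d; char s[1000]; } c;
    int count = 0, matches = 0;
    clock_t start = clock();

    while (fscanf(f, "%d,%d,%d,%999[^,],%d\n",
                  &c.a, &c.b, &c.c, c.s, &c.d) == 5) {
        count++;
        if (c.a == 100 && c.d > 50) {   /* placeholder selection test */
            matches++;
            printf("%d,%d,%d,%s,%d\n", c.a, c.b, c.c, c.s, c.d);
        }
    }
    fprintf(stderr, "%d matches in %d records, %.3f CPU seconds\n",
            matches, count, (double)(clock() - start) / CLOCKS_PER_SEC);
}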
If the time is too long, you need a fancier CSV parser, which is a lot of work, but it can be done and made fast, especially if you can assume that all records in your input file use this simple format. This is too broad a subject to detail here, but the achievable speed should be close to that of cat csvfile > /dev/null or grep a_short_string_not_present csvfile.
On my system (an average Linux server with a regular hard disk), it takes less than 20 seconds to parse 40 million lines totalling 2 GB from a cold start, and less than 4 seconds the second time: disk I/O seems to be the bottleneck.
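On Linux, the cold-start case can be reproduced by dropping the page cache between runs (requires root):
$ sync
$ echo 3 | sudo tee /proc/sys/vm/drop_caches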
If you need to perform this selection very often, you should probably convert the data to a different format, possibly a database system. If the scan is only performed occasionally on data whose format is fixed, faster storage such as an SSD will help, but don't expect miracles.
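If converting is an option, even a one-time pass that rewrites the CSV as fixed-size binary records can pay off: later scans fread() whole blocks and test fields directly, with no text parsing at all. A minimal sketch, assuming the string field can be truncated to a fixed width (the 64-byte limit and the scan function are illustrative):
#include <stdio.h>

/* Fixed-size binary record: a conversion pass writes these with
 * fwrite(), later scans read them back in blocks. Assumes the string
 * field fits in (or may be truncated to) 64 bytes. */
typedef struct {
    int a, b, c, d;
    char s[64];
} record_t;

long binscan(FILE *f) {
    record_t rec[1024];
    size_t n;
    long count = 0, matches = 0;

    while ((n = fread(rec, sizeof(*rec), sizeof(rec) / sizeof(*rec), f)) > 0) {
        for (size_t i = 0; i < n; i++) {
            count++;
            if (rec[i].a == 100 && rec[i].d > 50)   /* placeholder test */
                matches++;
        }
    }
    fprintf(stderr, "%ld matches in %ld records\n", matches, count);
    return matches;
}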
EDIT: To put words into action, I wrote a simple generator and an extractor:
Here is a simple program to generate CSV data:
#include <stdio.h>
#include <stdlib.h>
const char *dict[] = {
    "Lorem", "ipsum", "dolor", "sit", "amet;", "consectetur", "adipiscing", "elit;",
    "sed", "do", "eiusmod", "tempor", "incididunt", "ut", "labore", "et",
    "dolore", "magna", "aliqua.", "Ut", "enim", "ad", "minim", "veniam;",
    "quis", "nostrud", "exercitation", "ullamco", "laboris", "nisi", "ut", "aliquip",
    "ex", "ea", "commodo", "consequat.", "Duis", "aute", "irure", "dolor",
    "in", "reprehenderit", "in", "voluptate", "velit", "esse", "cillum", "dolore",
    "eu", "fugiat", "nulla", "pariatur.", "Excepteur", "sint", "occaecat", "cupidatat",
    "non", "proident;", "sunt", "in", "culpa", "qui", "officia", "deserunt",
    "mollit", "anim", "id", "est", "laborum.",
};
// Generate `lines` CSV lines according to `fmt`: 'd' emits a signed
// integer, 'u' an unsigned integer, 's' a few words from dict[]; an
// optional digit prefix sets the range (or word count), and any other
// character is copied literally.
int csvgen(const char *fmt, long lines) {
    char buf[1024];

    if (*fmt == '\0')
        return 1;

    while (lines > 0) {
        size_t pos = 0;
        int count = 0;
        for (const char *p = fmt; *p && pos < sizeof(buf); p++) {
            switch (*p) {
            case '0': case '1': case '2': case '3': case '4':
            case '5': case '6': case '7': case '8': case '9':
                count = count * 10 + *p - '0';
                continue;
            case 'd':   // signed integer in [-(count-1), count-1]
                if (!count) count = 101;
                pos += snprintf(buf + pos, sizeof(buf) - pos, "%d",
                                rand() % (2 * count - 1) - count + 1);
                count = 0;
                continue;
            case 'u':   // unsigned integer in [0, count-1]
                if (!count) count = 101;
                pos += snprintf(buf + pos, sizeof(buf) - pos, "%u",
                                rand() % count);
                count = 0;
                continue;
            case 's':   // 1 to count space-separated words from dict[]
                if (!count) count = 4;
                count = rand() % count + 1;
                while (count-- > 0 && pos < sizeof(buf)) {
                    pos += snprintf(buf + pos, sizeof(buf) - pos, "%s ",
                                    dict[rand() % (sizeof(dict) / sizeof(*dict))]);
                }
                if (pos < sizeof(buf)) {
                    pos--;  // drop the trailing space
                }
                count = 0;
                continue;
            default:    // literal character (e.g. the ',' separators)
                buf[pos++] = *p;
                count = 0;
                continue;
            }
        }
        if (pos < sizeof(buf)) {
            buf[pos++] = '\n';
            fwrite(buf, 1, pos, stdout);
            lines--;
        }
    }
    return 0;
}
int main(int argc, char *argv[]) {
    if (argc < 3) {
        fprintf(stderr, "usage: csvgen format number\n");
        return 2;
    }
    return csvgen(argv[1], strtol(argv[2], NULL, 0));
}
Here is an extractor with 3 different parsing methods:
#include <stdio.h>
#include <stdlib.h>
// Parse a non-negative decimal integer, leaving *pp pointing past the
// last digit. Relies on unsigned wrap-around to stop at non-digits;
// no overflow check, for speed.
static inline unsigned int getuint(const char *p, const char **pp) {
    unsigned int d, n = 0;

    while ((d = *p - '0') <= 9) {
        n = n * 10 + d;
        p++;
    }
    *pp = p;
    return n;
}
int csvgrep(FILE *f, int method) {
    struct {
        int a, b, c, d;
        int spos, slen;
        char s[1000];
    } c;
    int count = 0, line = 0;

    // select 500 out of 43M
#define select(c)  ((c).a == 100 && (c).b == 100 && (c).c > 74 && (c).d > 50)

    if (method == 0) {
        // default method: fscanf
        while (fscanf(f, "%d,%d,%d,%999[^,],%d\n", &c.a, &c.b, &c.c, c.s, &c.d) == 5) {
            line++;
            if (select(c)) {
                count++;
                printf("%d,%d,%d,%s,%d\n", c.a, c.b, c.c, c.s, c.d);
            }
        }
    } else
    if (method == 1) {
        // use fgets and simple parser
        char buf[1024];
        while (fgets(buf, sizeof(buf), f)) {
            char *p = buf;
            int i;
            line++;
            c.a = strtol(p, &p, 10);
            p += (*p == ',');
            c.b = strtol(p, &p, 10);
            p += (*p == ',');
            c.c = strtol(p, &p, 10);
            p += (*p == ',');
            for (i = 0; *p && *p != ',' && i < (int)sizeof(c.s) - 1; p++) {  // bounded copy
                c.s[i++] = *p;
            }
            c.s[i] = '\0';
            p += (*p == ',');
            c.d = strtol(p, &p, 10);
            if (*p != '\n') {
                fprintf(stderr, "csvgrep: invalid format at line %d\n", line);
                continue;
            }
            if (select(c)) {
                count++;
                printf("%d,%d,%d,%s,%d\n", c.a, c.b, c.c, c.s, c.d);
            }
        }
    } else
    if (method == 2) {
        // use fgets and hand coded parser, positive numbers only, no string copy
        char buf[1024];
        while (fgets(buf, sizeof(buf), f)) {
            const char *p = buf;
            line++;
            c.a = getuint(p, &p);
            p += (*p == ',');
            c.b = getuint(p, &p);
            p += (*p == ',');
            c.c = getuint(p, &p);
            p += (*p == ',');
            c.spos = p - buf;               // keep the string field in place
            while (*p && *p != ',') p++;
            c.slen = p - buf - c.spos;
            p += (*p == ',');
            c.d = getuint(p, &p);
            if (*p != '\n') {
                fprintf(stderr, "csvgrep: invalid format at line %d\n", line);
                continue;
            }
            if (select(c)) {
                count++;
                printf("%d,%d,%d,%.*s,%d\n", c.a, c.b, c.c, c.slen, buf + c.spos, c.d);
            }
        }
    } else {
        fprintf(stderr, "csvgrep: unknown method: %d\n", method);
        return 1;
    }
    fprintf(stderr, "csvgrep: %d records selected from %d lines\n", count, line);
    return 0;
}
int main(int argc, char *argv[]) {
    if (argc > 2 && strtol(argv[2], NULL, 0)) {
        // non zero second argument -> set a 1M I/O buffer
        setvbuf(stdin, NULL, _IOFBF, 1024 * 1024);
    }
    return csvgrep(stdin, argc > 1 ? strtol(argv[1], NULL, 0) : 0);
}
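Both programs build with any C99 compiler; optimization matters for the hand-coded parser, so assume optimized builds for the figures below (the exact compiler and flags are my assumption):
$ cc -O2 -o csvgen csvgen.c
$ cc -O2 -o csvgrep csvgrep.c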
And here are some comparative benchmark figures:
$ time ./csvgen "u,u,u,s,u" 43000000 > 43m
real 0m34.428s user 0m32.911s sys 0m1.358s
$ time grep zz 43m
real 0m10.338s user 0m10.069s sys 0m0.211s
$ time wc -lc 43m
43000000 1195458701 43m
real 0m1.043s user 0m0.839s sys 0m0.196s
$ time cat 43m > /dev/null
real 0m0.201s user 0m0.004s sys 0m0.195s
$ time ./csvgrep 0 < 43m > x0
csvgrep: 508 records selected from 43000000 lines
real 0m14.271s user 0m13.856s sys 0m0.341s
$ time ./csvgrep 1 < 43m > x1
csvgrep: 508 records selected from 43000000 lines
real 0m8.235s user 0m7.856s sys 0m0.331s
$ time ./csvgrep 2 < 43m > x2
csvgrep: 508 records selected from 43000000 lines
real 0m3.892s user 0m3.555s sys 0m0.312s
$ time ./csvgrep 2 1 < 43m > x3
csvgrep: 508 records selected from 43000000 lines
real 0m3.706s user 0m3.488s sys 0m0.203s
$ cmp x0 x1
$ cmp x0 x2
$ cmp x0 x3
As you can see, specializing the parsing method provides a gain of almost 50%, and hand coding the integer conversion and string scanning gains another 50%. Using a 1 MB buffer instead of the default size offers only a marginal gain of about 0.2 seconds.
To further improve the speed, you can use mmap() to bypass the I/O streaming interface and make stronger assumptions about the file contents. In the above code, invalid formats are still handled gracefully, but you could remove some tests and shave an extra 5% from the execution time at the cost of reliability.
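As a rough sketch of that approach (POSIX-specific; the function name is illustrative), mapping the file read-only lets you walk the raw bytes with no stdio calls at all. Only the scanning skeleton is shown; the field parsing would go where the newlines are found:
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/stat.h>

/* Sketch: map the whole file read-only and scan the bytes directly,
 * bypassing the stdio buffering layer entirely. POSIX only. */
long scan_mmap(const char *path) {
    struct stat st;
    long lines = 0;
    int fd = open(path, O_RDONLY);

    if (fd < 0)
        return -1;
    if (fstat(fd, &st) < 0 || st.st_size == 0) {
        close(fd);
        return -1;
    }
    char *map = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
    close(fd);
    if (map == MAP_FAILED)
        return -1;
    for (off_t i = 0; i < st.st_size; i++) {
        if (map[i] == '\n')     /* a real version would parse fields here */
            lines++;
    }
    munmap(map, st.st_size);
    return lines;
}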
The above benchmark was performed on a system with an SSD drive, and the file 43m fits in RAM, so the timings do not include much disk I/O latency. grep is surprisingly slow, and increasing the search string length makes it even worse... wc -lc sets a target for scanning performance, still a factor of 4 away, but cat seems out of reach.