A few ideas ...
With large files, it's better to use mmap
instead of reading in the data.
If we remember the starting offset of a line and its length [in an array of "line" struct
], we don't have to remove the newline [so the file can be mapped R/O].
With this, each line only uses two size_t
values for offset/length. (i.e.) We don't have to strdup/malloc
the actual data.
We can put line ranges into a struct
(called a "segment" in the code below). We can have as many as we want [vs. just the 3 in the problem].
We can stop reading after we hit the maximum line number. This is a major reduction in space (e.g. In your example, the maximum line number is 32767).
Doing fprintf
with a format of %s%s%s\n"
to create a joined output line is relatively slow. Better to use fputc
or fwrite
For some background on the mmap
idea, here is an answer of mine: read line by line in the most efficient way platform specific
Here is the reimagined code. It is annotated.
For testing, it allows a -G
option to specify the generation of a random input test file.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
#include <sys/stat.h>
#include <sys/mman.h>
#define sysfault(_fmt...) \
do { \
printf(_fmt); \
exit(1); \
} while (0)
#if DEBUG
#define dbgprt(_fmt...) \
printf(_fmt)
#else
#define dbgprt(_fmt...) \
do { } while (0)
#endif
// dynamic list control
typedef struct {
void *list_base; // pointer to data
size_t list_size; // sizeof(element)
size_t list_count; // # of elements used
size_t list_cap; // # of elements allocated (capacity)
} list_t;
typedef struct {
size_t line_off; // absolute offset of line start
size_t line_len; // line length (inc newline)
} line_t;
typedef struct {
size_t seg_beg; // starting line index (inclusive)
size_t seg_end; // ending line index (inclusive)
size_t seg_cur; // current line index (inclusive)
size_t seg_count; // line count
} seg_t;
int opt_a; // load _all_ data
size_t opt_G; // generate input of this many bytes
int opt_L; // maximum line width
list_t *seglist;
size_t segend; // maximum line number we have to get
const char *inpbase; // pointer to input file data
size_t inpsize; // input file size
list_t *linelist; // list of lines
// listnew -- get new list
list_t *
listnew(size_t siz)
{
list_t *list = calloc(1,sizeof(*list));
list->list_size = siz;
return list;
}
// listloc -- point to list entry
void *
listloc(list_t *list,size_t idx)
{
void *ptr = list->list_base;
ptr += idx * list->list_size;
return ptr;
}
// listadd -- point to new list entry
void *
listadd(list_t *list)
{
if (list->list_count >= list->list_cap) {
list->list_cap += 100;
list->list_base = realloc(list->list_base,
list->list_size * list->list_cap);
}
void *ptr = listloc(list,list->list_count);
list->list_count += 1;
return ptr;
}
// seginit -- initialize line number range segment
void
seginit(size_t beg,size_t end)
{
seg_t *seg = listadd(seglist);
seg->seg_beg = beg;
seg->seg_end = end;
seg->seg_count = (end + 1) + beg;
if (end > segend)
segend = end;
printf("seginit: seg_beg=%zu seg_end=%zu seg_count=%zu\n",
seg->seg_beg,seg->seg_end,seg->seg_count);
seg->seg_cur = seg->seg_beg;
}
// linegeta -- read in all input lines
void
linegeta(size_t maxlines)
{
const char *lhs = inpbase;
size_t remsize = inpsize;
const char *rhs;
size_t curlen;
if (maxlines != 0)
maxlines += 20;
printf("lineget: remsize=%zu maxlines=%zu\n",remsize,maxlines);
list_t *list = listnew(sizeof(line_t));
linelist = list;
for (; remsize > 0; remsize -= curlen) {
rhs = memchr(lhs,'\n',remsize);
if (rhs == NULL)
break;
// point past newline
++rhs;
// get length of line (inc newline)
curlen = rhs - lhs;
// remember starting offset and length of current line
line_t *line = listadd(list);
line->line_off = lhs - inpbase;
line->line_len = curlen;
// point past newline
lhs = rhs;
// stop on line limit
if (maxlines != 0) {
if (list->list_count >= maxlines)
break;
}
}
printf("lineget: list_count=%zu\n",list->list_count);
}
// segput -- output a line segment
size_t
segput(FILE *xfout,seg_t *seg)
{
size_t outlen = 0;
do {
size_t lnoidx = seg->seg_cur;
int over = (lnoidx > seg->seg_end);
dbgprt("segput: seg_cur=%zu seg_end=%zu%s\n",
seg->seg_cur,seg->seg_end,over ? " OVER" : "");
if (over)
break;
++seg->seg_cur;
const line_t *line = listloc(linelist,lnoidx);
const char *buf = inpbase + line->line_off;
size_t explen = line->line_len - 1;
outlen = fwrite(buf,1,explen,xfout);
if (outlen == explen)
break;
sysfault("segput: explen=%zu outlen=%zu -- %s\n",
explen,outlen,strerror(errno));
} while (0);
return outlen;
}
// lineput -- create an output line
int
lineput(FILE *xfout)
{
list_t *list = seglist;
int outcnt = 0;
for (int segidx = 0; segidx < list->list_count; ++segidx) {
seg_t *seg = listloc(list,segidx);
if (segput(xfout,seg))
++outcnt;
}
if (outcnt > 0)
fprintf(xfout,"\n");
return outcnt;
}
// lineputa -- output all joined lines
void
lineputa(FILE *xfout)
{
size_t outcnt = 0;
while (1) {
if (! lineput(xfout))
break;
++outcnt;
if (outcnt > linelist->list_count)
sysfault("lineputa: too much output\n");
}
}
// xrand -- fast random number generator
unsigned int
xrand(void)
{
static unsigned int seed = 1;
// this is random in TYPE_0 (linear congruential) mode with the mask
// removed to get full range numbers
// this does _not_ produce overlaps
seed = (seed * 1103515245) + 12345;
return seed;
}
// genline -- generate a line number
size_t
genline(FILE *xfdst,size_t lno)
{
char buf[1000];
char *bp = buf;
size_t off = ftell(xfdst);
bp += sprintf(bp," %zu/%zu:",off,lno);
ssize_t remlen = bp - buf;
remlen = (opt_L - 20) - remlen;
if (remlen <= 0)
remlen = 1;
remlen = (xrand() % remlen) + 1;
for (; remlen > 0; --remlen, ++bp) {
int chr = xrand() % 26;
chr += 'A';
*bp = chr;
}
*bp++ = '\n';
size_t explen = bp - buf;
size_t actlen = fwrite(buf,1,explen,xfdst);
if (actlen != explen)
sysfault("genline: fwrite lno=%zu -- %s\n",lno,strerror(errno));
return actlen;
}
// genfile -- generate random input file
void
genfile(const char *file,size_t maxsize,size_t lnomax)
{
size_t totsize = 0;
size_t curlen;
size_t lnocur = 1;
if (lnomax != 0)
lnomax += 1000;
printf("genfile: maxsize=%zu lnomax=%zu\n",maxsize,lnomax);
FILE *xfdst = fopen(file,"w");
for (; (totsize < maxsize); totsize += curlen, ++lnocur) {
if ((lnomax != 0) && (lnocur >= lnomax))
break;
curlen = genline(xfdst,lnocur);
}
fclose(xfdst);
printf("genfile: totsize=%zu lnocur=%zu\n",totsize,lnocur);
}
int
main(int argc,char **argv)
{
--argc;
++argv;
for (; argc > 0; --argc, ++argv) {
char *cp = *argv;
if (*cp != '-')
break;
cp += 2;
switch (cp[-1]) {
case 'a':
opt_a = ! opt_a;
break;
case 'G':
opt_G = (*cp != 0) ? strtol(cp,&cp,10) : 100;
switch (*cp) {
case 'M':
opt_G *= 1024 * 1024;
break;
default:
opt_G *= 1024 * 1024 * 1024;
break;
}
break;
case 'L':
opt_L = (*cp != 0) ? strtol(cp,&cp,10) : 40;
break;
}
}
if (opt_L <= 0)
opt_L = 40;
// get ranges we want
seglist = listnew(sizeof(seg_t));
seginit(5,10923); // Permno number
seginit(10927,21846); // PRC date
seginit(21849,32767); // DLRETX data
if (opt_a)
segend = 0;
if (argc != 2)
sysfault("main: need two args\n");
const char *inpfile = *argv++;
const char *outfile = *argv++;
// DEBUG: generate random test input file (careful!)
if (opt_G)
genfile(inpfile,opt_G,segend);
int fdinp = open(inpfile,O_RDONLY);
if (fdinp < 0)
sysfault("main: unable to open '%s' -- %s\n",inpfile,strerror(errno));
struct stat st;
fstat(fdinp,&st);
inpsize = st.st_size;
printf("st_size=%zu\n",inpsize);
// map the input file
void *base = mmap(NULL,inpsize,PROT_READ,MAP_SHARED,fdinp,0);
if (base == MAP_FAILED)
sysfault("main: unable to mmap -- %s\n",strerror(errno));
inpbase = base;
// find all input line offsets and lengths
linegeta(segend);
FILE *xfout = fopen(outfile,"w");
if (xfout == NULL)
sysfault("main: unable to open '%s' -- %s\n",inpfile,strerror(errno));
// output all joined lines
lineputa(xfout);
fclose(xfout);
munmap(base,st.st_size);
close(fdinp);
return 0;
}