3

The provided code reads lines from a text file and stores them in a dynamically allocated 2D array called lines (the pattern always repeats after line 32767). It then concatenates specific lines (lines[j], lines[k], lines[m]) and prints the result in an output file. The problem is that my input.txt file is so huge that my PC cannot handle this - 100GB. Because with malloc I allocate so much memory for a 2D array that my hard drive is full. Is there any possibility to change my code that i did not have to allocate so much memory? Thanks for any help!

#include <stdlib.h>
#include <string.h>

#define MAX_LINE_LENGTH 300          // specifies the maximum length of a line in the input file
#define MAX_LINES 500000000          // specifies the maximum number of lines of the input file

int main()
{
   // Open the file
   FILE *fp = fopen("input.txt", "r");
   if (fp == NULL)
   {
       printf("Failed to open the file.\n");
       return 1;
   }


   // Allocate memory for lines
   char **lines = (char **)malloc(MAX_LINES * sizeof(char *));
   if (lines == NULL)
   {
       printf("Failed to allocate memory for lines.\n");
       fclose(fp);
       return 1;
   }

   for (int i = 0; i < MAX_LINES; i++)
   {
       lines[i] = (char *)malloc(MAX_LINE_LENGTH * sizeof(char));
       if (lines[i] == NULL)
       {
           printf("Failed to allocate memory for line %d.\n", i);
           // Free previously allocated lines
           for (int j = 0; j < i; j++)
           {
               free(lines[j]);
           }
           free(lines);
           fclose(fp);
           return 1;
       }
   }

   // Reset file pointer to the beginning
   fseek(fp, 0, SEEK_SET);

   // Read lines from the file and store them in lines array
   int lineIndex = 0;
   char line[MAX_LINE_LENGTH];
   while (fgets(line, sizeof(line), fp) != NULL)
   {
       // Remove newline character from the end of the line
       if (line[strlen(line) - 1] == '\n')
           line[strlen(line) - 1] = '\0';
       strcpy(lines[lineIndex], line);
       lineIndex++;
   }

   fclose(fp);

   // Create output file
   FILE *outputFile = fopen("output.txt", "w");
   if (outputFile == NULL)
   {
       perror("Error creating output file");
       // Free allocated lines
       for (int i = 0; i < MAX_LINES; i++)
       {
           free(lines[i]);
       }
       free(lines);
       return 1;
   }
   int i=1;
   int j=5;         //begin Permno number
   int k=10927;     //begin PRC date
   int m=21849;     //begin DLRETX data
   int count=0;

   int jj=10923;    // end PERNMO data
   int kk=21846;    // end PRC data
   int mm=32767;    // end DLRETX data
   int count_whileLoop=0;


   // Append lines
   for (i,j,k,m;  j<=jj,k<=kk,m<=mm;  i++,j++,k++,m++)
   {
       fprintf(outputFile, "%s%s%s\n", lines[j], lines[k], lines[m]);

   }

   // Free allocated memory
   for (int i = 0; i < MAX_LINES; i++)
   {
       free(lines[i]);
   }
   free(lines);

   // Close the output file
   fclose(outputFile);

   return 0;
}

John Black
  • 41
  • 7
  • 1
    Could you not parse the file, keep a pointer where the line starts, and then just read the required lines, instead of trying to throw 100GB into RAM? – infinitezero Jun 14 '23 at 16:55
  • 4
    You're only processing the first 32767 lines, so why are you bothering to read more than that? – dbush Jun 14 '23 at 16:57
  • A first obvious reduction would be to allocate memory for each line after reading it. Then you only need to allow `strlen+1` bytes instead of maximum len. Then you would consume less memory per line and also not use any memory for lines that are not present in the file. Only storing lines you want to process would be another step as well, as mentioned by dbush already. – Gerhardh Jun 14 '23 at 17:03
  • 1
    Not related: `for (i,j,k,m; j<=jj,k<=kk,m<=mm; i++,j++,k++,m++)` This does not make any sense. If you don't need any initialization for the loop, then you can leave the first field empty. The condition does most likely not do what you might expect. If you want to check that all 3 variables are below the limit, you must use `&&` operator instead of `,`. And that will only work well, if the 3 ranges have the same number of values. – Gerhardh Jun 14 '23 at 17:05
  • 3
    A question to ask yourself: "Do I actually need to _store_ all of the lines at the same time?" See also [What is the XY problem?](https://meta.stackexchange.com/questions/66377/what-is-the-xy-problem) – Chris Jun 14 '23 at 18:05
  • The three groups of lines should contain the same number of lines. PERNMO data is from 5 to 10923 = 10919 lines. PRC data is from 10927 to 21846 = 10920 lines, DLRETX data is from 21849 to 32767 = 10919 lines. something's wrong. – Costantino Grana Jun 14 '23 at 18:07
  • Additional suggestion: consider using `realloc` to let your array of lines "grow" as necessary rather than immediately allocating five hundred _million_ char pointers. That's 4GB of memory used right off the bat. – Chris Jun 14 '23 at 18:16
  • @Gerhardh In this specific case, in which the three ranges should have the same length, the condition `j<=jj,k<=kk,m<=mm` works well, because the first and second conditions values are ignored and only the last one is used by the `for` loop. – Costantino Grana Jun 14 '23 at 18:19
  • You probably should be using a DBMS when dealing with such huge amounts of data, instead of re-inventing the wheel - square-shaped this time. – Lundin Jun 15 '23 at 13:18

3 Answers3

2

A few ideas ...

  1. With large files, it's better to use mmap instead of reading in the data.

  2. If we remember the starting offset of a line and its length [in an array of "line" struct], we don't have to remove the newline [so the file can be mapped R/O].

  3. With this, each line only uses two size_t values for offset/length. (i.e.) We don't have to strdup/malloc the actual data.

  4. We can put line ranges into a struct (called a "segment" in the code below). We can have as many as we want [vs. just the 3 in the problem].

  5. We can stop reading after we hit the maximum line number. This is a major reduction in space (e.g. In your example, the maximum line number is 32767).

  6. Doing fprintf with a format of %s%s%s\n" to create a joined output line is relatively slow. Better to use fputc or fwrite

For some background on the mmap idea, here is an answer of mine: read line by line in the most efficient way platform specific


Here is the reimagined code. It is annotated.

For testing, it allows a -G option to specify the generation of a random input test file.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
#include <sys/stat.h>
#include <sys/mman.h>

#define sysfault(_fmt...) \
    do { \
        printf(_fmt); \
        exit(1); \
    } while (0)

#if DEBUG
#define dbgprt(_fmt...) \
    printf(_fmt)
#else
#define dbgprt(_fmt...) \
    do { } while (0)
#endif

// dynamic list control
typedef struct {
    void *list_base;                    // pointer to data
    size_t list_size;                   // sizeof(element)
    size_t list_count;                  // # of elements used
    size_t list_cap;                    // # of elements allocated (capacity)
} list_t;

typedef struct {
    size_t line_off;                    // absolute offset of line start
    size_t line_len;                    // line length (inc newline)
} line_t;

typedef struct {
    size_t seg_beg;                     // starting line index (inclusive)
    size_t seg_end;                     // ending line index (inclusive)
    size_t seg_cur;                     // current line index (inclusive)
    size_t seg_count;                   // line count
} seg_t;

int opt_a;                              // load _all_ data
size_t opt_G;                           // generate input of this many bytes
int opt_L;                              // maximum line width

list_t *seglist;
size_t segend;                          // maximum line number we have to get

const char *inpbase;                    // pointer to input file data
size_t inpsize;                         // input file size
list_t *linelist;                       // list of lines

// listnew -- get new list
list_t *
listnew(size_t siz)
{
    list_t *list = calloc(1,sizeof(*list));

    list->list_size = siz;

    return list;
}

// listloc -- point to list entry
void *
listloc(list_t *list,size_t idx)
{
    void *ptr = list->list_base;
    ptr += idx * list->list_size;
    return ptr;
}

// listadd -- point to new list entry
void *
listadd(list_t *list)
{

    if (list->list_count >= list->list_cap) {
        list->list_cap += 100;
        list->list_base = realloc(list->list_base,
            list->list_size * list->list_cap);
    }

    void *ptr = listloc(list,list->list_count);
    list->list_count += 1;

    return ptr;
}

// seginit -- initialize line number range segment
void
seginit(size_t beg,size_t end)
{

    seg_t *seg = listadd(seglist);

    seg->seg_beg = beg;
    seg->seg_end = end;
    seg->seg_count = (end + 1) + beg;

    if (end > segend)
        segend = end;

    printf("seginit: seg_beg=%zu seg_end=%zu seg_count=%zu\n",
        seg->seg_beg,seg->seg_end,seg->seg_count);

    seg->seg_cur = seg->seg_beg;
}

// linegeta -- read in all input lines
void
linegeta(size_t maxlines)
{

    const char *lhs = inpbase;
    size_t remsize = inpsize;
    const char *rhs;
    size_t curlen;

    if (maxlines != 0)
        maxlines += 20;

    printf("lineget: remsize=%zu maxlines=%zu\n",remsize,maxlines);

    list_t *list = listnew(sizeof(line_t));
    linelist = list;

    for (;  remsize > 0;  remsize -= curlen) {
        rhs = memchr(lhs,'\n',remsize);
        if (rhs == NULL)
            break;

        // point past newline
        ++rhs;

        // get length of line (inc newline)
        curlen = rhs - lhs;

        // remember starting offset and length of current line
        line_t *line = listadd(list);
        line->line_off = lhs - inpbase;
        line->line_len = curlen;

        // point past newline
        lhs = rhs;

        // stop on line limit
        if (maxlines != 0) {
            if (list->list_count >= maxlines)
                break;
        }
    }

    printf("lineget: list_count=%zu\n",list->list_count);
}

// segput -- output a line segment
size_t
segput(FILE *xfout,seg_t *seg)
{
    size_t outlen = 0;

    do {
        size_t lnoidx = seg->seg_cur;
        int over = (lnoidx > seg->seg_end);
        dbgprt("segput: seg_cur=%zu seg_end=%zu%s\n",
            seg->seg_cur,seg->seg_end,over ? " OVER" : "");

        if (over)
            break;
        ++seg->seg_cur;

        const line_t *line = listloc(linelist,lnoidx);
        const char *buf = inpbase + line->line_off;

        size_t explen = line->line_len - 1;

        outlen = fwrite(buf,1,explen,xfout);
        if (outlen == explen)
            break;

        sysfault("segput: explen=%zu outlen=%zu -- %s\n",
            explen,outlen,strerror(errno));
    } while (0);

    return outlen;
}

// lineput -- create an output line
int
lineput(FILE *xfout)
{
    list_t *list = seglist;
    int outcnt = 0;

    for (int segidx = 0;  segidx < list->list_count;  ++segidx) {
        seg_t *seg = listloc(list,segidx);
        if (segput(xfout,seg))
            ++outcnt;
    }

    if (outcnt > 0)
        fprintf(xfout,"\n");

    return outcnt;
}

// lineputa -- output all joined lines
void
lineputa(FILE *xfout)
{
    size_t outcnt = 0;

    while (1) {
        if (! lineput(xfout))
            break;

        ++outcnt;
        if (outcnt > linelist->list_count)
            sysfault("lineputa: too much output\n");

    }
}

// xrand -- fast random number generator
unsigned int
xrand(void)
{
    static unsigned int seed = 1;

    // this is random in TYPE_0 (linear congruential) mode with the mask
    // removed to get full range numbers
    // this does _not_ produce overlaps
    seed = (seed * 1103515245) + 12345;

    return seed;
}

// genline -- generate a line number
size_t
genline(FILE *xfdst,size_t lno)
{
    char buf[1000];
    char *bp = buf;

    size_t off = ftell(xfdst);
    bp += sprintf(bp," %zu/%zu:",off,lno);

    ssize_t remlen = bp - buf;
    remlen = (opt_L - 20) - remlen;
    if (remlen <= 0)
        remlen = 1;
    remlen = (xrand() % remlen) + 1;

    for (;  remlen > 0;  --remlen, ++bp) {
        int chr = xrand() % 26;
        chr += 'A';
        *bp = chr;
    }

    *bp++ = '\n';

    size_t explen = bp - buf;
    size_t actlen = fwrite(buf,1,explen,xfdst);
    if (actlen != explen)
        sysfault("genline: fwrite lno=%zu -- %s\n",lno,strerror(errno));

    return actlen;
}

// genfile -- generate random input file
void
genfile(const char *file,size_t maxsize,size_t lnomax)
{

    size_t totsize = 0;
    size_t curlen;
    size_t lnocur = 1;

    if (lnomax != 0)
        lnomax += 1000;

    printf("genfile: maxsize=%zu lnomax=%zu\n",maxsize,lnomax);

    FILE *xfdst = fopen(file,"w");

    for (;  (totsize < maxsize);  totsize += curlen, ++lnocur) {
        if ((lnomax != 0) && (lnocur >= lnomax))
            break;
        curlen = genline(xfdst,lnocur);
    }

    fclose(xfdst);

    printf("genfile: totsize=%zu lnocur=%zu\n",totsize,lnocur);
}

int
main(int argc,char **argv)
{

    --argc;
    ++argv;

    for (;  argc > 0;  --argc, ++argv) {
        char *cp = *argv;
        if (*cp != '-')
            break;

        cp += 2;
        switch (cp[-1]) {
        case 'a':
            opt_a = ! opt_a;
            break;
        case 'G':
            opt_G = (*cp != 0) ? strtol(cp,&cp,10) : 100;
            switch (*cp) {
            case 'M':
                opt_G *= 1024 * 1024;
                break;
            default:
                opt_G *= 1024 * 1024 * 1024;
                break;
            }
            break;
        case 'L':
            opt_L = (*cp != 0) ? strtol(cp,&cp,10) : 40;
            break;
        }
    }

    if (opt_L <= 0)
        opt_L = 40;

    // get ranges we want
    seglist = listnew(sizeof(seg_t));
    seginit(5,10923);  // Permno number
    seginit(10927,21846);  // PRC date
    seginit(21849,32767);  // DLRETX data
    if (opt_a)
        segend = 0;

    if (argc != 2)
        sysfault("main: need two args\n");

    const char *inpfile = *argv++;
    const char *outfile = *argv++;

    // DEBUG: generate random test input file (careful!)
    if (opt_G)
        genfile(inpfile,opt_G,segend);

    int fdinp = open(inpfile,O_RDONLY);
    if (fdinp < 0)
        sysfault("main: unable to open '%s' -- %s\n",inpfile,strerror(errno));

    struct stat st;
    fstat(fdinp,&st);
    inpsize = st.st_size;
    printf("st_size=%zu\n",inpsize);

    // map the input file
    void *base = mmap(NULL,inpsize,PROT_READ,MAP_SHARED,fdinp,0);
    if (base == MAP_FAILED)
        sysfault("main: unable to mmap -- %s\n",strerror(errno));
    inpbase = base;

    // find all input line offsets and lengths
    linegeta(segend);

    FILE *xfout = fopen(outfile,"w");
    if (xfout == NULL)
        sysfault("main: unable to open '%s' -- %s\n",inpfile,strerror(errno));

    // output all joined lines
    lineputa(xfout);

    fclose(xfout);

    munmap(base,st.st_size);
    close(fdinp);

    return 0;
}
Craig Estey
  • 30,627
  • 4
  • 24
  • 48
  • A few "good" ideas! – David C. Rankin Jun 15 '23 at 06:31
  • Thank you, i need some time to check your code. Do you use linux? Because i get an error when i compile your code: #include no such file or directory. I think this header did not exist in windows? I use GNU GCC compiler – John Black Jun 15 '23 at 16:37
  • @JohnBlack _Do you use linux?_ Always ;-) You may need to install a development package. On fedora, it is `dnf install glibc-headers`. Under ubuntu, [I think] it is `apt install libc6-dev` – Craig Estey Jun 15 '23 at 18:26
2

Another take on the problem, in line with the original coding style.

Since you are reading lines of unspecified length, a better alternative to gets() is getline() (available here if you are using MSVC). It allows you to read lines without worry of correctly sizing the input buffer.

Given that we are appending, another useful function is defined here to realloc and append. Then things are pretty standard: I define a vector of sections starting points, the section length (so we don't need to worry that they are of the same size) and their number.

Then, as soon as we reach a section we start appending the first section, then the second, then the third, to the same lines. Output is just writing the lines to file.

#include <stdlib.h>
#include <string.h>
#include <stdio.h>

#ifdef _MSC_VER
#include "getline.h"
#endif

const int sections[] = {
    5,      // PERNMO
    10927,  // PRC
    21849,  // DLRETX
};
const int sections_lengths = 10919;
const int nsections = 3;

// Tries to realloc dst to allow appending src.
ssize_t strcat_realloc(char ** dst, const char *src)
{
    if (dst == NULL || src == NULL) {
        errno = EINVAL;
        return -1;
    }

    size_t ndst = *dst ? strlen(*dst) : 0;
    size_t nsrc = strlen(src);
    char *tmp = realloc(*dst, ndst + nsrc + 1);
    if (tmp == NULL) {
        errno = ENOMEM;
        return -1;
    }
    memcpy(tmp + ndst, src, nsrc + 1);
    *dst = tmp;
    return ndst + nsrc;
}

int main(void)
{
    FILE *fp = fopen("input.txt", "r");
    if (fp == NULL) {
        printf("Failed to open the file.\n");
        return 1;
    }

    char **lines = calloc(sections_lengths, sizeof(char*));

    int cur = 0;
    int lineIndex = 0;
    char *line = NULL;
    size_t len = 0;
    while (getline(&line,&len, fp) != -1) {
        line[strcspn(line, "\n")] = 0; // Remove newline character from the end of the line
        if (lineIndex == sections[cur] + sections_lengths) {
            ++cur;
            if (cur == nsections) {
                break;
            }
        }
        if (lineIndex >= sections[cur]) {
            strcat_realloc(lines + lineIndex - sections[cur], line); // TODO: manage errors
        }
        lineIndex++;
    }

    fclose(fp);

    int result = 0;
    // Create output file
    FILE *outputFile = fopen("output.txt", "w");
    if (outputFile == NULL) {
        perror("Error creating output file");
        result = 1;
    }
    else {
        // Append lines
        for (int i = 0; i < sections_lengths; ++i) {
            fprintf(outputFile, "%s\n", lines[i]);
        }
        fclose(outputFile);
    }

    // Free allocated memory
    for (int i = 0; i < sections_lengths; ++i) {
        free(lines[i]);
    }
    free(lines);

    return result;
}
Costantino Grana
  • 3,132
  • 1
  • 15
  • 35
2

You're only processing the first 32768 lines of the file, so you don't need to read any more than that. However, you also said:

the pattern always repeats after line 32767

So I'm assuming what you really want is to process batches of 32768 lines. In either case, that means you only need space to store that many lines at a time, not the whole file all at once. That's only about 10MB, which can easily be set aside in a file scope variable.

So change the code to use a static buffer of that size, eliminating any dynamic allocation, then loop through reading 32768 lines at a time and processing them.

Also:

  • j<=jj,k<=kk,m<=mm doesn't do what you expect. You probably want j<=jj && k<=kk && m<=mm
  • The variables jj, kk, and mm are not very descriptive. Better to use names for the section start/end points.

With the above changes, you now have the following:

#include <stdlib.h>
#include <string.h>

#define MAX_LINE_LENGTH 300      // maximum length of a line in the input file
#define MAX_BATCH 32768          // the number of lines in an input batch

#define PERMO_START 5
#define PRC_START 10927
#define DLRETX_START 21849

#define PERMO_END 10923
#define PRC_END 21846
#define DLRETX_END 32767

char lines[MAX_BATCH][MAX_LINE_LENGTH];

int main()
{
   // Open the input file
   FILE *fp = fopen("input.txt", "r");
   if (fp == NULL)
   {
       printf("Failed to open the file.\n");
       return 1;
   }

   // Create output file
   FILE *outputFile = fopen("output.txt", "w");
   if (outputFile == NULL)
   {
       perror("Error creating output file");
       return 1;
   }

   int done = 0;
   do {
       memset(lines, 0, sizeof lines);

       // Read lines from the file and store them in lines array
       int lineIndex = 0;
       char line[MAX_LINE_LENGTH];
       while ((lineIndex < MAX_BATCH) && (fgets(line, sizeof(line), fp) != NULL))
       {
           // Remove newline character from the end of the line
           if (line[strlen(line) - 1] == '\n')
               line[strlen(line) - 1] = '\0';
           strcpy(lines[lineIndex], line);
           lineIndex++;
       }

       done = lineIndex < MAX_BATCH;

       // Append lines
       for (int j=PERMO_START, k=PRC_START, m=DLRETX_START;
                j<=PERMO_END && k<=PRC_END && m<=DLRETX_END;
                j++,k++,m++)
       {
           fprintf(outputFile, "%s%s%s\n", lines[j], lines[k], lines[m]);
       }
   } while (!done);

   // Close the input and output file
   fclose(fp);
   fclose(outputFile);

   return 0;
}
dbush
  • 205,898
  • 23
  • 218
  • 273