I am writing a program to parse a file to another format. The input file is 20Gb in size so I have turned to C for parsing it, however when my output file reaches 4.3Gb (this is around the 41 second mark) the program gives a segmentation fault.
When tailing the output file it shows me that it has stopped giving output mid writing.
The input file is located at ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/ where the file is zipped as idmapping.dat.gz
The program is expected to parse the whole file and not just give a segmentation error.
int main()
{
char line[256];
char placeholdertoken[256];
char placeholderline[256];
char *token1, *token2, *token3;
char *chdup;
char *tab, *newline, *semicolom, *empty;
FILE *fp;
FILE *fs;
fp = fopen("idmapping.dat", "r");
fs = fopen("parsedidmapping.dat", "w");
if( fp == NULL )
{
perror("Error while opening the file.\n");
exit(EXIT_FAILURE);
}
strcpy(tab,"\t");
strcpy(newline,"\n");
strcpy(semicolom,";");
strcpy(empty,"");
strcpy(placeholdertoken,"");
while (fgets(line, sizeof(line), fp) != NULL)
{
token1 = strtok(line, "\t");
token2 = strtok(NULL, "\t");
token3 = strtok(NULL, "\n");
if (strcmp(token1, placeholdertoken) == 0) {
strcat(placeholderline, token2);
strcat(placeholderline, semicolom);
strcat(placeholderline, token3);
strcat(placeholderline, tab);
}
else {
strcat(placeholderline, newline);
strcpy(placeholdertoken,token1);
fputs(placeholderline, fs);
strcpy(placeholderline, empty);
strcat(placeholderline, token1);
strcat(placeholderline, tab);
strcat(placeholderline, token2);
strcat(placeholderline, semicolom);
strcat(placeholderline, token3);
strcat(placeholderline, tab);
}
}
fclose(fs);
fclose(fp);
return 0;
}