-2

I have a struct of strings with 3 million lines. I am trying to sort the file like:

aaaaa

aaaab

aaacc

And so on.

I was trying to do bubblesort. I tried it with 10 lines and it worked, but when I tried the whole 3 million lines file it took over 30 minutes and was still processing. I decided to try quicksort. However, I am running into a problem where it says:

expected 'const char **' but argument is of type 'struct lines *'

How can I fix this? Here is what I am doing:

#include<stdio.h>
#include<string.h>
#include <stdlib.h>
#include <math.h>
#include <stdbool.h>
#include <ctype.h>

void swap_str_ptrs(char const **arg1, char const **arg2)
{
    const char *tmp = *arg1;
    *arg1 = *arg2;
    *arg2 = tmp;
}

void quicksort_strs(char const *args[], unsigned int len)
{
    unsigned int i, pvt=0;

    if (len <= 1)
        return;

    // swap a randomly selected value to the last node
    swap_str_ptrs(args+((unsigned int)rand() % len), args+len-1);

    // reset the pivot index to zero, then scan
    for (i=0;i<len-1;++i)
    {
        if (strcmp(args[i], args[len-1]) < 0)
            swap_str_ptrs(args+i, args+pvt++);
    }

    // move the pivot value into its place
    swap_str_ptrs(args+pvt, args+len-1);

    // and invoke on the subsequences. does NOT include the pivot-slot
    quicksort_strs(args, pvt++);
    quicksort_strs(args+pvt, len - pvt);
}

void main()
{
    FILE *dnaFile=fopen("hs_alt_HuRef_chr2.fa", "r"); //file im reading
    typedef struct lines
    {
        char lines[100]; //size of each line
    } lines;
    int i = 0;

    char buf[256];
    static lines myDNA[3354419]; //creates the 3m spots for all lines
    while (fgets (buf, sizeof(buf), dnaFile))
    {
        if (i > 0)
            strcpy(myDNA[i].lines, buf); //inserting each line into the struct array

        i++;
    }

    // this is the bubblesort approach, works, but it takes too lon
    /**int a;
    int total;
    char temp[150];
    char report[100][150];

    for(a=0; a<3354419; a++)
    {
        for(total=a+1; total<=3354419; total++)
        {
            if(strcmp(myDNA[a].lines,myDNA[total].lines)>0)
            {
                strcpy(temp,myDNA[a].lines);
                strcpy(myDNA[a].lines,myDNA[total].lines);
                strcpy(myDNA[total].lines,temp);
            }
        }
    }*/

    quicksort_strs(myDNA, 3354419); //attempt at quicksort, which crashes

}

USING QSORT

#include<stdio.h>
#include<string.h>
#include <stdlib.h>
#include <math.h>
#include <stdbool.h>
#include <ctype.h>

int compare_function(const void *a,const void *b) {
return (strcmp((char *)a,(char *)b));
}

void main()
{
    FILE *dnaFile=fopen("hs_alt_HuRef_chr2.fa", "r"); //file with 3 million lines
    typedef struct lines
    {
        char lines[100];
    } lines;
    int i = 0;

    char buf[256];
    static lines myDNA[3354419]; // array holding the 3 million lines
    while (fgets (buf, sizeof(buf), dnaFile))
    {
        if (i > 0)
            strcpy(myDNA[i].lines, buf); //putting each line into array

        i++;
    }

    qsort(myDNA, 1000, 100, compare_function); //qsort works for first 1k lines, after, messed up

    int a;
    for (a = 0; a < 1000; a++){
    printf("%s", myDNA[a].lines); //printing lines
    }

}
user3386436
  • 39
  • 1
  • 6
  • 1
    Use the standard lib http://stackoverflow.com/questions/1787996/c-library-function-to-do-sort – David Heffernan Jul 20 '14 at 03:59
  • @DavidHeffernan is there a way to use qsort for 3 million lines? The qsort works for about the first 1k lines, but then the outputs starts becoming trash and it doesn't work. – user3386436 Jul 20 '14 at 04:38
  • 1
    Pretty sure the standard qsort works. Show us your code for using it. – Dwayne Towell Jul 20 '14 at 04:52
  • 1
    It may be undefined behavior to copy a maximum of 256 characters (`sizeof(buf)`) into a character array of 100 bytes (`sizeof(lines)`). Also, I suggest making `myDNA` as a 2D array of characters: `char myDNA[3354419][100];` unless you have a reason to use a struct. – millinon Jul 20 '14 at 05:29
  • @millinon I took your advice to change myDNA to a 2D array, however, if I change the size of buf from 256 to 100, it crashes. In fact, it crashes unless I make it above 200. Any ideas on how to get around that? – user3386436 Jul 20 '14 at 05:44
  • 1
    Well, if you have lines in the file that are longer than 100 (or 200?) characters, then it makes sense to have `char myDNA[3354419][256]`, so that each line has 256 bytes available. That means that the same length will have to be updated in your `qsort` call. As an aside, that's a pretty good use case for a preprocessor directive: `#define LINE_LEN 256`, so that if you want to change the length, you only need to change it in one place. – millinon Jul 20 '14 at 05:52
  • @millinon I found out the problem. What happens is that the first line of the file is about 200 characters, which should be ignored, then the rest of the file is 80 characters on each line, which should be sorted. Is there a problem with storing 80 characters in an array size 200? – user3386436 Jul 20 '14 at 06:00
  • 1
    Since `strcmp` uses `strlen`, it shouldn't matter if there are unused bytes after a line. However, since that's all just a workaround to discard the first line of the file, it might make more sense to just do one `fgets` to get ~200 characters, then start reading into the array. After all, you only start writing into `myDNA[1]`, but `qsort` will expect `myDNA[0]` to be part of the data to be sorted. – millinon Jul 20 '14 at 06:10
  • @millinon I took your advice to use fgets to get the first line, and now the second line of the file is into myDNA[0]. However, the program is still crashes if I try to make buf[80] or even buf[90]. Also, qsort is still giving me trash for the first few lines when I use buf[200]. – user3386436 Jul 20 '14 at 06:31
  • Your code is obviously wrong. Try debugging. – David Heffernan Jul 20 '14 at 06:34
  • @DavidHeffernan Your comment is very helpful, thanks. Have a nice day. – user3386436 Jul 20 '14 at 06:38
  • @millinon Ignore what I previously have said. First of all, the code actually works, the problem was that the first line was repeated throughout the whole text file, that is why it was printing the first line and I thought it was an error. Second, thats why I kept getting errors when I tried to lower to [80], which would crash when the ~200 line would come. It pretty much works now, if you want leave an answer and I'll pick your answer. Thanks. – user3386436 Jul 20 '14 at 06:42
  • See, my advice got it done. You did some debugging and solved your problem. Since only you have your data, obviously only you can debug the problem. – David Heffernan Jul 20 '14 at 07:20
  • 'Debug it' is pretty vague advice here, considering that the program was compiling and running without obvious errors. If someone's code is segfaulting, it's easy to tell them to compile it with -g and gdb it to find a line number. Otherwise, debugging is going to mean printing out a bunch of junk and manually searching through it to find the mistake. I'd say that asking for help is the more efficient way to solve the problem, in this case. – millinon Jul 20 '14 at 09:09
  • @millinon Learning basic debugging skills will be very useful. And indeed some debugging did the trick. – David Heffernan Jul 20 '14 at 21:31

1 Answers1

0

I have modified the question code a bit. From my testing, the following code appears to function as needed (as stated by the question).

#include <stdio.h>  // printf(), fprintf(), fclose(), feof(), fgets(), fopen()
#include <string.h> // memset(), strcmp(), strdup()
#include <stdlib.h> // malloc(), qsort(), free()
#include <errno.h>  // errno, ENOMEM, EIO

#define MAX_FILE_LINES 3354419
#define MAX_LINE_SIZE  (255+1)

int compare_function(const void *a, const void *b)
   {
   return(strcmp(*(const char **)a, *(const char **)b));
   }

int main(int argC, char *argV[])
   {
   int    rCode    = 0;
   char  *filePath = "hs_alt_HuRef_chr2.fa";
   FILE  *dnaFile  = NULL;
   char **myDNA    = NULL;
   int    myDNAcnt = 0;
   int    index;

   /** Allow user to specify the file path on the command-line. **/
   if(argC > 1)
      filePath=argV[1];

   /** Allocate an array (to hold the 3 million lines). **/
   errno=0;
   myDNA=malloc(MAX_FILE_LINES * sizeof(*myDNA));
   if(NULL == myDNA)
      {
      rCode=errno?errno:ENOMEM;
      fprintf(stderr, "malloc() failed. errno:%d\n", errno);
      goto CLEANUP;
      }
   memset(myDNA, 0, MAX_FILE_LINES * sizeof(*myDNA));

   /** Open the file. **/
   errno=0;
   dnaFile=fopen(filePath, "r");
   if(NULL == dnaFile)
      {
      rCode=errno;
      fprintf(stderr, "fopen() failed to open \"%s\". errno:%d\n", filePath, errno);
      goto CLEANUP;
      }

   /** Read the file into the array, allocating dynamic memory for each line. **/
   for(myDNAcnt=0; myDNAcnt < MAX_FILE_LINES; ++myDNAcnt)
      {
      char buf[MAX_LINE_SIZE];
      char *cp;

      if(NULL == fgets(buf, sizeof(buf), dnaFile))
         {
         if(feof(dnaFile))
            break;

         rCode=EIO;
         fprintf(stderr, "fgets() failed.\n");
         goto CLEANUP;
         }

      cp=strchr(buf, '\n');
      if(cp)
         *cp='\0';

      errno=0;
      myDNA[myDNAcnt] = strdup(buf);
      if(NULL == myDNA[myDNAcnt])
         {
         rCode=errno;
         fprintf(stderr, "strdup() failed. errno:%d\n", errno);
         goto CLEANUP;
         }
      }

   /** Sort the array. **/
   qsort(myDNA, myDNAcnt, sizeof(*myDNA), compare_function);

   /** Print the resulting sorted array. **/
   for(index=0; index < myDNAcnt; index++)
      {
      printf("%8d: %s\n",index,  myDNA[index]); //printing lines
      }

CLEANUP:

   /** Close the file. **/
   if(dnaFile)
      fclose(dnaFile);

   /** Free the array. **/
   if(myDNA)
      {
      for(index=0; index < myDNAcnt; index++)
         {
         free(myDNA[index]);
         }

      free(myDNA);
      }

   return(rCode);
   }
Mahonri Moriancumer
  • 5,993
  • 2
  • 18
  • 28
  • Goto in "C" ? is a bad practise –  Dec 04 '16 at 02:09
  • @delive, in C, the goto statement is the ***BEST*** way to handle error conditions, and is very popular in state machines. Perhaps whoever told you that goto is a bad practice has mislead you; at least in this case. – Mahonri Moriancumer Dec 05 '16 at 15:57
  • I'm confusing now http://stackoverflow.com/questions/46586/goto-still-considered-harmful –  Dec 06 '16 at 10:44