0

I'm trying to apply some previously written code to some DNA sequencing data that I have collected. I would like to match my sequencing reads to a document containing specific barcodes and output the number of hits for each match. I am having some very basic trouble with calling the two files to compare. The code compiles, however, when I try to call the file "MOBYtest" in line 77, the output is 0, which indicates that the file is not being called appropriately. Does anyone have any insight into how I can incorporate the two files into the following code?

####################################################################################################
#### Supplementary File 2, barcodebin.c ############################################################
####
#### Barcode assigned to a gene using standard binary search
####
#### Can Alkan
#### 2010
####################################################################################################
*/
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>

typedef struct spair{
  char *name;
  char *seq;
  int count;
}_spair;


int binSearch(char *seq, int s, int e);
void revcomp(char *s, char *r);

static int maxfunc(const void *p1, const void *p2);

struct spair **seqs;
int seqlen;

int main(int argc, char **argv){
  FILE *fp;
  int i;
  char fname[100];
  int nseq;
  char sequence[100]; char name[100];
  char sname[100];
  char revseq[100];
  int retCode=100000;
  int s, e; int olds;
  int found;
  int crop;
  int len;
  int longest_barcode=0;
  int fastq=0;
  char line[1000];
  int loop_done=0;
  int crop_lim;
  int min_barcode = 5;
  int verbose = 0;
  
  fname[0]=0; nseq = 0; sname[0]=0;
  for (i=1;i<argc;i++){
    if (!strcmp(argv[i], "-i"))
      strcpy(fname, argv[i+1]);
    else if (!strcmp(argv[i], "-s"))
      strcpy(sname, argv[i+1]);
    else if (!strcmp(argv[i], "-n"))
      nseq = atoi(argv[i+1]);
    else if (!strcmp(argv[i], "-m"))
      min_barcode = atoi(argv[i+1]);
    else if (!strcmp(argv[i], "-v"))
      verbose = 1;
  }

  if (fname[0]==0 || nseq==0 || sname[0]==0) return 0;

  fp = fopen(fname, "MOBYtest");
  if ( !fp ) {
    fprintf(stderr, "Could not open file %s\n", fname);
    return 255;
  }
    

  i=0;

  seqs = (struct spair ** ) malloc (sizeof (struct spair *) * nseq);
 
  if (verbose) {
    fprintf(stderr, "Loading  %d sequences ..", nseq);
  }

  while (fscanf(fp, ">%s\n%s\n", name, sequence) > 0 ){
    seqlen=strlen(sequence);
    if (seqlen > longest_barcode) longest_barcode = seqlen;
    if (seqlen>=3){
      seqs[i] = (struct spair * ) malloc (sizeof (struct spair));
      seqs[i]->name = (char *) malloc (sizeof (char) * (strlen (name)+1));
      seqs[i]->seq = (char *) malloc (sizeof (char) * (2*seqlen+1));
      seqs[i]->count = 0;

      strcpy(seqs[i]->seq, sequence);
      strcpy(seqs[i]->name, name);

      i++;
    }
  }

  nseq = i;

  qsort(seqs, nseq, sizeof(struct spair *), maxfunc);

  if (verbose) {
    fprintf(stderr, "  done. %d left. Longest barcode: %d bp\n", nseq, longest_barcode);
  }

  fclose(fp);

  fp = fopen(sname, "r");
  if ( !fp ) {
    fprintf(stderr, "Could not open file %s\n", sname);
    return 255;
  }


  while (fscanf(fp, "%s\n%s\n", name, sequence) > 0){



    if (name[0]=='@'){
      fgets(line, 1000, fp); // + line
      fgets(line, 1000, fp); // quality line
    }


    sequence[longest_barcode]=0;

    strcpy(revseq, sequence);
    s = 0, e = nseq; retCode = 10000; olds=-10;
    found=0;


    crop = 0;
    len = strlen(revseq);

    crop_lim = longest_barcode - min_barcode + 1; // ad hoc

    while (crop<crop_lim){
      revseq[len-crop]=0;

      while (s<e && retCode!=0){
    retCode=binSearch(revseq, s, e);
    if (retCode < 0){e = (s+e)/2;}
    else if (retCode > 0){olds=s; s = (s+e)/2;}
    
    else if (retCode == 0){

      seqs[(s+e)/2]->count++;
      crop = crop_lim+1;
      break;
    }
    
    if (s>=e || s==olds){

      crop++;

      s = 0, e = nseq; retCode = 10000; olds=-10;
      break;

    }
    
      }
    }

  }
 
  if (verbose) {
    fprintf(stderr, "  done.\n");
  }
  fprintf(stdout, "Barcode_Name\tBarcode_Length\tFrequency\n");
  for (i=0;i<nseq;i++)
    fprintf(stdout, "%s\t%d\t%d\n", seqs[i]->name, strlen(seqs[i]->seq), seqs[i]->count);
}

int binSearch(char *seq, int s, int e){
  int med = (s+e)/2;
  
  int ret;

  ret = strcmp(seq, seqs[med]->seq);

  return ret;
}

void revcomp(char *s, char *r){
  int len = strlen(s);
  int  i;

  for (i=0;i<len;i++){
    switch(s[i]){
    case 'A': r[len-i-1] = 'T'; break;
    case 'T': r[len-i-1] = 'A'; break;
    case 'C': r[len-i-1] = 'G'; break;
    case 'G': r[len-i-1] = 'C'; break;
    default:  r[len-i-1] = s[i]; break;
    }
  }
  r[len]=0;
}


static int maxfunc(const void *p1, const void *p2){
  struct spair *a, *b;

  int ret;
  a = *((struct spair **)p1);
  b = *((struct spair **)p2);

  ret = strcmp(a->seq, b->seq);

  return ret;


}
DaggeJ
  • 2,094
  • 1
  • 10
  • 22
  • 1
    "I am having some very basic trouble with calling the two files to compare" <- Please elaborate regarding what you are trying to do that fails. Does this code compile? What happens when you invoke the compiled program? – einpoklum Jan 26 '22 at 19:17
  • Yes, the code compiles. There are two files I would like to call, one is MOBYtest which contains my sequencing reads and the second is the list of barcodes to match it to. I have attempted to call the file "MOBYtest" however, I am only getting an output of 0 when I run the code, which indicates that the file is not being read. I need it to call two files, however, I only see one obvious spot where the code is calling for a file. The code was published on a similar experiment as mine, so I think it should work for my purposes, I just need to start by understanding how to call each file Thanks! – Joe Armstrong Jan 26 '22 at 19:22
  • Please put this extra text into your answer. Also use the preformatted-paragraph styling to show us your command-line, exactly. Then write what the printed output was, and what the return value was (if you know how to check the return value). – einpoklum Jan 26 '22 at 19:25
  • Thank you, einpoklum, I really appreciate your help. I have tried to edit my question to include the information you requested, but I am unsure how to check a return value. – Joe Armstrong Jan 26 '22 at 19:30
  • I think I figured out what's wrong. – einpoklum Jan 26 '22 at 19:35
  • Replace `fprintf(stderr, "Could not open file %s\n", fname);` with `perror(fname)`, and the error message may be much more useful. – William Pursell Jan 26 '22 at 19:43
  • Thank you William Pursell, I replaced the line you suggested, but I still am getting the same output. – Joe Armstrong Jan 26 '22 at 19:46

1 Answers1

1

Your problem is here:

fp = fopen(fname, "MOBYtest");

that's not how you call fopen(). If you look at the man page for fopen() you'll see the signature:

FILE *fopen(const char *pathname, const char *mode);

So, you don't "plug" your filename into the program, you pass it when invoking, i.e. if you compiled this file as my_prog, you would execute something like:

my_prog -i "MOBYtest" -s "my_list_of_barcodes_I_guess" -n 100

and the MOBYtest string would come into fopen() via the pathname parameter. Now, the mode for opening the file is one or "r", "w", a" etc., which correspond to "read", "write", "append" etc. Again, see the man page.


Some more advice:

  • Always turn the warning flags on during compilation. You definitely got some warnings about your code - heed them.
  • Use common/standard facilities for common tasks such as argument parsing - don't write your own parsing code. Specifically, you could use getopt, available on any POSIX system; or maybe try argparse (a library available on GitHub, could be a bit challenging for newbie to set up and include in the build).
einpoklum
  • 118,144
  • 57
  • 340
  • 684