-4

Stuck at if block right below //step 5, the issue is that the code will not progress into or after the given if block. I need to figure out how to get this particular issue settled before starting the task of generating parallel code. If you run the code you will see one print statement that indicates the value of "one" and another two for "i" and "j". After the if block begins, none of the other print statements are hit. As a result I am quite stuck, I am aware that this is a specific issue, however, I cannot seem to determine it's cause.

Any help is appreciated! Thanks in advance!

Input file sample.

>386.fasta.screen.Contig1
GAGTTTGATCCTGGCTCAGAATCAACGCTGGCGGCGCGCTTAACACATGC
AAGTCGAACGAGAAAGTGGAGCAATCCATGAGTACAGTGGCGTACGGGTG
AGTAACACGTGGGTAATCTACCTCTTAGTGGGGAATAACTTTGGGAAACC
GAAGCTAATACCGCATAAGCTCGAGAGAGGAAAGCAGCAATGCGCTGAGA
GAGGAGCCCGCGGCCGATTAGCTAGTTGGCAGGGTAAAAGCCTACCAAGG
CAGAGATCGGTAGCCGGCCTGAGAGGGCACACGGCCACACTGGCACTGAA
ACACGGGCCAGACTCCTACGGGAGGCAGCAGTGGGGAATCTTGCACAATG
GGGGCAACCCTGATGCAGCGACGCCGCGTGAGCGATGAAGCCCTTCGGGG
TGTAAAGCTCTTTCGTCAGGGAAGATAGTGACGGTACCTGGAGAAGCAGC
TGCGGCTAACTACGTGCCAGCAGCCGCGGTAATACGTAGGCAGCGAGCGT
TGTTCGGAGTTACTGGGCGTAAAGGGTGTGTAGGCGGTTGTTTAAGTTTG
GTGTGAAATCTCCCGGCTCAACTGGGAGGGTGCGCCGAATACTGAGCGAC
TAGAGTGCGGGAGAGGAAAGTGGAATTCCTGGTGTAGCGGTGAAATGCGT
AGATATCAGGAGGAACACCGGTGGTGTAGACGGCTTTCTGGACCGTAACT
GACGCTGAGACACGAAAGCGTGGGTAGCAAACAGGATTAGATACCCTGGT
AGTCCACGCCCTAAACGATGCATATTTGGTGTGGGCAGTTCATTCTGTCC
GTGCCGGAGCTAACGCGTTAAATATGCCGCCTGGGGAGTACAGTCGCAAG
GCTGAAACTCAAAGGAATTGACGGGGGCCCGCACAAGCGGTGGAGCATGT
GGTTTAATTCGACGCAACGCGAAGAACCTTACCTGGGCTCGAACGGCTTC
CCAACGCCGGTAGAAATATCGGTACCCCGCAAGGGGGTGGAATCGAGGTG
CTGCATGGCTGTCGTCAGCTCGTGTCGTGAGATGTTGGGTTAAGTCCCGC
AACGAGCGCAACCCTTGTCCTGTGTTGCCATGCCGCAAGGCGGCACTCGC
AGGAGACCGCCAGCGATAAGCTGGAGGAAGGTGGGGATGACGTCAAGTCC
TCATGGCCTTTATGTCCAGGGCTACACACGTGCTACAATGGCCGGTACAA
AGCGTCGCTAACCTGCGAAGGGGAGCCAATCGCAAAAAACCGGTCTCAGT
TCGGATTGCAGGCTGCAACCCGCCTGCATGAAGCTGGAATCGCTAGTAAT
GGCAGATCAGCACGCTGCCGTGAATACGTTCCCGGGCCTTGTACACACAT

/********************************
Based on code by:
Lorenzo Seidenari (sixmoney@virgilio.it)
*********************************/

#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <ctype.h>

#define MAX_SEQUENCE_LENGTH 100000

int  n; 
int  m;
int levenshtein_distance(char *s,char*t);
int minimum(int a,int b,int c);

//-----------------------------------------------------------------------------
void cleanString(char string[]) {
  //Removes all spaces from string pointed to by "string", converts characters
  //to uppercase, and deletes a terminating newline character.
    int i, current;
    int length = strlen(string);

    current = 0;
    for(i=0;i<length;i++) {
        if(string[i]=='\n') {
            string[current++] = '\0';
            break;
        }
        else if(string[i]!=' ') {
            string[current++] = toupper(string[i]);
        }
    }
}
//-----------------------------------------------------------------------------
int importFASTA(char *filename, char *sequence) {
  //Reads a file, located at path specified by "filename", containing a FASTA
  //sequence. It finds the first full, complete sequence in the file, stores
  //it in "sequence", and returns the length of the sequence, or -1 on failure.
    FILE *fastaFile;
    char input[256];
  int readFlag; //set to 1 once a sequence has been read in
  int length;

  //open the file
  if((fastaFile = fopen(filename, "r")) == NULL) {
    return -1;
  }

  sequence[0] = '\0';

  //read the full first sequence, discarding unnecessary headers
  readFlag=0;
  length = 0;
  while(fgets(input,256,fastaFile)!=NULL) {
    //is it a header or a comment?
    if(input[0]=='>' || input[0]==';') {
        if(readFlag) break;
        else continue;
    }
    else readFlag = 1;

    cleanString(input);
    length += strlen(input);

    strncat(sequence,input,MAX_SEQUENCE_LENGTH-length - 1);
  }
  //Add a terminatng null character, just in case
  sequence[length] = '\0';

  fclose(fastaFile);
  return length;
}


/****************************************/
/*Implementation of Levenshtein distance*/
/****************************************/

__global__ void levenshtein_distance(char *s,char*t, int one, int two)
/*Compute levenshtein distance between s and t*/
{
    //Step 1
    int k,i,j,cost,*d;
    int distance = 0;
    if(one!=0&&two!=0)
    {
        d=(int *)malloc((sizeof(int))*(two+1)*(one+1));
        two++;
        one++;
        //Step 2    
        for(k=0;k<one;k++){
            d[k]=k;
        }
        for(k=0;k<two;k++){
            d[k*one]=k;
        }
        //Step 3 and 4  
        for(i=1;i<one;i++){
            for(j=1;j<two;j++)
            {
                //Step 5
                printf("%d  %d  %d\n", one, i, j);
                if(s[i-1]==t[j-1]){
                    cost=0;
                    printf("%d  %d  %d\n", one, i, j);
                }
                else{
                    cost=1;
                    printf("%d  %d  %d\n", one, i, j);
                }
                printf("%d  %d  %d\n", one, i, j);
                //Step 6
                int min = d[(j-1)*one+i]+1;
                if (d[j*one+i-1]+1 < min)
                    min = d[j*one+i-1]+1;
                if (d[(j-1)*one+i-1]+cost < min)
                    min = d[(j-1)*one+i-1]+cost;
                d[j*one+i] = min;        
            }
            distance=d[one*two-1];
            free(d);
            printf("%d\n", distance);
        }
    }
        else
            printf ("-1");
}

int main(int argc, char *argv[]) {
    char A[MAX_SEQUENCE_LENGTH+1];
    char B[MAX_SEQUENCE_LENGTH+1];

    if(argc < 3) {
        printf("Usage: new_edit_distance <sequence1> <sequence2>\n");
        printf("<sequence1>: file containing the first sequence, FASTA format\n");
        printf("<sequence2>: file containing the second sequence, FASTA format\n");
        return EXIT_FAILURE;
    }

    n = importFASTA(argv[1],A);
    m = importFASTA(argv[2],B);

    levenshtein_distance<<<1, 1>>>(A,B, n, m);
    cudaDeviceSynchronize();
    printf ("%s\n", cudaGetErrorString(cudaGetLastError()));

    return EXIT_SUCCESS;
}
  • Could you pare down the code to what's specifically relevant to your question? – Scott Hunter Nov 11 '14 at 01:58
  • they are not returning anything :-( –  Nov 11 '14 at 02:17
  • 1
    anon, in spite of the request by @ScottHunter, we don't want to replace an [MCVE](http://stackoverflow.com/help/mcve) with a little snippet. Probably (I hope), Scott was asking for a more condensed MCVE. I'd almost always rather have an MCVE than a snippet. And SO [explicitly requests that as well](http://stackoverflow.com/help/on-topic). I suggest at a minimum, roll back your edit. Then, if possible, condense the MCVE. – Robert Crovella Nov 11 '14 at 02:29
  • tada, back to the full report, I got the error to return it is "unspecified launch failure” –  Nov 11 '14 at 02:33
  • 1
    Your code requires input files. Can you provide or suggest any input files that others could use to see/reproduce the problem? Or can you create some additional code to synthesize valid input in lieu of loading files? – Robert Crovella Nov 11 '14 at 02:33
  • added an example file, can be used for both inputs –  Nov 11 '14 at 02:39

1 Answers1

0

I get it now. You took straight serial C/C++ code, dropped it into a kernel, intended to run that kernel as a single thread, and then want to proceed from there.

The idea is plausible, but you're missing a key fact about CUDA and GPUs: they can't directly access host memory.

So when you set up A and B like this:

char A[MAX_SEQUENCE_LENGTH+1];
char B[MAX_SEQUENCE_LENGTH+1];
....
n = importFASTA(argv[1],A);
m = importFASTA(argv[2],B);

those are ordinary variables that live in host memory. GPU (ordinary CUDA) code can't directly access host memory. So when you pass those pointers to a kernel like this:

levenshtein_distance<<<1, 1>>>(A,B, n, m);

the GPU code will try and dereference those A and B pointers and will fault (unspecified launch failure).

Every CUDA program has the following basic sequence:

  1. copy data to the GPU
  2. perform computations on the GPU
  3. copy results back

You've tried to do step 2 without step 1. It won't work.

Since I'm not able to run your program since I don't have valid input files, I'll make the following suggestion. I assume you know little or nothing about CUDA. Try adding lines like this:

n = importFASTA(argv[1],A);              // no change
m = importFASTA(argv[2],B);              // no change

char *d_A, *d_B;                          // add this line
cudaMalloc(&d_A, MAX_SEQUENCE_LENGTH+1);  // add this line
cudaMalloc(&d_B, MAX_SEQUENCE_LENGTH+1);  // add this line

cudaMemcpy(d_A, A, MAX_SEQUENCE_LENGTH+1, cudaMemcpyHostToDevice); // add 
cudaMemcpy(d_B, B, MAX_SEQUENCE_LENGTH+1, cudaMemcpyHostToDevice); // add

levenshtein_distance<<<1, 1>>>(d_A,d_B, n, m); //modify parameters

n and m don't need to be handled any differently since you are passing those by value.

And add proper cuda error checking to your code.

EDIT: after some further analysis, it's clear that this sequence is not correct:

        distance=d[one*two-1];
        free(d);
        printf("%d\n", distance);
    }
}

You are freeing d on every iteration of the i loop. That cannot possibly be correct. I suggest you go back to square one and get your serial code working first, in ordinary serial C code, before dropping it into a cuda kernel this way. If you move that free statement outside the i loop, then your kernel runs for a very very long time. Be advised that in-kernel printf is limited in the amount of output that can be easily generated.

I'm not going to debug your code any further for you. Get your serial code working first, then figure out a way to create a kernel without massive quantities of printout.

A final comment: I said above your approach is "plausible". That it means it could be made to work, i.e produce the same behavior as the same code executing on the host. It does not mean it will run fast. This is not how you get acceleration out of a GPU (running a single block of a single thread). I assume you already know this based on your comment "how to get this particular issue settled before starting the task of generating parallel code." But I think the disclaimer is appropriate anyway.

Community
  • 1
  • 1
Robert Crovella
  • 143,785
  • 11
  • 213
  • 257
  • Thanks, that helped a lot! However, I am only getting up to "2" on "i" variable, so it appears that not enough memory is being alocated –  Nov 11 '14 at 02:50
  • Yes, there is an additional problem in your code that has to do with the `d` variable in the kernel that is being accessed out-of-bounds. – Robert Crovella Nov 11 '14 at 03:18
  • This line in your kernel code is mis-positioned: `free(d);` You are freeing it on each iteration of the `i` loop. That is not what you want, and the code you lifted this from could not be correct either. – Robert Crovella Nov 11 '14 at 03:45