CUDA DE kernel not launching

Question

I'm trying to do differential evolution on CUDA, but the problem is that kernel which is responsible for "Mutation, Crossover, Evaluation, Selection" never gets launched.

Any help?

Here's the entire code:

#include <iostream>
#include <curand_kernel.h>
using namespace std;

/**** ERROR HANDLING ****/
static void HandleError(cudaError_t err,const char *file, int line )
{
    if (err != cudaSuccess) {
        printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
                file, line );
        system("pause");
        exit( EXIT_FAILURE );
    }
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))


/**** HOST AND DEVICE CONSTANTS****/
const int hNP=100, hD=31, hN=10;
__constant__ int NP, D, N;
__constant__ float Cr, F;


/*** EVAL FUNCTION******/
__device__ float lennardJones(float a[3], float b[3]) {

    float distance = sqrt((a[0] - b[0]) * (a[0] - b[0])
                          + (a[1] - b[1]) * (a[1] - b[1])
                          + (a[2] - b[2]) * (a[2] - b[2]));
    float distance6 = distance * distance * distance
                      * distance * distance * distance;
    float distance12 = distance6 * distance6;
    return 1/distance12 - 2/distance6;
}

/**** RANDOM GENERATORS***/
__device__ float rndFloat(curandState* globalState, int id) 
{
    curandState localState = globalState[id];
    float RANDOM = curand_uniform(&localState);
    globalState[id] = localState; 
    return RANDOM;
}
__device__ int rndInt(curandState* globalState, int id, int max) 
{
    curandState localState = globalState[id];
    float RANDOM = curand_uniform(&localState);
    globalState[id] = localState; 
    return RANDOM*max;
}
__device__ float rndFloat(curandState* globalState, int id, int max) 
{
    curandState localState = globalState[id];
    float RANDOM = curand_uniform(&localState);
    globalState[id] = localState; 
    return RANDOM*max;
}
__device__ float rndFloat(curandState* globalState, int id, int min,int max) 
{
    curandState localState = globalState[id];
    float RANDOM = curand_uniform(&localState);
    globalState[id] = localState; 
    return min+RANDOM*(max-min);
}
/*** SEEDS ****/
__global__ void setup_kernel (curandState * state, unsigned long seed)
{
    int id= threadIdx.x+blockIdx.x*blockDim.x;
    if(id < NP)
        curand_init(seed, id, 0,&state[id]);
} 

/**** DIFFERENTIAL EVOLUTION: INITIALIZATION ***/
__global__ void kernelE(curandState* globalState, float *population)
{
    int id= threadIdx.x+blockIdx.x*blockDim.x;
    if(id < NP)
    {
        //init, just populating array with some specific numbers
        population[D*id]=0;
        population[D*id+N]=0;
        population[D*id +2*N]=0;

        population[D*id+1]=rndFloat(globalState,threadIdx.x,4);
        population[D*id+N+1]=0;
        population[D*id +2*N+1]=0;

        for(int i=2; i<N; i++){
            float min= -4 - 1/4*abs((int)((i-4)/3));
            float max= 4 + 1/4*abs((int)((i-4)/3));
            if(i==2)
            {
                population[D*id+2]=rndFloat(globalState,threadIdx.x,3.14159265359);
                population[D*id+N+2]=rndFloat(globalState,threadIdx.x,min,max);
                population[D*id +2*N+2]=0;
            }
            else
            {
                population[D*id +i]=rndFloat(globalState,threadIdx.x,min,max);
                population[D*id+N+i]=rndFloat(globalState,threadIdx.x,min,max);
                population[D*id +2*N+i]=rndFloat(globalState,threadIdx.x,min,max);
            }
        }

        //eval
        float e=0;
        for(int i=0; i<N; i++)
        {
            for(int j=0; j<i; j++)
            {
                float a[]={population[D*id +i], population[D*id+N+i], population[D*id +2*N+i]}, b[]={population[D*id +j],population[D*id +j+N], population[D*id +2*N+j]};
                e += lennardJones(a,b);
            }
        }
        population[D*id + D-1]=e;
    }
}
/**** DIFFERENTIAL EVOLUTION: MUTATION INDICES ****/
__global__ void kernelP(curandState* globalState, int *mutation)
{
    int id= threadIdx.x+blockIdx.x*blockDim.x;
    if(id<NP)
    {
        int a = rndInt(globalState, id, NP),b = rndInt(globalState, id, NP),c= rndInt(globalState, id, NP);
        while(a == id){a = rndInt(globalState, id, NP);}
        while(b == a && b==id){b=rndInt(globalState, id, NP);}
        while(c == a && c== b && c ==id){c=rndInt(globalState, id, NP);}
        mutation[D*id+0]=a;
        mutation[D*id+1]=b;
        mutation[D*id+2]=c;
    }
}
/**** DIFFERENTIAL EVOLUTION: MUTATION, CROSSOVER, EVALUATION AND SELECTION ***/
__global__ void kernelMCER(curandState* globalState, float *population, int *mutation, float *pop)
{
    int id= threadIdx.x+blockIdx.x*blockDim.x;  
    if(id<NP)
    {
        int a=mutation[D*id+0], b=mutation[D*id+1], c=mutation[D*id+2];

        //DE mutation and crossover
        int j=rndInt(globalState, id, NP);
        for(int i=0; i<D-1; i++)
        {
            //DE mutation
            pop[D*id+i]= population[D*a +i] + F*(population[D*b +i]-population[D*c +i]);
            //DE crossover
            if(Cr > rndFloat(globalState, id) && i!= j)
                pop[D*id+i]=population[D*id +i];
        }
        // Eval
        pop[D*id+D-1]=0;
        for(int i=0; i<N; i++)
        {
            for(int j=0; j<i; j++)
            {
                float a[]={pop[D*id+i], pop[D*id+N+i], pop[D*id+2*N+i]}, b[]={pop[D*id+j],pop[D*id+N+j], pop[D*id+2*N+j]};
                pop[D*id+D-1] += lennardJones(a,b);
            }
        }

        __syncthreads();
        //DE selection
        if(pop[D*id+D-1] < population[D*id +D-1])
        {
            for(int i=0; i<D; i++)
                population[D*id +i]=pop[D*id+i];
        }
    }
}

void getBestScore(float *hpopulation)
{
    int max=0;
    for(int i=1; i<hNP; i++)
    {
        if(hpopulation[hD*max+hD-1] > hpopulation[hD*i+hD-1])
            max=i;
    }
    for(int j=0; j<hN; j++)
        cout<<"Atom "<<(j+1)<<": ("<<hpopulation[hD*max+j]<<", "<<hpopulation[hD*max+hN+j]<<", "<<hpopulation[hD*max+hN*2+j]<<") "<<endl;
    cout<<"Result: "<<hpopulation[hD*max+hD-1]<<endl;
}

int main()
{
    cudaEvent_t start,stop;
    HANDLE_ERROR(cudaEventCreate(&start));
    HANDLE_ERROR(cudaEventCreate(&stop));  
    HANDLE_ERROR(cudaEventRecord(start,0));

    int device, st=100;
    float hCr=0.6f, hF=0.8f;
    cudaDeviceProp prop;
    HANDLE_ERROR(cudaGetDevice(&device));
    HANDLE_ERROR(cudaGetDeviceProperties(&prop, device));
//  int SN = prop.maxThreadsPerBlock; //512 threads per block
    //int SB = (hNP+(SN-1))/SN;


    //constants NP, D, N, Cr, F
    HANDLE_ERROR(cudaMemcpyToSymbol(N, &hN, sizeof(int)));
    HANDLE_ERROR(cudaMemcpyToSymbol(NP, &hNP, sizeof(int)));
    HANDLE_ERROR(cudaMemcpyToSymbol(D, &hD, sizeof(int)));
    HANDLE_ERROR(cudaMemcpyToSymbol(F, &hF, sizeof(float)));
    HANDLE_ERROR(cudaMemcpyToSymbol(Cr, &hCr, sizeof(float)));

    //seeds
    curandState* devStates;
    HANDLE_ERROR(cudaMalloc (&devStates, hNP*sizeof(curandState)));
    setup_kernel <<< 1, hNP>>> (devStates, 50);

    //population
    float *population, *pop;
    float hpopulation[hNP*hD];
    HANDLE_ERROR(cudaMalloc((void**)&population, hNP*hD*sizeof(float)));
    HANDLE_ERROR(cudaMalloc((void**)&pop, hNP*hD*sizeof(float)));

    //mutation
    int *mutation, *mutation1;
    int *hmutation;
    HANDLE_ERROR(cudaHostAlloc((void**)&hmutation, hNP*3*sizeof(int), cudaHostAllocDefault));
    HANDLE_ERROR(cudaMalloc((void**)&mutation, hNP*3*sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void**)&mutation1, hNP*3*sizeof(int)));

    //stream
    cudaStream_t stream_i, stream_j;
    HANDLE_ERROR(cudaStreamCreate(&stream_i));
    HANDLE_ERROR(cudaStreamCreate(&stream_j));

    kernelE<<<1,hNP, 0,stream_i>>>(devStates,population);
    kernelP<<<1,hNP, 0,stream_j>>>(devStates,mutation);


    while(st != 0)
    {
        /*** COPYING MUTATION INDICES***/
        HANDLE_ERROR(cudaMemcpyAsync(hmutation, mutation,hNP*3*sizeof(int), cudaMemcpyDeviceToHost, stream_j));
        HANDLE_ERROR(cudaMemcpyAsync(mutation1, hmutation,hNP*3*sizeof(int), cudaMemcpyHostToDevice, stream_i));

        /**** CALLING KERNELS****/
        kernelP<<<1,hNP,0,stream_j>>>(devStates,mutation);
        kernelMCER<<<1,hNP,0,stream_i>>>(devStates,population,mutation1,pop);

        st--;

        //HANDLE_ERROR(cudaStreamSynchronize(stream_i));
        //HANDLE_ERROR(cudaMemcpy(hpopulation, population, hNP*hD*sizeof(float), cudaMemcpyDeviceToHost));
        //getBestScore(hpopulation);
        //cin.get();
    }

    HANDLE_ERROR(cudaStreamSynchronize(stream_i));
    HANDLE_ERROR(cudaMemcpy(hpopulation, population, hNP*hD*sizeof(float), cudaMemcpyDeviceToHost));
    getBestScore(hpopulation);

    cudaEventRecord(stop,0);
    cudaEventSynchronize(stop);
    float time;
    HANDLE_ERROR(cudaEventElapsedTime(&time, start, stop));
    cout<<endl<<"Tme: "<<time/1000<<"s"<<endl;

    HANDLE_ERROR(cudaEventDestroy(start));
    HANDLE_ERROR(cudaEventDestroy(stop));
    HANDLE_ERROR(cudaStreamDestroy(stream_i));
    HANDLE_ERROR(cudaStreamDestroy(stream_j));
    HANDLE_ERROR(cudaFree(population));
    HANDLE_ERROR(cudaFree(pop));
    HANDLE_ERROR(cudaFreeHost(hmutation));
    HANDLE_ERROR(cudaFree(mutation1));
    HANDLE_ERROR(cudaFree(devStates));

    system("pause");
    return 0;
}

UPDATE - Solution:

#include <iostream>
#include <curand_kernel.h>
using namespace std;

/**** ERROR HANDLING ****/
static void HandleError(cudaError_t err,const char *file, int line )
{
    if (err != cudaSuccess) {
        printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
                file, line );
        system("pause");
        exit( EXIT_FAILURE );
    }
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))


/**** HOST AND DEVICE CONSTANTS****/
const int hNP=100, hD=31, hN=10;
__constant__ int NP, D, N;
__constant__ float Cr, F;


/*** EVAL FUNCTION******/
__device__ float lennardJones(float a[3], float b[3]) {

    float distance = sqrt((a[0] - b[0]) * (a[0] - b[0])
                          + (a[1] - b[1]) * (a[1] - b[1])
                          + (a[2] - b[2]) * (a[2] - b[2]));
    float distance6 = distance * distance * distance
                      * distance * distance * distance;
    float distance12 = distance6 * distance6;
    return 1/distance12 - 2/distance6;
}

/**** RANDOM GENERATORS***/
__device__ float rndFloat(curandState* globalState, int id) 
{
    curandState localState = globalState[id];
    float RANDOM = curand_uniform(&localState);
    globalState[id] = localState; 
    return RANDOM;
}
__device__ int rndInt(curandState* globalState, int id, int max) 
{
    curandState localState = globalState[id];
    float RANDOM = curand_uniform(&localState);
    globalState[id] = localState; 
    return RANDOM*max;
}
__device__ float rndFloat(curandState* globalState, int id, int max) 
{
    curandState localState = globalState[id];
    float RANDOM = curand_uniform(&localState);
    globalState[id] = localState; 
    return RANDOM*max;
}
__device__ float rndFloat(curandState* globalState, int id, int min,int max) 
{
    curandState localState = globalState[id];
    float RANDOM = curand_uniform(&localState);
    globalState[id] = localState; 
    return min+RANDOM*(max-min);
}
/*** SEEDS ****/
__global__ void setup_kernel (curandState * state, unsigned long seed)
{
    int id= threadIdx.x+blockIdx.x*blockDim.x;
    if(id < NP)
        curand_init(seed, id, 0,&state[id]);
} 

/**** DIFFERENTIAL EVOLUTION: INITIALIZATION ***/
__global__ void kernelE(curandState* globalState, float *population)
{
    int id= threadIdx.x+blockIdx.x*blockDim.x;
    if(id < NP)
    {
        //init, just populating array with some specific numbers
        population[D*id]=0;
        population[D*id+N]=0;
        population[D*id +2*N]=0;

        population[D*id+1]=rndFloat(globalState,threadIdx.x,4);
        population[D*id+N+1]=0;
        population[D*id +2*N+1]=0;

        for(int i=2; i<N; i++){
            float min= -4 - 1/4*abs((int)((i-4)/3));
            float max= 4 + 1/4*abs((int)((i-4)/3));
            if(i==2)
            {
                population[D*id+2]=rndFloat(globalState,threadIdx.x,3.14159265359);
                population[D*id+N+2]=rndFloat(globalState,threadIdx.x,min,max);
                population[D*id +2*N+2]=0;
            }
            else
            {
                population[D*id +i]=rndFloat(globalState,threadIdx.x,min,max);
                population[D*id+N+i]=rndFloat(globalState,threadIdx.x,min,max);
                population[D*id +2*N+i]=rndFloat(globalState,threadIdx.x,min,max);
            }
        }

        //eval
        float e=0;
        for(int i=0; i<N; i++)
        {
            for(int j=0; j<i; j++)
            {
                float a[]={population[D*id +i], population[D*id+N+i], population[D*id +2*N+i]}, b[]={population[D*id +j],population[D*id +j+N], population[D*id +2*N+j]};
                e += lennardJones(a,b);
            }
        }
        population[D*id + D-1]=e;
    }
}
/**** DIFFERENTIAL EVOLUTION: MUTATION INDICES ****/
__global__ void kernelP(curandState* globalState, int *mutation)
{
    int id= threadIdx.x+blockIdx.x*blockDim.x;
    if(id<NP)
    {
        int a = rndInt(globalState, id, NP),b = rndInt(globalState, id, NP),c= rndInt(globalState, id, NP);
        while(a == id){a = rndInt(globalState, id, NP);}
        while(b == a && b==id){b=rndInt(globalState, id, NP);}
        while(c == a && c== b && c ==id){c=rndInt(globalState, id, NP);}
        mutation[3*id+0]=a;
        mutation[3*id+1]=b;
        mutation[3*id+2]=c;
    }
}
/**** DIFFERENTIAL EVOLUTION: MUTATION, CROSSOVER, EVALUATION AND SELECTION ***/
__global__ void kernelMCER(curandState* globalState, float *population, int *mutation, float *pop)
{
    int id= threadIdx.x+blockIdx.x*blockDim.x;  
    if(id<NP)
    {
        int a=mutation[3*id+0], b=mutation[3*id+1], c=mutation[3*id+2];

        //DE mutation and crossover
        int j=rndInt(globalState, id, NP);
        for(int i=0; i<D-1; i++)
        {
            //DE mutation
            pop[D*id+i]= population[D*a +i] + F*(population[D*b +i]-population[D*c +i]);
            //DE crossover
            if(Cr > rndFloat(globalState, id) && i!= j)
                pop[D*id+i]=population[D*id +i];
        }
        // Eval
        pop[D*id+D-1]=0;
        for(int i=0; i<N; i++)
        {
            for(int j=0; j<i; j++)
            {
                float a[]={pop[D*id+i], pop[D*id+N+i], pop[D*id+2*N+i]}, b[]={pop[D*id+j],pop[D*id+N+j], pop[D*id+2*N+j]};
                pop[D*id+D-1] += lennardJones(a,b);
            }
        }

        __syncthreads();
        //DE selection
        if(pop[D*id+D-1] < population[D*id +D-1])
        {
            for(int i=0; i<D; i++)
                population[D*id +i]=pop[D*id+i];
        }
    }
}

void getBestScore(float *hpopulation)
{
    int max=0;
    for(int i=1; i<hNP; i++)
    {
        if(hpopulation[hD*max+hD-1] > hpopulation[hD*i+hD-1])
            max=i;
    }
    for(int j=0; j<hN; j++)
        cout<<"Atom "<<(j+1)<<": ("<<hpopulation[hD*max+j]<<", "<<hpopulation[hD*max+hN+j]<<", "<<hpopulation[hD*max+hN*2+j]<<") "<<endl;
    cout<<"Result: "<<hpopulation[hD*max+hD-1]<<endl;
}

int main()
{
    cudaEvent_t start,stop;
    HANDLE_ERROR(cudaEventCreate(&start));
    HANDLE_ERROR(cudaEventCreate(&stop));  
    HANDLE_ERROR(cudaEventRecord(start,0));

    int device, st=100;
    float hCr=0.6f, hF=0.8f;
    cudaDeviceProp prop;
    HANDLE_ERROR(cudaGetDevice(&device));
    HANDLE_ERROR(cudaGetDeviceProperties(&prop, device));
//  int SN = prop.maxThreadsPerBlock; //512 threads per block
    //int SB = (hNP+(SN-1))/SN;


    //constants NP, D, N, Cr, F
    HANDLE_ERROR(cudaMemcpyToSymbol(N, &hN, sizeof(int)));
    HANDLE_ERROR(cudaMemcpyToSymbol(NP, &hNP, sizeof(int)));
    HANDLE_ERROR(cudaMemcpyToSymbol(D, &hD, sizeof(int)));
    HANDLE_ERROR(cudaMemcpyToSymbol(F, &hF, sizeof(float)));
    HANDLE_ERROR(cudaMemcpyToSymbol(Cr, &hCr, sizeof(float)));

    //seeds
    curandState* devStates;
    HANDLE_ERROR(cudaMalloc (&devStates, hNP*sizeof(curandState)));
    setup_kernel <<< 1, hNP>>> (devStates, 50);

    //population
    float *population, *pop;
    float hpopulation[hNP*hD];
    HANDLE_ERROR(cudaMalloc((void**)&population, hNP*hD*sizeof(float)));
    HANDLE_ERROR(cudaMalloc((void**)&pop, hNP*hD*sizeof(float)));

    //mutation
    int *mutation, *mutation1;
    int *hmutation;
    HANDLE_ERROR(cudaHostAlloc((void**)&hmutation, hNP*3*sizeof(int), cudaHostAllocDefault));
    HANDLE_ERROR(cudaMalloc((void**)&mutation, hNP*3*sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void**)&mutation1, hNP*3*sizeof(int)));

    //stream
    cudaStream_t stream_i, stream_j;
    HANDLE_ERROR(cudaStreamCreate(&stream_i));
    HANDLE_ERROR(cudaStreamCreate(&stream_j));

    kernelE<<<1,hNP, 0,stream_i>>>(devStates,population);
    kernelP<<<1,hNP, 0,stream_j>>>(devStates,mutation);


    while(st != 0)
    {
        /*** COPYING MUTATION INDICES***/
        HANDLE_ERROR(cudaMemcpyAsync(hmutation, mutation,hNP*3*sizeof(int), cudaMemcpyDeviceToHost, stream_j));
        HANDLE_ERROR(cudaMemcpyAsync(mutation1, hmutation,hNP*3*sizeof(int), cudaMemcpyHostToDevice, stream_i));

        /**** CALLING KERNELS****/
        kernelP<<<1,hNP,0,stream_j>>>(devStates,mutation);
        kernelMCER<<<1,hNP,0,stream_i>>>(devStates,population,mutation1,pop);

        st--;

        //HANDLE_ERROR(cudaStreamSynchronize(stream_i));
        //HANDLE_ERROR(cudaMemcpy(hpopulation, population, hNP*hD*sizeof(float), cudaMemcpyDeviceToHost));
        //getBestScore(hpopulation);
        //cin.get();
    }

    HANDLE_ERROR(cudaStreamSynchronize(stream_i));
    HANDLE_ERROR(cudaMemcpy(hpopulation, population, hNP*hD*sizeof(float), cudaMemcpyDeviceToHost));
    getBestScore(hpopulation);

    cudaEventRecord(stop,0);
    cudaEventSynchronize(stop);
    float time;
    HANDLE_ERROR(cudaEventElapsedTime(&time, start, stop));
    cout<<endl<<"Tme: "<<time/1000<<"s"<<endl;

    HANDLE_ERROR(cudaEventDestroy(start));
    HANDLE_ERROR(cudaEventDestroy(stop));
    HANDLE_ERROR(cudaStreamDestroy(stream_i));
    HANDLE_ERROR(cudaStreamDestroy(stream_j));
    HANDLE_ERROR(cudaFree(population));
    HANDLE_ERROR(cudaFree(pop));
    HANDLE_ERROR(cudaFreeHost(hmutation));
    HANDLE_ERROR(cudaFree(mutation1));
    HANDLE_ERROR(cudaFree(devStates));

    system("pause");
    return 0;
}

You can do [error checking on kernel launches](http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api) too. Why don't you do that and report back what kind of error you get? Also, what happens when you run your code with `cuda-memcheck`? My guess is your kernel is launching but you have some sort of error that occurs in device code, causing the kernel to terminate early. — Robert Crovella, May 29 '13 at 13:12
error checking on kernel shows no errors. cuda-memcheck also show no error. I also installed latest nsight with appropriate driver on VS 2010 but when I press "Start CUDA Debugging" nothing happens... breakpoints that I have placed inside kernel are ignored. I have no idea what to do next. — user755974, May 31 '13 at 20:01
Update the code you've posted to show how you are doing kernel error checking. Even better would be to post a complete, compilable code. Unless you have a logic error in your host code outside of what you have posted here, or are completely misinterpreting the data produced by your code, it's not possible for a kernel to not launch, throw no errors, and show no errors in cuda-memcheck. — Robert Crovella, May 31 '13 at 21:28
"error checking on kernel shows no errors". When I compile and run your code I get "unspecified launch failure at line 251". Furthermore, cuda-memcheck shows about 80 errors, many of which are invalid writes of size 4 in kernelP (which you completely omitted from your question!). Just run your code from the command line like this: `cuda-memcheck mycode` (assuming mycode.exe is the name of your executable) in whatever directory your app executable is located. For future reference, when adding info (like when you posted the full code) just edit your original question, don't post an answer. — Robert Crovella, May 31 '13 at 22:59
I don't know, no error for me: http://shrani.si/f/17/CG/4jkFqAQK/screenshot.png I have updated the code too. Tnx to you, I have found couple mistakes - invalid writes into global memory. — user755974, May 31 '13 at 23:41
vote to close as "too localized" -- since the OP has updated the question with functional code that no longer contains the original issue, it's unlikely to help future visitors. — Robert Crovella, Jun 01 '13 at 23:10
Please post your solution as an answer - it's OK in some cases to answer your own question. Then we can call this an answered question and move on. Thanks. — Robert Crovella, Jun 03 '13 at 13:26

score 1 · Answer 1 · answered Jun 04 '13 at 10:51

Solution:

#include <iostream>
#include <curand_kernel.h>
using namespace std;

/**** ERROR HANDLING ****/
static void HandleError(cudaError_t err,const char *file, int line )
{
    if (err != cudaSuccess) {
        printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
                file, line );
        system("pause");
        exit( EXIT_FAILURE );
    }
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))


/**** HOST AND DEVICE CONSTANTS****/
const int hNP=100, hD=31, hN=10;
__constant__ int NP, D, N;
__constant__ float Cr, F;


/*** EVAL FUNCTION******/
__device__ float lennardJones(float a[3], float b[3]) {

    float distance = sqrt((a[0] - b[0]) * (a[0] - b[0])
                          + (a[1] - b[1]) * (a[1] - b[1])
                          + (a[2] - b[2]) * (a[2] - b[2]));
    float distance6 = distance * distance * distance
                      * distance * distance * distance;
    float distance12 = distance6 * distance6;
    return 1/distance12 - 2/distance6;
}

/**** RANDOM GENERATORS***/
__device__ float rndFloat(curandState* globalState, int id) 
{
    curandState localState = globalState[id];
    float RANDOM = curand_uniform(&localState);
    globalState[id] = localState; 
    return RANDOM;
}
__device__ int rndInt(curandState* globalState, int id, int max) 
{
    curandState localState = globalState[id];
    float RANDOM = curand_uniform(&localState);
    globalState[id] = localState; 
    return RANDOM*max;
}
__device__ float rndFloat(curandState* globalState, int id, int max) 
{
    curandState localState = globalState[id];
    float RANDOM = curand_uniform(&localState);
    globalState[id] = localState; 
    return RANDOM*max;
}
__device__ float rndFloat(curandState* globalState, int id, int min,int max) 
{
    curandState localState = globalState[id];
    float RANDOM = curand_uniform(&localState);
    globalState[id] = localState; 
    return min+RANDOM*(max-min);
}
/*** SEEDS ****/
__global__ void setup_kernel (curandState * state, unsigned long seed)
{
    int id= threadIdx.x+blockIdx.x*blockDim.x;
    if(id < NP)
        curand_init(seed, id, 0,&state[id]);
} 

/**** DIFFERENTIAL EVOLUTION: INITIALIZATION ***/
__global__ void kernelE(curandState* globalState, float *population)
{
    int id= threadIdx.x+blockIdx.x*blockDim.x;
    if(id < NP)
    {
        //init, just populating array with some specific numbers
        population[D*id]=0;
        population[D*id+N]=0;
        population[D*id +2*N]=0;

        population[D*id+1]=rndFloat(globalState,threadIdx.x,4);
        population[D*id+N+1]=0;
        population[D*id +2*N+1]=0;

        for(int i=2; i<N; i++){
            float min= -4 - 1/4*abs((int)((i-4)/3));
            float max= 4 + 1/4*abs((int)((i-4)/3));
            if(i==2)
            {
                population[D*id+2]=rndFloat(globalState,threadIdx.x,3.14159265359);
                population[D*id+N+2]=rndFloat(globalState,threadIdx.x,min,max);
                population[D*id +2*N+2]=0;
            }
            else
            {
                population[D*id +i]=rndFloat(globalState,threadIdx.x,min,max);
                population[D*id+N+i]=rndFloat(globalState,threadIdx.x,min,max);
                population[D*id +2*N+i]=rndFloat(globalState,threadIdx.x,min,max);
            }
        }

        //eval
        float e=0;
        for(int i=0; i<N; i++)
        {
            for(int j=0; j<i; j++)
            {
                float a[]={population[D*id +i], population[D*id+N+i], population[D*id +2*N+i]}, b[]={population[D*id +j],population[D*id +j+N], population[D*id +2*N+j]};
                e += lennardJones(a,b);
            }
        }
        population[D*id + D-1]=e;
    }
}
/**** DIFFERENTIAL EVOLUTION: MUTATION INDICES ****/
__global__ void kernelP(curandState* globalState, int *mutation)
{
    int id= threadIdx.x+blockIdx.x*blockDim.x;
    if(id<NP)
    {
        int a = rndInt(globalState, id, NP),b = rndInt(globalState, id, NP),c= rndInt(globalState, id, NP);
        while(a == id){a = rndInt(globalState, id, NP);}
        while(b == a && b==id){b=rndInt(globalState, id, NP);}
        while(c == a && c== b && c ==id){c=rndInt(globalState, id, NP);}
        mutation[3*id+0]=a;
        mutation[3*id+1]=b;
        mutation[3*id+2]=c;
    }
}
/**** DIFFERENTIAL EVOLUTION: MUTATION, CROSSOVER, EVALUATION AND SELECTION ***/
__global__ void kernelMCER(curandState* globalState, float *population, int *mutation, float *pop)
{
    int id= threadIdx.x+blockIdx.x*blockDim.x;  
    if(id<NP)
    {
        int a=mutation[3*id+0], b=mutation[3*id+1], c=mutation[3*id+2];

        //DE mutation and crossover
        int j=rndInt(globalState, id, NP);
        for(int i=0; i<D-1; i++)
        {
            //DE mutation
            pop[D*id+i]= population[D*a +i] + F*(population[D*b +i]-population[D*c +i]);
            //DE crossover
            if(Cr > rndFloat(globalState, id) && i!= j)
                pop[D*id+i]=population[D*id +i];
        }
        // Eval
        pop[D*id+D-1]=0;
        for(int i=0; i<N; i++)
        {
            for(int j=0; j<i; j++)
            {
                float a[]={pop[D*id+i], pop[D*id+N+i], pop[D*id+2*N+i]}, b[]={pop[D*id+j],pop[D*id+N+j], pop[D*id+2*N+j]};
                pop[D*id+D-1] += lennardJones(a,b);
            }
        }

        __syncthreads();
        //DE selection
        if(pop[D*id+D-1] < population[D*id +D-1])
        {
            for(int i=0; i<D; i++)
                population[D*id +i]=pop[D*id+i];
        }
    }
}

void getBestScore(float *hpopulation)
{
    int max=0;
    for(int i=1; i<hNP; i++)
    {
        if(hpopulation[hD*max+hD-1] > hpopulation[hD*i+hD-1])
            max=i;
    }
    for(int j=0; j<hN; j++)
        cout<<"Atom "<<(j+1)<<": ("<<hpopulation[hD*max+j]<<", "<<hpopulation[hD*max+hN+j]<<", "<<hpopulation[hD*max+hN*2+j]<<") "<<endl;
    cout<<"Result: "<<hpopulation[hD*max+hD-1]<<endl;
}

int main()
{
    cudaEvent_t start,stop;
    HANDLE_ERROR(cudaEventCreate(&start));
    HANDLE_ERROR(cudaEventCreate(&stop));  
    HANDLE_ERROR(cudaEventRecord(start,0));

    int device, st=100;
    float hCr=0.6f, hF=0.8f;
    cudaDeviceProp prop;
    HANDLE_ERROR(cudaGetDevice(&device));
    HANDLE_ERROR(cudaGetDeviceProperties(&prop, device));
//  int SN = prop.maxThreadsPerBlock; //512 threads per block
    //int SB = (hNP+(SN-1))/SN;


    //constants NP, D, N, Cr, F
    HANDLE_ERROR(cudaMemcpyToSymbol(N, &hN, sizeof(int)));
    HANDLE_ERROR(cudaMemcpyToSymbol(NP, &hNP, sizeof(int)));
    HANDLE_ERROR(cudaMemcpyToSymbol(D, &hD, sizeof(int)));
    HANDLE_ERROR(cudaMemcpyToSymbol(F, &hF, sizeof(float)));
    HANDLE_ERROR(cudaMemcpyToSymbol(Cr, &hCr, sizeof(float)));

    //seeds
    curandState* devStates;
    HANDLE_ERROR(cudaMalloc (&devStates, hNP*sizeof(curandState)));
    setup_kernel <<< 1, hNP>>> (devStates, 50);

    //population
    float *population, *pop;
    float hpopulation[hNP*hD];
    HANDLE_ERROR(cudaMalloc((void**)&population, hNP*hD*sizeof(float)));
    HANDLE_ERROR(cudaMalloc((void**)&pop, hNP*hD*sizeof(float)));

    //mutation
    int *mutation, *mutation1;
    int *hmutation;
    HANDLE_ERROR(cudaHostAlloc((void**)&hmutation, hNP*3*sizeof(int), cudaHostAllocDefault));
    HANDLE_ERROR(cudaMalloc((void**)&mutation, hNP*3*sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void**)&mutation1, hNP*3*sizeof(int)));

    //stream
    cudaStream_t stream_i, stream_j;
    HANDLE_ERROR(cudaStreamCreate(&stream_i));
    HANDLE_ERROR(cudaStreamCreate(&stream_j));

    kernelE<<<1,hNP, 0,stream_i>>>(devStates,population);
    kernelP<<<1,hNP, 0,stream_j>>>(devStates,mutation);


    while(st != 0)
    {
        /*** COPYING MUTATION INDICES***/
        HANDLE_ERROR(cudaMemcpyAsync(hmutation, mutation,hNP*3*sizeof(int), cudaMemcpyDeviceToHost, stream_j));
        HANDLE_ERROR(cudaMemcpyAsync(mutation1, hmutation,hNP*3*sizeof(int), cudaMemcpyHostToDevice, stream_i));

        /**** CALLING KERNELS****/
        kernelP<<<1,hNP,0,stream_j>>>(devStates,mutation);
        kernelMCER<<<1,hNP,0,stream_i>>>(devStates,population,mutation1,pop);

        st--;

        //HANDLE_ERROR(cudaStreamSynchronize(stream_i));
        //HANDLE_ERROR(cudaMemcpy(hpopulation, population, hNP*hD*sizeof(float), cudaMemcpyDeviceToHost));
        //getBestScore(hpopulation);
        //cin.get();
    }

    HANDLE_ERROR(cudaStreamSynchronize(stream_i));
    HANDLE_ERROR(cudaMemcpy(hpopulation, population, hNP*hD*sizeof(float), cudaMemcpyDeviceToHost));
    getBestScore(hpopulation);

    cudaEventRecord(stop,0);
    cudaEventSynchronize(stop);
    float time;
    HANDLE_ERROR(cudaEventElapsedTime(&time, start, stop));
    cout<<endl<<"Tme: "<<time/1000<<"s"<<endl;

    HANDLE_ERROR(cudaEventDestroy(start));
    HANDLE_ERROR(cudaEventDestroy(stop));
    HANDLE_ERROR(cudaStreamDestroy(stream_i));
    HANDLE_ERROR(cudaStreamDestroy(stream_j));
    HANDLE_ERROR(cudaFree(population));
    HANDLE_ERROR(cudaFree(pop));
    HANDLE_ERROR(cudaFreeHost(hmutation));
    HANDLE_ERROR(cudaFree(mutation1));
    HANDLE_ERROR(cudaFree(devStates));

    system("pause");
    return 0;
}

CUDA DE kernel not launching

1 Answers1