I use a function to calculate three vector's vector dot, and use a reduce to make it faster. However,I always got an error like this:
CUDA error at kernel.cu:120 code=30(cudaErrorUnknown) "cudaMemcpy(partia
l_c, dev_partial_c,sizeofblock,cudaMemcpyDeviceToHost )"
I cannot figure out why because the code seems normal.And the allocate function didn't return error.Are there any possible solutions? Thank you a lot.
double vector_dot(double* d_A,double* d_B,double *d_C,int numElements)
{
int size = sizeof(double) * numElements;
int c_size = sizeof(char) * numElements;
double *d_D=NULL;
checkCudaErrors(cudaMalloc((void**)&d_D,size));
// Launch the Vector Add CUDA Kernel
int threadsPerBlock = 256;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
vectorMPL<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_D, numElements);
double *partial_c;
double*dev_partial_c;
int sizeofblock=blocksPerGrid*sizeof(double);
partial_c = (double*)malloc(sizeofblock);
checkCudaErrors( cudaMalloc( (void**)&dev_partial_c,sizeofblock )) ;
vector_dot_h<<<blocksPerGrid, threadsPerBlock>>>(d_D, d_C, dev_partial_c, numElements);
double sum = 0;
checkCudaErrors(cudaMemcpy(partial_c,dev_partial_c,sizeofblock,cudaMemcpyDeviceToHost));
for (int i=0; i<blocksPerGrid; i++) {
sum += partial_c[i];
}
checkCudaErrors(cudaFree(d_D));
checkCudaErrors(cudaFree(dev_partial_c));
free(partial_c);
// Reset the device and exit
checkCudaErrors(cudaDeviceReset());
return sum;
}
If I delete this ,I will receive the unknown error in cudafree call.It seems all the cuda API call cannot be finished.I am wondering why? What is a cuda error unknown?What is the cause?