I coded a little program "Finding Prime Numbers" as my practice. After I tried to set more blocks successfully in GPU. I encountered another problem that I cannot understand. I found that the result cannot send back to the host when I input that I would like the program to count all the prime numbers which are less than 200,000. It showed memory failed message. According to the comments and some materials, I think I met Windows TDR event. My Screen flashed and the message "Display driver stopped response and recovered" came out when I run this program. What should I solve this problem?
My code is as below:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <vector>
using namespace std;
cudaError_t findPrimeWithCuda(bool *c, int *a, unsigned int size);
__host__ __device__ bool checkPrime(int i)
{
for (int m = 2; m <= i - 1; m++)
{
if (i%m == 0) return true;
}
return false;
}
__global__ void getPrimeKernel(bool *c, int *a, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i >= size) return;
c[i] = checkPrime(a[i]);
}
void cudaGetPrime(int i)
{
i = i - 3;
int *arr = (int *)malloc((size_t)(i * sizeof(int)));
bool *rst = (bool *)malloc((size_t)(i * sizeof(bool)));
for (int j = 0; j <= i; j++) arr[j] = j + 3;
cudaError_t cudaStatus = findPrimeWithCuda(rst, arr, i);
if (cudaStatus != cudaSuccess) fprintf(stderr,"findPrimeWithCuda failed!!");
}
void w_CudaArray(int lastNum)
{
time_t t1 = time(NULL);
cudaGetPrime(lastNum);
time_t t2 = time(NULL);
printf("Time spends %d seconds\n", t2 - t1);
}
int main()
{
int lastNum = 0;
cout << "Find all prime numbers less than ? ";
cin >> lastNum;
w_CudaArray(lastNum);
}
cudaError_t findPrimeWithCuda(bool *c, int *a, unsigned int size)
{
int *dev_a = 0;
bool *dev_c = false;
cudaError_t cudaStatus;
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaSetDevice failed!!");
goto Error;
}
size_t totalm, freem;
double free_m, total_m, used_m;
cudaMemGetInfo(&freem, &totalm);
free_m = (size_t)freem / 1048576.0;
total_m = (size_t)totalm / 1048576.0;
used_m = total_m - free_m;
cout << "Total memory = " << total_m << " MB" << endl;
cout << "Used memory = " << used_m << " MB" << endl;
cout << "Free memory = " << free_m << " MB" << endl;
cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMalloc dev_a failed!!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(bool));
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMalloc dev_c failed!!");
goto Error;
}
cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMemcpy dev_a failed!!");
goto Error;
}
cudaDeviceProp myCUDA;
if (cudaGetDeviceProperties(&myCUDA, 0) == cudaSuccess)
{
printf("Using device %d:\n", 0);
printf("%s; global mem: %dB; compute v%d.%d; clock: %d kHz\n",
myCUDA.name, (int)myCUDA.totalGlobalMem, (int)myCUDA.major,
(int)myCUDA.minor, (int)myCUDA.clockRate);
}
int threadsPerBlock = myCUDA.maxThreadsPerBlock;
int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock;
cout << "Maxium number per block = " << threadsPerBlock << endl;
cout << "Blocks per Grid = " << blocksPerGrid << endl;
getPrimeKernel<<<blocksPerGrid, threadsPerBlock>>>(dev_c, dev_a, size);
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "getPrimeKernel launch failed!!: %s\n",cudaGetErrorString(cudaStatus));
goto Error;
}
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaDeviceSynchorinze returned error code %d after launching getPrimeKernel!\n", cudaStatus);
goto Error;
}
cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(bool), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMemory failed!");
goto Error;
}
int trueNumber = 0;
for (int i = 0; i < size; i++)
{
if (c[i] == false) trueNumber++;
}
cout << "There are " << trueNumber + 2 << " prime numbers!!" << endl;
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaDeviceReset failed!!");
}
Error:
cudaFree(dev_c);
cudaFree(dev_a);
return cudaStatus;
}
After the execution of cudaMemcpy function. I got the error message "cudaMemory failed!"
And my hardware information which I inquired by "cudaDeviceProp" is as below.
================================
Total memory = 1024 MB
Used memory = 189.293 MB
Free memory = 834.707 MB
Using device 0:
GeForce GTX 550 Ti; global mem: 1073741824B; compute v2.1; clock: 1800000 kHz
Maxium number per block = 1024
Blocks per Grid = 196
cudaMemory failed!findPrimeWithCuda failed!!