I'm working on a cuda program to process a 2D image.
The problem is when I try to access blockDim.x
and blockId.x
, the kernel always failed to launch and output unknown error
.
Besides, if I use a 3x5
image, I can access the threadId.x
, while I use a 2048x2048
image I can't.
My kernel code runs OK when I use PyCuda
, but now I have to switch to cuda C
.
I think the problem may be related to
- the way I pass the array pointer and there's something wrong with
cudaMalloc
- the configuration with my block size and grid size( but the same configuration works well in
PyCuda
so I don't know how to correct it).
And I use cuda-memcheck
, I got unknown error 30
and I googled for solutions but no helpful information.
__global__ void extractor(const unsigned char* in, unsigned char* out, int* debug)
{
int idx = (threadIdx.x) + blockDim.x * blockIdx.x ;
debug[idx] = threadIdx.x; // debug variable is used for debugging
}
int main(int arg, char* args[])
{
// ...
int size = w*h; // w is image width and h is image height
unsigned char *in = 0;
unsigned char *out = 0;
int* debug = 0;
// Allocate GPU buffers for the images
cudaMalloc((void**)&in, size * sizeof(unsigned char));
cudaMalloc((void**)&out, num_sample_per_point * size * sizeof(unsigned char));
cudaMalloc((void**)&debug, size * sizeof(int));
// Copy image data from host memory to GPU buffers.
cudaMemcpy(in, &img_data[0], size * sizeof(unsigned char),cudaMemcpyHostToDevice);
dim3 b_dim(BLOCK_SIZE, 1, 1); // (1024, 1, 1)
dim3 g_dim(int(w*h/BLOCK_SIZE)+1, 1, 1); // (4097, 1, 1)
extractor<<<g_dim, b_dim>>>(in, out, debug);
// clean up code and processing result
}
Now I can't get expected index so I can't do processing in the kernel, what can be the problem?
EDIT
I want to use 1D index, which means I assume the image array is a "flattened" 1D array and do indexing.
EDIT
After I added the thread check, there's still something wrong.
__global__ void extractor(const unsigned char* in, unsigned char* out, int* debug)
{
int idx = (threadIdx.x) + blockDim.x * blockIdx.x ;
int y; int x;
int temp_x; int temp_y; int temp_idx;
int check = width*height;
if (idx < check) {
debug[0] = 1; // get kernel launch failed "unknown error"
}
}
I've tried to put the debug[0]=1;
expression both in the thread check block and out the block, both of them get the same error.
So I doubt the memalloc is not been done correctly?
BTW, I used nvprof
and it said
=22344== Warning: Found 2 invalid records in the result.
==22344== Warning: This can happen if device ran out of memory or if a device kernel was stopped due to an assertion.
EDIT
complete code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cmath>
#include <iostream>
#include "PNG.h"
#define L 3
#define INC1 1
#define INC2 1
#define R_IN 2
#define N_P 4
#define BLOCK_SIZE 1024
#define PI 3.14159265358979323846
using namespace std;
__global__ void extractor(const unsigned char* in, unsigned char* out, int* debug, int* disX, int* disY, int width, int height, int pad, int num_sample)
{
int idx = (threadIdx.x) + blockDim.x * blockIdx.x ;
int y; int x;
int temp_x; int temp_y; int temp_idx;
int check = width*height;
if (idx < check) {
debug[idx] = threadIdx.x;
y = idx/width;
x = idx%width;
if ((x < pad) || (x >= (width-pad)) || (y < pad) || (y >= (height-pad))) {
// need padding
for (int i = 0; i < num_sample; ++i){
temp_x = x + disX[i];
temp_y = y + disY[i];
if (!((temp_x < 0)||(temp_x > (width-1)) || (temp_y < 0) ||(temp_y>(height-1)))) {
temp_idx = temp_y*width + temp_x; // sampled index
out[(idx*num_sample)+i] = in[temp_idx]; // copy sampled value to result
}
}
} else {
for (int i = 0; i < num_sample; ++i)
{
temp_x = x + disX[i];
temp_y = y + disY[i];
temp_idx = temp_y*width + temp_x; // sampled index
out[(idx*num_sample)+i] = in[temp_idx]; // copy sampled value to result
}
}
}
}
vector<int> getCirclePos() {
int r = 0;
vector <int> circlePos;
while (!(r>(L/2))) {
circlePos.push_back(r);
if (r < R_IN) r += INC1;
else r += INC2;
}
cout << "circlePos:" << endl;
for (auto i = circlePos.begin(); i != circlePos.end(); ++i)
{cout << *i << ' ';}
cout << endl;
return circlePos;
}
int main(int arg, char* args[])
{
cudaError_t cudaStatus;
vector<int> circlePos = getCirclePos();
// get disX, disY
int num_sample_per_point = circlePos.size() * N_P;
int* disX = new int[num_sample_per_point];
int* disY = new int[num_sample_per_point];
int r; int cnt = 0;
for (int i = 0; i < circlePos.size(); ++i)
{
r = circlePos[i];
float angle;
for (int j = 0; j < N_P; ++j)
{
angle = j*360.0/N_P;
disX[cnt] = r*cos(angle*M_PI/180.0);
disY[cnt] = r*sin(angle*M_PI/180.0);
// cout nvpro << disX[cnt] << "|" << disY[cnt]<< endl;
cnt++;
}
}
PNG inPng("test.png");
// PNG outPng;
// outPng.Create(inPng.w, inPng.h);
//store width and height so we can use them for our output image later
const unsigned int w = inPng.w;
const unsigned int h = inPng.h;
cout << "w: " << w << " h: " << h << endl;
//4 because there are 4 color channels R, G, B, and A
int size = w * h;
unsigned char *in = 0;
unsigned char *out = 0;
int* debug = 0;
// Allocate GPU buffers for the images
cudaMalloc((void**)&in, size * sizeof(unsigned char));
cudaMalloc((void**)&out, num_sample_per_point * size * sizeof(unsigned char));
cudaMalloc((void**)&debug, size * sizeof(int));
vector<unsigned char> img_data;
for (int i = 0; i < size; ++i)
{
img_data.push_back(inPng.data[i*4]);
}
// debug
cout << "========= img_data ==========" << endl;
for (int i = 0; i < size; ++i)
{
cout << int(img_data[i]) << "," ;
}
cout << endl;
// Copy image data from host memory to GPU buffers.
cudaMemcpy(in, &img_data[0], size * sizeof(unsigned char), cudaMemcpyHostToDevice);
//free the input image because we do not need it anymore
inPng.Free();
// Launch a kernel on the GPU with one thread for each element.
dim3 b_dim(BLOCK_SIZE, 1, 1); // (1024, 1, 1)
dim3 g_dim(int(w*h/BLOCK_SIZE)+1, 1, 1); // (4097, 1, 1)
int pad = L/2;
// __global__ void extractor(const unsigned char* in, unsigned char* out, vector<int> disX, vector<int> disY, int width, int height, int pad, int num_sample)
extractor<<<g_dim, b_dim>>>(in, out, debug, disX, disY, w, h, pad, num_sample_per_point);
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess)
{
std::cout << "Kernel launch failed: " << cudaGetErrorString(cudaStatus) << std::endl;
cudaFree(in);
cudaFree(out);
cudaFree(debug);
exit(1);
}
auto tmp = new unsigned char[size*num_sample_per_point];
auto tmp_debug = new int [size];
cudaMemcpy(tmp_debug, debug, size * sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(tmp, out, num_sample_per_point * size * sizeof(unsigned char), cudaMemcpyDeviceToHost);
cout << "========= out =========" << endl;
for (int i = 0; i < size*num_sample_per_point; ++i)
{
cout << int(tmp[i]) << ", ";
}
cout << endl;
cout << "========debug=======" << endl;
for (int i = 0; i < size; ++i)
{
cout << tmp_debug[i] << ", ";
}
cout << endl;
cudaFree(in);
cudaFree(out);
cudaFree(debug);
delete[] tmp; delete[] tmp_debug;
return 0;
}