-1

I'm working on a cuda program to process a 2D image.

The problem is when I try to access blockDim.x and blockId.x, the kernel always failed to launch and output unknown error.

Besides, if I use a 3x5 image, I can access the threadId.x, while I use a 2048x2048 image I can't.

My kernel code runs OK when I use PyCuda, but now I have to switch to cuda C.

I think the problem may be related to

  • the way I pass the array pointer and there's something wrong with cudaMalloc
  • the configuration with my block size and grid size( but the same configuration works well in PyCuda so I don't know how to correct it).

And I use cuda-memcheck, I got unknown error 30 and I googled for solutions but no helpful information.

__global__ void extractor(const unsigned char* in, unsigned char* out, int* debug)
{
    int idx = (threadIdx.x) + blockDim.x * blockIdx.x ;
    debug[idx] = threadIdx.x; // debug variable is used for debugging
}

int main(int arg, char* args[])
{
    // ...
    int size = w*h; // w is image width and h is image height
    unsigned char *in = 0;
    unsigned char *out = 0;
    int* debug = 0;

    // Allocate GPU buffers for the images
    cudaMalloc((void**)&in, size * sizeof(unsigned char));
    cudaMalloc((void**)&out, num_sample_per_point * size * sizeof(unsigned char));
    cudaMalloc((void**)&debug, size * sizeof(int));

    // Copy image data from host memory to GPU buffers.
    cudaMemcpy(in, &img_data[0], size * sizeof(unsigned char),cudaMemcpyHostToDevice);

    dim3 b_dim(BLOCK_SIZE, 1, 1);   // (1024, 1, 1)
    dim3 g_dim(int(w*h/BLOCK_SIZE)+1, 1, 1);    // (4097, 1, 1)
    extractor<<<g_dim, b_dim>>>(in, out, debug);

    // clean up code and processing result
}

Now I can't get expected index so I can't do processing in the kernel, what can be the problem?


EDIT

I want to use 1D index, which means I assume the image array is a "flattened" 1D array and do indexing.


EDIT

After I added the thread check, there's still something wrong.

__global__ void extractor(const unsigned char* in, unsigned char* out, int* debug)
{
    int idx = (threadIdx.x) + blockDim.x * blockIdx.x ;
    int y; int x;
    int temp_x; int temp_y; int temp_idx;
    int check = width*height;
    if (idx < check) {
        debug[0] = 1;    // get kernel launch failed "unknown error"
    }
}

I've tried to put the debug[0]=1; expression both in the thread check block and out the block, both of them get the same error.

So I doubt the memalloc is not been done correctly?

BTW, I used nvprof and it said

=22344== Warning: Found 2 invalid records in the result.
==22344== Warning: This can happen if device ran out of memory or if a device kernel was stopped due to an assertion.

EDIT

complete code:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <cmath>
#include <iostream>

#include "PNG.h"

#define L 3
#define INC1 1
#define INC2 1
#define R_IN 2
#define N_P 4
#define BLOCK_SIZE 1024
#define PI 3.14159265358979323846

using namespace std;

__global__ void extractor(const unsigned char* in, unsigned char* out, int* debug, int* disX, int* disY, int width, int height, int pad, int num_sample)
{
    int idx = (threadIdx.x) + blockDim.x * blockIdx.x ;
    int y; int x;
    int temp_x; int temp_y; int temp_idx;
    int check = width*height;

       if (idx < check) {
        debug[idx] = threadIdx.x;
        y = idx/width;
        x = idx%width;
            if ((x < pad) || (x >= (width-pad)) || (y < pad) || (y >= (height-pad))) {
                // need padding
                for (int i = 0; i < num_sample; ++i){
                    temp_x = x + disX[i];
                    temp_y = y + disY[i];

                    if (!((temp_x < 0)||(temp_x > (width-1)) || (temp_y < 0) ||(temp_y>(height-1)))) {
                    temp_idx = temp_y*width + temp_x;   // sampled index
                    out[(idx*num_sample)+i] = in[temp_idx]; // copy sampled value to result
                }
                }
            } else {
                for (int i = 0; i < num_sample; ++i)
                {
                    temp_x = x + disX[i];
                    temp_y = y + disY[i];
                    temp_idx = temp_y*width + temp_x;   // sampled index
                    out[(idx*num_sample)+i] = in[temp_idx]; // copy sampled value to result
                }
            }
       }
  }

vector<int> getCirclePos() {
    int r = 0;
    vector <int> circlePos;
    while (!(r>(L/2))) {
        circlePos.push_back(r);
        if (r < R_IN) r += INC1;
        else r += INC2;
    }
    cout << "circlePos:" << endl;
    for (auto i = circlePos.begin(); i != circlePos.end(); ++i)
    {cout << *i << ' ';}
    cout << endl;
    return circlePos;
}

int main(int arg, char* args[])
{
    cudaError_t cudaStatus;
    vector<int> circlePos = getCirclePos();

    // get disX, disY
    int num_sample_per_point = circlePos.size() * N_P;
    int* disX = new int[num_sample_per_point];
    int* disY = new int[num_sample_per_point];
    int r; int cnt = 0;
    for (int i = 0; i < circlePos.size(); ++i)
    {
        r = circlePos[i];
        float angle;
        for (int j = 0; j < N_P; ++j)
        {
            angle = j*360.0/N_P;
            disX[cnt] = r*cos(angle*M_PI/180.0);
            disY[cnt] = r*sin(angle*M_PI/180.0);
            // cout nvpro   << disX[cnt] << "|" << disY[cnt]<< endl;

            cnt++;
        }
    }

    PNG inPng("test.png");
    // PNG outPng;
    // outPng.Create(inPng.w, inPng.h);

    //store width and height so we can use them for our output image later
    const unsigned int w = inPng.w;
    const unsigned int h = inPng.h;
    cout << "w: " << w << " h: " << h << endl;
    //4 because there are 4 color channels R, G, B, and A
    int size = w * h;

    unsigned char *in = 0;
    unsigned char *out = 0;
    int* debug = 0;

    // Allocate GPU buffers for the images
    cudaMalloc((void**)&in, size * sizeof(unsigned char));
    cudaMalloc((void**)&out, num_sample_per_point * size * sizeof(unsigned char));
    cudaMalloc((void**)&debug, size * sizeof(int));


    vector<unsigned char> img_data;
    for (int i = 0; i < size; ++i)
    {
        img_data.push_back(inPng.data[i*4]);
    }

    // debug
    cout << "========= img_data ==========" << endl;
    for (int i = 0; i < size; ++i)
    {
        cout << int(img_data[i]) << "," ;
    }
    cout << endl;

    // Copy image data from host memory to GPU buffers.
    cudaMemcpy(in, &img_data[0], size * sizeof(unsigned char), cudaMemcpyHostToDevice);

    //free the input image because we do not need it anymore
    inPng.Free();

    // Launch a kernel on the GPU with one thread for each element.
    dim3 b_dim(BLOCK_SIZE, 1, 1);   // (1024, 1, 1)
    dim3 g_dim(int(w*h/BLOCK_SIZE)+1, 1, 1);    // (4097, 1, 1)
    int pad = L/2;

    // __global__ void extractor(const unsigned char* in, unsigned char* out, vector<int> disX, vector<int> disY, int width, int height, int pad, int num_sample)
    extractor<<<g_dim, b_dim>>>(in, out, debug, disX, disY, w, h, pad, num_sample_per_point);

    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess)
    {
        std::cout << "Kernel launch failed: " << cudaGetErrorString(cudaStatus) << std::endl;
        cudaFree(in);
        cudaFree(out);
        cudaFree(debug);
        exit(1);
    }

    auto tmp = new unsigned char[size*num_sample_per_point];
    auto tmp_debug = new int [size];

    cudaMemcpy(tmp_debug, debug, size * sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(tmp, out, num_sample_per_point * size * sizeof(unsigned char), cudaMemcpyDeviceToHost);

    cout << "========= out =========" << endl;
    for (int i = 0; i < size*num_sample_per_point; ++i)
    {
        cout << int(tmp[i]) << ", ";
    }
    cout << endl;

    cout << "========debug=======" << endl;
    for (int i = 0; i < size; ++i)
    {
        cout << tmp_debug[i] << ", ";
    }
    cout << endl;

    cudaFree(in);
    cudaFree(out);
    cudaFree(debug);

    delete[] tmp; delete[] tmp_debug;

    return 0;
}
talonmies
  • 70,661
  • 34
  • 192
  • 269
jinglei
  • 3,269
  • 11
  • 27
  • 46

1 Answers1

3

This (according to your comment) is defining 1024 threads per block:

dim3 b_dim(BLOCK_SIZE, 1, 1);   // (1024, 1, 1)

According to your question text, w and h are each 2048 in the failing case, so this:

dim3 g_dim(int(w*h/BLOCK_SIZE)+1, 1, 1);    // (4097, 1, 1)

is creating 4097 blocks, just as you indicate in your comment.

4097 blocks of 1024 threads each is 4195328 threads total, but your allocation sizes are only providing 2048*2048 elements, or 4194304 elements total. So you are launching 4195328 threads with only 4194304 elements, leaving 1024 threads left over.

So what do those 1024 extra threads do? They still run the kernel code and attempt to access your debug array beyond the end of the allocated space.

This results in undefined behavior in C and in C++.

The customary method to fix this is to pass the problem size to your kernel and add a "thread check" in your kernel code, like this:

__global__ void extractor(const unsigned char* in, unsigned char* out, int* debug, int n)
{
    int idx = (threadIdx.x) + blockDim.x * blockIdx.x ;
    if (idx < n)
      debug[idx] = threadIdx.x; // debug variable is used for debugging
}

which prevents the "extra" threads from doing anything.

If you search here on the cuda tag for "thread check" you will find many other examples of questions like this.

As an example, based on the code pieces you have shown, the following runs without error for me:

$ cat t147.cu
const int width = 2048;
const int height = 2048;
const int BLOCK_SIZE = 1024;
__global__ void extractor(const unsigned char* in, unsigned char* out, int* debug)
{
    int idx = (threadIdx.x) + blockDim.x * blockIdx.x ;
//    int y; int x;
//    int temp_x; int temp_y; int temp_idx;
    int check = width*height;
    if (idx < check) {
        debug[idx] = 1;    // get kernel launch failed "unknown error"
    }
}
int main(int arg, char* args[])
{

    const int w = width;
    const int h = height;
    const int num_sample_per_point = 1;
    int size = w*h; // w is image width and h is image height
    unsigned char *in = 0;
    unsigned char *out = 0;
    int* debug = 0;

    // Allocate GPU buffers for the images
    cudaMalloc((void**)&in, size * sizeof(unsigned char));
    cudaMalloc((void**)&out, num_sample_per_point * size * sizeof(unsigned char));
    cudaMalloc((void**)&debug, size * sizeof(int));

    // Copy image data from host memory to GPU buffers.
//    cudaMemcpy(in, &img_data[0], size * sizeof(unsigned char),cudaMemcpyHostToDevice);

    dim3 b_dim(BLOCK_SIZE, 1, 1);   // (1024, 1, 1)
    dim3 g_dim(int(w*h/BLOCK_SIZE)+1, 1, 1);    // (4097, 1, 1)
    extractor<<<g_dim, b_dim>>>(in, out, debug);
    cudaDeviceSynchronize();
}
$ nvcc -arch=sm_61 -o t147 t147.cu
$ cuda-memcheck ./t147
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$

In your complete code, you simply have an illegal access problem in your kernel. I've modified it to remove the dependency on PNG, and if we omit the kernel code other than the debug setting, it runs fine. However if we include your kernel code, and run with cuda-memcheck we get all sorts of out-of-bounds accesses. In the future, you could use the method described here to debug these:

$ cat t146.cu
#include <cmath>
#include <iostream>
#include <vector>

#define L 3
#define INC1 1
#define INC2 1
#define R_IN 2
#define N_P 4
#define BLOCK_SIZE 1024
#define PI 3.14159265358979323846

using namespace std;

__global__ void extractor(const unsigned char* in, unsigned char* out, int* debug, int* disX, int* disY, int width, int height, int pad, int num_sample)
{
    int idx = (threadIdx.x) + blockDim.x * blockIdx.x ;
    int y; int x;
    int temp_x; int temp_y; int temp_idx;
    int check = width*height;

       if (idx < check) {
        debug[idx] = threadIdx.x;
        y = idx/width;
        x = idx%width;
#ifdef  FAIL
            if ((x < pad) || (x >= (width-pad)) || (y < pad) || (y >= (height-pad))) {
                // need padding
                for (int i = 0; i < num_sample; ++i){
                    temp_x = x + disX[i];
                    temp_y = y + disY[i];

                    if (!((temp_x < 0)||(temp_x > (width-1)) || (temp_y < 0) ||(temp_y>(height-1)))) {
                    temp_idx = temp_y*width + temp_x;   // sampled index
                    out[(idx*num_sample)+i] = in[temp_idx]; // copy sampled value to result
                }
                }
            } else {
                for (int i = 0; i < num_sample; ++i)
                {
                    temp_x = x + disX[i];
                    temp_y = y + disY[i];
                    temp_idx = temp_y*width + temp_x;   // sampled index
                    out[(idx*num_sample)+i] = in[temp_idx]; // copy sampled value to result
                }
            }
#endif
       }
  }

vector<int> getCirclePos() {
    int r = 0;
    vector <int> circlePos;
    while (!(r>(L/2))) {
        circlePos.push_back(r);
        if (r < R_IN) r += INC1;
        else r += INC2;
    }
    cout << "circlePos:" << endl;
    for (auto i = circlePos.begin(); i != circlePos.end(); ++i)
    {//cout << *i << ' ';
      }
    cout << endl;
    return circlePos;
}

int main(int arg, char* args[])
{
    cudaError_t cudaStatus;
    vector<int> circlePos = getCirclePos();

    // get disX, disY
    int num_sample_per_point = circlePos.size() * N_P;
    int* disX = new int[num_sample_per_point];
    int* disY = new int[num_sample_per_point];
    int r; int cnt = 0;
    for (int i = 0; i < circlePos.size(); ++i)
    {
        r = circlePos[i];
        float angle;
        for (int j = 0; j < N_P; ++j)
        {
            angle = j*360.0/N_P;
            disX[cnt] = r*cos(angle*M_PI/180.0);
            disY[cnt] = r*sin(angle*M_PI/180.0);
            // cout nvpro   << disX[cnt] << "|" << disY[cnt]<< endl;

            cnt++;
        }
    }

    const unsigned int w = 2048;
    const unsigned int h = 2048;
    cout << "w: " << w << " h: " << h << endl;
    //4 because there are 4 color channels R, G, B, and A
    int size = w * h;

    unsigned char *in = 0;
    unsigned char *out = 0;
    int* debug = 0;

    // Allocate GPU buffers for the images
    cudaMalloc((void**)&in, size * sizeof(unsigned char));
    cudaMalloc((void**)&out, num_sample_per_point * size * sizeof(unsigned char));
    cudaMalloc((void**)&debug, size * sizeof(int));


    vector<unsigned char> img_data;
    for (int i = 0; i < size; ++i)
    {
        img_data.push_back(0);
    }

    // debug
    cout << "========= img_data ==========" << endl;
    for (int i = 0; i < size; ++i)
    {
 //       cout << int(img_data[i]) << "," ;
    }
    cout << endl;

    // Copy image data from host memory to GPU buffers.
    cudaMemcpy(in, &img_data[0], size * sizeof(unsigned char), cudaMemcpyHostToDevice);


    // Launch a kernel on the GPU with one thread for each element.
    dim3 b_dim(BLOCK_SIZE, 1, 1);   // (1024, 1, 1)
    dim3 g_dim(int(w*h/BLOCK_SIZE)+1, 1, 1);    // (4097, 1, 1)
    int pad = L/2;

    // __global__ void extractor(const unsigned char* in, unsigned char* out, vector<int> disX, vector<int> disY, int width, int height, int pad, int num_sample)
    extractor<<<g_dim, b_dim>>>(in, out, debug, disX, disY, w, h, pad, num_sample_per_point);

    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess)
    {
        std::cout << "Kernel launch failed: " << cudaGetErrorString(cudaStatus) << std::endl;
        cudaFree(in);
        cudaFree(out);
        cudaFree(debug);
        exit(1);
    }

    auto tmp = new unsigned char[size*num_sample_per_point];
    auto tmp_debug = new int [size];

    cudaMemcpy(tmp_debug, debug, size * sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(tmp, out, num_sample_per_point * size * sizeof(unsigned char), cudaMemcpyDeviceToHost);

    cout << "========= out =========" << endl;
    for (int i = 0; i < size*num_sample_per_point; ++i)
    {
   //     cout << int(tmp[i]) << ", ";
    }
    cout << endl;

    cout << "========debug=======" << endl;
    for (int i = 0; i < size; ++i)
    {
     //   cout << tmp_debug[i] << ", ";
    }
    cout << endl;

    cudaFree(in);
    cudaFree(out);
    cudaFree(debug);

    delete[] tmp; delete[] tmp_debug;

    return 0;
}
$ nvcc -std=c++11 -o t146 t146.cu -arch=sm_61 -lineinfo
t146.cu(18): warning: variable "y" was set but never used

t146.cu(18): warning: variable "x" was set but never used

t146.cu(19): warning: variable "temp_x" was declared but never referenced

t146.cu(19): warning: variable "temp_y" was declared but never referenced

t146.cu(19): warning: variable "temp_idx" was declared but never referenced

t146.cu(18): warning: variable "y" was set but never used

t146.cu(18): warning: variable "x" was set but never used

t146.cu(19): warning: variable "temp_x" was declared but never referenced

t146.cu(19): warning: variable "temp_y" was declared but never referenced

t146.cu(19): warning: variable "temp_idx" was declared but never referenced

$ cuda-memcheck ./t146
========= CUDA-MEMCHECK
circlePos:

w: 2048 h: 2048
========= img_data ==========

========= out =========

========debug=======

========= ERROR SUMMARY: 0 errors
$ nvcc -std=c++11 -o t146 t146.cu -arch=sm_61 -lineinfo -DFAIL
$ cuda-memcheck ./t146
...
========= Invalid __global__ read of size 4
=========     at 0x00000418 in /home/ubuntu/bobc/misc/t146.cu:41:extractor(unsigned char const *, unsigned char*, int*, int*, int*, int, int, int, int)
=========     by thread (197,0,0) in block (17,0,0)
=========     Address 0x00c8b290 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x2c5) 
...
(and much more output like this)

The above output points to line 41 in the code, which is reading from disX.

As it turns out, your disX is a host-allocated variable:

int* disX = new int[num_sample_per_point];

but you are attempting to pass it to device code:

extractor<<<g_dim, b_dim>>>(in, out, debug, disX, disY, w, h, pad, num_sample_per_point);
                                            ^^^^

That is just completely broken. You can't do that in CUDA. You need to make a device copy of that variable, and also disY When I fix that problem, the modified code runs without error for me:

$ cat t146.cu
#include <cmath>
#include <iostream>
#include <vector>

#define L 3
#define INC1 1
#define INC2 1
#define R_IN 2
#define N_P 4
#define BLOCK_SIZE 1024
#define PI 3.14159265358979323846

using namespace std;

__global__ void extractor(const unsigned char* in, unsigned char* out, int* debug, int* disX, int* disY, int width, int height, int pad, int num_sample)
{
    int idx = (threadIdx.x) + blockDim.x * blockIdx.x ;
    int y; int x;
    int temp_x; int temp_y; int temp_idx;
    int check = width*height;

       if (idx < check) {
        debug[idx] = threadIdx.x;
        y = idx/width;
        x = idx%width;
#ifdef  FAIL
            if ((x < pad) || (x >= (width-pad)) || (y < pad) || (y >= (height-pad))) {
                // need padding
                for (int i = 0; i < num_sample; ++i){
                    temp_x = x + disX[i];
                    temp_y = y + disY[i];

                    if (!((temp_x < 0)||(temp_x > (width-1)) || (temp_y < 0) ||(temp_y>(height-1)))) {
                    temp_idx = temp_y*width + temp_x;   // sampled index
                    out[(idx*num_sample)+i] = in[temp_idx]; // copy sampled value to result
                }
                }
            } else {
                for (int i = 0; i < num_sample; ++i)
                {
                    temp_x = x + disX[i];
                    temp_y = y + disY[i];
                    temp_idx = temp_y*width + temp_x;   // sampled index
                    out[(idx*num_sample)+i] = in[temp_idx]; // copy sampled value to result
                }
            }
#endif
       }
  }

vector<int> getCirclePos() {
    int r = 0;
    vector <int> circlePos;
    while (!(r>(L/2))) {
        circlePos.push_back(r);
        if (r < R_IN) r += INC1;
        else r += INC2;
    }
    cout << "circlePos:" << endl;
    for (auto i = circlePos.begin(); i != circlePos.end(); ++i)
    {//cout << *i << ' ';
      }
    cout << endl;
    return circlePos;
}

int main(int arg, char* args[])
{
    cudaError_t cudaStatus;
    vector<int> circlePos = getCirclePos();

    // get disX, disY
    int num_sample_per_point = circlePos.size() * N_P;
    int* disX = new int[num_sample_per_point];
    int* disY = new int[num_sample_per_point];
    int r; int cnt = 0;
    for (int i = 0; i < circlePos.size(); ++i)
    {
        r = circlePos[i];
        float angle;
        for (int j = 0; j < N_P; ++j)
        {
            angle = j*360.0/N_P;
            disX[cnt] = r*cos(angle*M_PI/180.0);
            disY[cnt] = r*sin(angle*M_PI/180.0);
            // cout nvpro   << disX[cnt] << "|" << disY[cnt]<< endl;

            cnt++;
        }
    }

    int *d_disX, *d_disY;
    cudaMalloc(&d_disX, num_sample_per_point*sizeof(int));
    cudaMalloc(&d_disY, num_sample_per_point*sizeof(int));
    cudaMemcpy(d_disX, disX, num_sample_per_point*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_disY, disY, num_sample_per_point*sizeof(int), cudaMemcpyHostToDevice);
    const unsigned int w = 2048;
    const unsigned int h = 2048;
    cout << "w: " << w << " h: " << h << endl;
    //4 because there are 4 color channels R, G, B, and A
    int size = w * h;

    unsigned char *in = 0;
    unsigned char *out = 0;
    int* debug = 0;

    // Allocate GPU buffers for the images
    cudaMalloc((void**)&in, size * sizeof(unsigned char));
    cudaMalloc((void**)&out, num_sample_per_point * size * sizeof(unsigned char));
    cudaMalloc((void**)&debug, size * sizeof(int));


    vector<unsigned char> img_data;
    for (int i = 0; i < size; ++i)
    {
        img_data.push_back(0);
    }

    // debug
    cout << "========= img_data ==========" << endl;
    for (int i = 0; i < size; ++i)
    {
 //       cout << int(img_data[i]) << "," ;
    }
    cout << endl;

    // Copy image data from host memory to GPU buffers.
    cudaMemcpy(in, &img_data[0], size * sizeof(unsigned char), cudaMemcpyHostToDevice);


    // Launch a kernel on the GPU with one thread for each element.
    dim3 b_dim(BLOCK_SIZE, 1, 1);   // (1024, 1, 1)
    dim3 g_dim(int(w*h/BLOCK_SIZE)+1, 1, 1);    // (4097, 1, 1)
    int pad = L/2;

    // __global__ void extractor(const unsigned char* in, unsigned char* out, vector<int> disX, vector<int> disY, int width, int height, int pad, int num_sample)
    extractor<<<g_dim, b_dim>>>(in, out, debug, d_disX, d_disY, w, h, pad, num_sample_per_point);

    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess)
    {
        std::cout << "Kernel launch failed: " << cudaGetErrorString(cudaStatus) << std::endl;
        cudaFree(in);
        cudaFree(out);
        cudaFree(debug);
        exit(1);
    }

    auto tmp = new unsigned char[size*num_sample_per_point];
    auto tmp_debug = new int [size];

    cudaMemcpy(tmp_debug, debug, size * sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(tmp, out, num_sample_per_point * size * sizeof(unsigned char), cudaMemcpyDeviceToHost);

    cout << "========= out =========" << endl;
    for (int i = 0; i < size*num_sample_per_point; ++i)
    {
   //     cout << int(tmp[i]) << ", ";
    }
    cout << endl;

    cout << "========debug=======" << endl;
    for (int i = 0; i < size; ++i)
    {
     //   cout << tmp_debug[i] << ", ";
    }
    cout << endl;

    cudaFree(in);
    cudaFree(out);
    cudaFree(debug);

    delete[] tmp; delete[] tmp_debug;

    return 0;
}
$ nvcc -std=c++11 -o t146 t146.cu -arch=sm_61 -lineinfo -DFAIL
$ cuda-memcheck ./t146
========= CUDA-MEMCHECK
circlePos:

w: 2048 h: 2048
========= img_data ==========

========= out =========

========debug=======

========= ERROR SUMMARY: 0 errors
$
Robert Crovella
  • 143,785
  • 11
  • 213
  • 257
  • Hi, thanks for the reply. I've changed the code as you said, but it seems that even if I do the thread check, I can't access the `debug` variable in the kernel. Please see my edit. – jinglei Jun 04 '17 at 05:31
  • You'll need to provide a [mcve] I have no idea what `width` and `height` are. It should be a *complete* code that I can copy and paste and compile, without having to add anything or change anything. – Robert Crovella Jun 04 '17 at 08:57
  • Added to question details. Thanks in advance. – jinglei Jun 04 '17 at 09:00
  • I can't compile that. I don't have PNG.h And anyway, you are supposed to provide a minimal code. – Robert Crovella Jun 04 '17 at 09:13
  • I've added a fully worked example based on the code pieces you originally indicated (which I can compile) indicating that the code runs without error. If you have problems in your much larger code, then you'll need to do some isolation. – Robert Crovella Jun 04 '17 at 09:21
  • 1
    You're passing `disX` and `disY` which are host pointers, to device code. That is a fundamental error in CUDA. – Robert Crovella Jun 04 '17 at 09:44