2

I have this cuda file:

#include "cuda.h"
#include "../../HandleError.h"
#include "Sphere.hpp"
#include <stdlib.h>
#include <CImg.h>

#define WIDTH 1280
#define HEIGHT 720
#define rnd(x) (x*rand()/RAND_MAX)
#define SPHERES_COUNT 5

using namespace cimg_library;

__global__
void kernel(unsigned char* bitmap, Sphere* s)
{
   // Map threadIdx/blockIdx to pixel position
   int x = threadIdx.x + blockIdx.x * blockDim.x;
   int y = threadIdx.y + blockIdx.y * blockDim.y;
   int offset = x + y * blockDim.x * gridDim.x;
   float ox = x - blockDim.x * gridDim.x / 2;
   float oy = y - blockDim.y * gridDim.y / 2;
   float r = 0.2, g = 0.2, b = 0.5;
   float maxz = -INF;
   for (int i = 0; i < SPHERES_COUNT; i++) {
       float n, t = s[i].hit(ox, oy, &n);
       if (t > maxz) {
           float fscale = n;
           r = s[i].r * fscale;
           g = s[i].g * fscale;
           b = s[i].b * fscale;
           maxz = t;
       }
   }

   bitmap[offset*3] = (int)(r * 255);
   bitmap[offset*3 + 1] = (int)(g * 255);
   bitmap[offset*3 + 2] = (int)(b * 255);
}

__constant__ Sphere s[SPHERES_COUNT];

int main ()
{
    //Capture start time
    cudaEvent_t start, stop;
    HANDLE_ERROR(cudaEventCreate(&start));
    HANDLE_ERROR(cudaEventCreate(&stop));
    HANDLE_ERROR(cudaEventRecord(start, 0));

    //Create host bitmap
    CImg<unsigned char> image(WIDTH, HEIGHT, 1, 3);
    image.permute_axes("cxyz");

    //Allocate device bitmap data
    unsigned char* dev_bitmap;
    HANDLE_ERROR(cudaMalloc((void**)&dev_bitmap, image.size()*sizeof(unsigned char)));

    //Generate spheres and copy them on the GPU one by one
    Sphere* temp_s = (Sphere*)malloc(SPHERES_COUNT*sizeof(Sphere));
    for (int i=0; i <SPHERES_COUNT; i++) {
        temp_s[i].r = rnd(1.0f);
        temp_s[i].g = rnd(1.0f);
        temp_s[i].b = rnd(1.0f);
        temp_s[i].x = rnd(1000.0f) - 500;
        temp_s[i].y = rnd(1000.0f) - 500;
        temp_s[i].z = rnd(1000.0f) - 500;
        temp_s[i].radius = rnd(100.0f) + 20;
    }

    HANDLE_ERROR(cudaMemcpyToSymbol(s, temp_s, sizeof(Sphere)*SPHERES_COUNT));
    free(temp_s);

    //Generate a bitmap from spere data
    dim3 grids(WIDTH/16, HEIGHT/16);
    dim3 threads(16, 16);
    kernel<<<grids, threads>>>(dev_bitmap, s);

    //Copy the bitmap back from the GPU for display
    HANDLE_ERROR(cudaMemcpy(image.data(), dev_bitmap,
                            image.size()*sizeof(unsigned char),
                            cudaMemcpyDeviceToHost));

    cudaFree(dev_bitmap);

    image.permute_axes("yzcx");
    image.save("render.bmp");
}

It compiles fine, but when executed I get this error:

an illegal memory access was encountered in main.cu at line 82

that is, here:

    //Copy the bitmap back from the GPU for display
    HANDLE_ERROR(cudaMemcpy(image.data(), dev_bitmap,
                            image.size()*sizeof(unsigned char),
                            cudaMemcpyDeviceToHost));

I cannot understand why... I know that If remove this:

  bitmap[offset*3] = (int)(r * 255);
  bitmap[offset*3 + 1] = (int)(g * 255);
  bitmap[offset*3 + 2] = (int)(b * 255);

The error is not reported, so I thought It may be an out of index error, reported later, but I have An identical version of this program that makes no use of constant memory, and it works fine with the very same version of the kernel function...

manu34
  • 75
  • 1
  • 1
  • 11
  • 2
    Nobody can run your code. You are supposed to provide an [MCVE](http://stackoverflow.com/help/mcve). Follow the process described [here](http://stackoverflow.com/questions/27277365/unspecified-launch-failure-on-memcpy/27278218?s=1|0.3173#27278218) and you may be able to figure it out yourself. – Robert Crovella May 12 '15 at 10:42
  • It is illegal to pass `s` to the kernel in the way your code does (it also makes no sense to do so). That is the source of the problem, I suspect – talonmies May 12 '15 at 13:27
  • @talonmies you were right, the problem is gone if I just use 's' as a global variable, but why can't I pass it as a parameter? Does constant memory reside in an other address-space? BTW this was a conversion of a similar algorithm but that makes no use of constant memory, and s was a parameter, so I forgot to remove it. – manu34 May 12 '15 at 15:18

1 Answers1

4

There are two things at issue here. The first is this:

__constant__ Sphere s[SPHERES_COUNT];

int main ()
{
    ......

    kernel<<<grids, threads>>>(dev_bitmap, s);

    ......

In host code, s is a host memory variable which provides a handle for the CUDA runtime to hook up with the device constant memory symbol. It doesn't contain a valid device pointer and can't be passed to kernel calls. The result is a invalid memory access error.

You could do this:

__constant__ Sphere s[SPHERES_COUNT];

int main ()
{
    ......

    Sphere *d_s;
    cudaGetSymbolAddress((void **)&d_s, s);
    kernel<<<grids, threads>>>(dev_bitmap, d_s);

    ......

which would cause a symbol lookup to get the device address of s, and it would be valid to pass that to the kernel. However, the GPU relies on the compiler emitting specific instructions to access memory through the constant cache. The device compiler will only emit these instructions when it can detect that a __constant__ variable is being accessed within a kernel, which is not possible when using a pointer. You can see more about how the compiler will generate code for constant variable access in this Stack Overflow question and answer.

Community
  • 1
  • 1
talonmies
  • 70,661
  • 34
  • 192
  • 269