Cuda Writing to an array on the device doesn't change value

Question

I have a 1D float3 pixel array. For testing, I'm trying to set an array value on the device. I don't get an error, but when I print the array value it says 0.

This is my device code.

// Test kernel: each thread stores one entry into pixeld_d at
// threadIdx.x + W * blockIdx.x and prints that entry's x component.
// NOTE(review): `float3 *pixeld_d[]` decays to `float3 **`, so every element
// of pixeld_d is a pointer to float3, not a float3 value.
__global__ void addKernel(float3 *pixeld_d[])
{
        // NOTE(review): this stores the address of the temporary returned by
        // make_float3, not the float3 value itself.
        pixeld_d[threadIdx.x + W *blockIdx.x] = &make_float3(255, 30, 123);
        // NOTE(review): float3::x is a float but is formatted with %d here.
        printf("\n Block %d Thread %d Pixeld_d %d",blockIdx.x,threadIdx.x, pixeld_d[threadIdx.x + W * blockIdx.x]->x);
}

My host code:

        // Host array of W*H float3 values; its contents are not initialized here.
        float3* pixeld = new float3[W*H];
        // Host array of W*H float3 pointers.
        // NOTE(review): the next line overwrites this pointer, so this allocation is never used or freed.
        float3** pixeld_d = new float3*[W*H];
        // Allocates W*H*sizeof(float3) bytes of managed memory and stores the address in pixeld_d.
        // NOTE(review): the size is computed for float3 elements while pixeld_d is declared as float3**.
        status = cudaMallocManaged((void **)&pixeld_d,(W*H)*sizeof(float3));
        // Copies W*H float3 values from pixeld into the managed buffer.
        status = cudaMemcpy(pixeld_d,pixeld, (W*H) * sizeof(float3), cudaMemcpyHostToDevice);
        // Launches W blocks of H threads each.
        addKernel << <W,H >> > (pixeld_d);

In the console I get results like this:

 Block 811 Thread 25 Pixeld_d 0

I expect Pixeld_d to be 255 but it is 0.

Here is the full code (the commented-out code is disabled because I removed some things from the function call and VS would otherwise give me build errors):

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <SFML/Graphics.hpp>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
// Wraps a CUDA runtime call (or any cudaError_t expression) and reports a
// failure together with the file and line of the call site.
// The do/while(0) form makes the macro behave as a single statement, so it is
// safe inside an unbraced if/else.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

/*
 * Reports a CUDA error on stderr and optionally terminates the program.
 *
 * code  - status returned by a CUDA runtime call
 * file  - source file of the call site (supplied by gpuErrchk)
 * line  - source line of the call site (supplied by gpuErrchk)
 * abort - when true (the default), exit with `code` as the process status
 *         after printing; when false, only print and return
 *
 * Does nothing when code == cudaSuccess.
 */
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess)
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      // The flag was previously accepted but ignored, so execution continued
      // after a failed call and later calls failed in confusing ways.
      if (abort)
      {
         exit(code);
      }
   }
}

// Image width in pixels; also used as the window width in main.
#define W 960
// Image height in pixels; also used as the window height in main.
#define H 540
// Offset changed by the A/D keys in main; nothing in this listing reads it yet.
int mov;
// Alias so vector code can use a generic name; expands to CUDA's float3.
#define Vector3 float3
//,Sphere sphere,Sphere light
#pragma region MyRegion




// Dot product of two 3-component vectors.
// The sum is evaluated in single precision (the components are floats) and
// then widened to double for the return value.
__device__ inline double dot(const Vector3& a, const Vector3& b) {
    const float sum = a.x*b.x + a.y*b.y + a.z*b.z;
    return sum;
}


// Sphere given by a center point and a radius.
// Execution-space qualifiers belong on the member functions, not on the
// struct declaration (where they have no effect). Marking the members
// __host__ __device__ lets both host code and kernels construct and use a
// Sphere.
struct Sphere
{
    Vector3 c;  // center
    float r;    // radius

    // i: center, j: radius.
    __host__ __device__ Sphere(Vector3 i, float j) : c(i), r(j) {}

    // Returns (pi - c) / r. For a point pi on the surface this is the
    // unit-length outward normal; no normalization is done for other points.
    __host__ __device__ Vector3 getNormal(const Vector3& pi) const
    {
        return make_float3((pi.x - c.x) / r,
                           (pi.y - c.y) / r,
                           (pi.z - c.z) / r);
    }

};


// Integer RGB colour.
// The execution-space qualifiers are placed on the constructor; on the struct
// declaration itself they have no effect and leave the constructor host-only.
struct Color
{
    int r, g, b;

    // Each float component is truncated toward zero when stored; the casts
    // make that narrowing explicit instead of relying on implicit conversion.
    __host__ __device__ Color(float a, float e, float t)
        : r(static_cast<int>(a)), g(static_cast<int>(e)), b(static_cast<int>(t)) {}
};
#pragma endregion
/*
 * Fills the W x H pixel buffer with a constant test colour.
 *
 * Launch layout: 1D grid of 1D blocks. Any configuration works as long as
 * gridDim.x * blockDim.x >= W * H; surplus threads return immediately.
 * No shared memory is used.
 *
 * pixeld_d - W * H float3 elements that the device can write. main reads
 *            pixel (x, y) from index y * W + x.
 */
__global__ void addKernel(float3 *pixeld_d)
{
    // Flat global index. Using blockDim.x (rather than a hard-coded W) keeps
    // the index inside the buffer for any launch configuration.
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= W * H)
        return;

    // Store the value itself: the buffer holds float3 elements, not pointers,
    // and the address of a temporary must never be stored.
    pixeld_d[idx] = make_float3(255.0f, 30.0f, 123.0f);

    // Debug output from one thread only (device printf is slow and its buffer
    // is limited). float3::x is a float, so it is printed with %f.
    if (idx == 0)
        printf("\n Block %d Thread %d Pixeld_d %f", blockIdx.x, threadIdx.x, pixeld_d[idx].x);

    /* Disabled ray-sphere shading, kept verbatim from the earlier version.
       Re-enabling it requires `sphere` and `light` kernel parameters, and the
       `pixeld_d[...] = &black` / `pixeld_d[...]->x` accesses must be rewritten
       as plain float3 element accesses for this buffer type.
        float3 black = make_float3(0, 0, 0);
        float3 red = make_float3(255, 0, 0);
        float3 white = make_float3(255, 255, 255);
        pixeld_d[threadIdx.y] = &black;
        float3 o = make_float3(blockIdx.x, threadIdx.x, 0);
        float3 d = make_float3(0, 0, 1);
        double t = 20000;
        const Vector3 oc = make_float3(o.x - sphere.c.x, o.y - sphere.c.y, o.z - sphere.c.z);
        const double b = 2 * dot(oc, d);
        const double c = dot(oc, oc) - sphere.r * sphere.r;
        double disc = b * b - 4 * c;
        if (!disc < 1e-4)
        {
            disc = sqrt(disc);
            const double t0 = -b - disc;
            const double t1 = -b + disc;
            t = (t0 < t1) ? t0 : t1;
            Vector3 pi = make_float3(o.x + make_float3(d.x * t,d.y * t, d.z * t).x, o.y + make_float3(d.x * t, d.y * t, d.z * t).y,o.z + make_float3(d.x * t, d.y * t, d.z * t).z);
            Vector3 L = make_float3(light.c.x - pi.x, light.c.y - pi.y, light.c.z - pi.z);
            Vector3 N = make_float3(make_float3(pi.x - sphere.c.x, pi.y - sphere.c.y, pi.z - sphere.c.z).x / sphere.r, make_float3(pi.x - sphere.c.x, pi.y - sphere.c.y, pi.z - sphere.c.z).y / sphere.r, make_float3(pi.x - sphere.c.x, pi.y - sphere.c.y, pi.z - sphere.c.z).z / sphere.r);
            double mg = sqrt(L.x*L.x + L.y * L.y + L.z * L.z);
            float3 Lf = make_float3(L.x / mg, L.y / mg, L.z / mg);
            mg = sqrt(N.x*N.x + N.y * N.y + N.z * N.z);
            float3 Nf = make_float3(N.x / mg, N.y / mg, N.z / mg);
            float dt = dot(Lf,Nf);
            int r = (red.x + white.x * dt)*0.5;
            int g = (red.y + white.y * dt)*0.5;
            int b = (red.z + white.z * dt)*0.5;
            if (r < 0)
                r = 0;
            if (g < 0)
                g = 0;
            if (b < 0)
                b = 0;
            pixeld_d[threadIdx.y]->x = r;
            pixeld_d[threadIdx.y]->y = g;
            pixeld_d[threadIdx.y]->z = b;

        }
    */
}

// Opens a W x H window and, every frame, runs addKernel on the GPU and shows
// the resulting pixel buffer. Returns 0 once the window has been closed.
int main()
{
    sf::RenderWindow window(sf::VideoMode(W, H), "SFML works!");
    sf::Image image;
    image.create(W, H, sf::Color::Black);
    sf::Texture tex;
    sf::Sprite sprite;

    const int pixelCount = W * H;

    // All GPU allocations are made once, before the render loop. Allocating
    // per frame without freeing leaked memory on every iteration.
    // sphere and light are reserved for the disabled shading code and are not
    // yet passed to the kernel.
    Sphere *sphere = NULL;
    Sphere *light = NULL;
    gpuErrchk( cudaMalloc(&sphere, sizeof(Sphere)) );
    gpuErrchk( cudaMalloc(&light, sizeof(Sphere)) );

    // A single managed buffer of float3 values, visible to host and device.
    // No separate host copy or cudaMemcpy is needed. Using the typed overload
    // (no C-style cast) lets the compiler reject a mismatched pointer type.
    float3 *pixeld_d = NULL;
    gpuErrchk( cudaMallocManaged(&pixeld_d, pixelCount * sizeof(float3)) );

    // One thread per pixel; the block count is rounded up so every pixel is
    // covered (the kernel bounds-checks the surplus threads).
    const int threadsPerBlock = 256;
    const int blocks = (pixelCount + threadsPerBlock - 1) / threadsPerBlock;

    while (window.isOpen())
    {
        // Drain the event queue so the window stays responsive and can be
        // closed; without this the loop never terminates.
        sf::Event event;
        while (window.pollEvent(event))
        {
            if (event.type == sf::Event::Closed)
                window.close();
        }

        if (sf::Keyboard::isKeyPressed(sf::Keyboard::A))
        {
            mov -= 3;
        }
        if (sf::Keyboard::isKeyPressed(sf::Keyboard::D))
        {
            mov += 3;
        }
        window.clear();

        addKernel<<<blocks, threadsPerBlock>>>(pixeld_d);
        gpuErrchk( cudaPeekAtLastError() );     // launch-configuration errors
        // Execution errors surface here. The host must also wait for the
        // kernel to finish before it reads the managed buffer.
        gpuErrchk( cudaDeviceSynchronize() );

        // Debug probe of one pixel written by the kernel.
        std::cout << pixeld_d[399359].x;

        // Copy the buffer into the image, pixel (x, y) at index y * W + x.
        for (int y = 0; y < H; y++)
        {
            for (int x = 0; x < W; x++)
            {
                const float3 &src = pixeld_d[y * W + x];
                sf::Color pixel;
                pixel.r = static_cast<unsigned char>(src.x);
                pixel.g = static_cast<unsigned char>(src.y);
                pixel.b = static_cast<unsigned char>(src.z);
                image.setPixel(x, y, pixel);
            }
        }
        tex.loadFromImage(image);
        sprite.setTexture(tex, true);
        window.draw(sprite);
        window.display();
    }

    gpuErrchk( cudaFree(pixeld_d) );
    gpuErrchk( cudaFree(light) );
    gpuErrchk( cudaFree(sphere) );

//,*sphere,*light

    return 0;
}
´´´

Please provide a [mcve]. These effects are often due to wrong kernel calls. Also, do proper error checking. — Ander Biguri, Mar 26 '19 at 14:07
https://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api — Ander Biguri, Mar 26 '19 at 14:11
@AnderBiguri because pixeld_d is a float3 *. Thanks for helping me — Frieder Hannenheim, Mar 26 '19 at 14:53
Let us [continue this discussion in chat](https://chat.stackoverflow.com/rooms/190707/discussion-between-frieder-muller-and-ander-biguri). — Frieder Hannenheim, Mar 26 '19 at 15:12
We want **just enough** code to reliably reproduce the problem - i.e. a [mcve]. You'll need to create a **minimal** program, and [edit] it into your question. — Toby Speight, Mar 26 '19 at 16:54

Michael Kenzel · Answer 1 · 2019-03-26T16:12:42.113

Your program has undefined behavior. Due to array decay, this

__global__ void addKernel(float3 *pixeld_d[])

is equivalent to

__global__ void addKernel(float3 **pixeld_d)

So you have declared your kernel function to take a pointer to a pointer to a float3 as input argument. I'm speculating here, but I would guess that this is most likely what originally caused you to introduce all the following issues in an attempt to make the compiler shut up and compile the code. What you actually wanted to write is

__global__ void addKernel(float3 *pixeld_d)

i.e., pass your kernel a pointer to an array of float3 into which it should write the result.

On the host side, you have your pixeld_d, which is a pointer to an array of pointers to float3 initialized to point to a dynamically-allocated array of pointers

    float3** pixeld_d = new float3*[W*H];

I'm speculating again, but most likely, you actually wanted this to be just a float3*, but the compiler wouldn't allow you to use that as an argument in your kernel call. Right after that, you immediately overwrite that pointer with the result of a device memory allocation, leaking the previously allocated host memory in the process:

    status = cudaMallocManaged((void **)&pixeld_d,(W*H)*sizeof(float3));

Note that the types don't match here. You allocate a buffer for an array of float3 (presumably because that's what you actually wanted) rather than an array of float3*, which is what the types you're using at this point would mandate. &pixeld_d is actually a float3***. So the compiler would have caught your mistake right there, but you forced the compiler to shut up with a C-style cast. This is the first place where you invoke undefined behavior. Unfortunately, this kind of error will typically not result in a crash, and your program will just appear to keep working.

You then go ahead and launch your kernel, which performs the following operation:

    pixeld_d[threadIdx.x + W *blockIdx.x] = &make_float3(255, 30, 123);

Here, you're attempting to assign the address of a temporary object (the result of make_float3()) to each element of your float3 array. I'm not sure how you managed to compile this code as it's not legal C++ and any C++ compiler (nvcc included) should refuse to compile it. Even if you did somehow manage to compile this: These temporary objects will automatically be destroyed at the end of this line and the pointers you got there wouldn't point to a valid object anymore. I'm speculating again, but I would assume that this was also just done in an attempt to make the compiler shut up due to the mismatching types. pixeld_d[i] is actually a float3* rather than a float3 because the type of pointer you're using here doesn't match the type of buffer you're actually trying to use.

The moral of the story: Don't just make arbitrary changes to your code until the compiler shuts up. Try to understand why it's refusing to compile the code. Usually, the reason is that one is trying to do something that doesn't make sense. Change the code only once you have understood what the problem was and how to fix it…and don't use C-style casts in C++…

That's great, it's always good to figure out stuff on your own :-) I'll just leave this here as an explanation anyways. — Michael Kenzel, Mar 26 '19 at 16:13

score 0 · Answer 2 · answered Mar 26 '19 at 16:08

0

I had to remove * in __global__ void addKernel(float3 *pixeld_d[]) and remove the & in front of make_float3

answered Mar 26 '19 at 16:08

Frieder Hannenheim

1,144
1
7
11

Cuda Writing to an array on the device doesn't change value

2 Answers2