0

I have the following code to copy from a host variable to a __constant__ variable in CUDA

int main(int argc, char **argv){

    int exit_code;

    if (argc < 4) {
        std::cout << "Usage: \n " << argv[0] << " <input> <output> <nColors>" << std::endl;
        return 1;
    }

    Color *h_input;
    int h_rows, h_cols;

    timer1.Start();
    exit_code = readText2RGB(argv[1], &h_input, &h_rows, &h_cols);
    timer1.Stop();
    std::cout << "Reading: " << timer1.Elapsed() << std::endl;

    if (exit_code != SUCCESS){
        std::cout << "Error trying to read file." << std::endl;
        return FAILURE;
    }

    CpuTimer timer1;
    GpuTimer timer2;
    float timeStep2 = 0, timeStep3 = 0;

    int h_numColors = atoi(argv[3]);

    int h_change = 0;
    int *h_pixelGroup = new int[h_rows*h_cols];
    Color *h_groupRep = new Color[h_numColors];
    Color *h_output = new Color[h_rows*h_cols];

    Color *d_input;
    int *d_pixelGroup;
    Color *d_groupRep;
    Color *d_output;

    dim3 block(B_WIDTH, B_HEIGHT);
    dim3 grid((h_cols+B_WIDTH-1)/B_WIDTH, (h_rows+B_HEIGHT-1)/B_HEIGHT);

    checkCudaError(cudaMalloc((void**)&d_input, sizeof(Color)*h_rows*h_cols));
    checkCudaError(cudaMalloc((void**)&d_pixelGroup, sizeof(int)*h_rows*h_cols));
    checkCudaError(cudaMalloc((void**)&d_groupRep, sizeof(Color)*h_numColors));
    checkCudaError(cudaMalloc((void**)&d_output, sizeof(Color)*h_rows*h_cols));

    //       STEP 1
    //Evenly distribute all pixels of the image onto the color set
    timer2.Start();
    checkCudaError(cudaMemcpyToSymbol(c_rows, &h_rows, sizeof(int)));
    checkCudaError(cudaMemcpyToSymbol(c_cols, &h_cols, sizeof(int)));
    checkCudaError(cudaMemcpyToSymbol(c_numColors, &h_numColors, sizeof(int)));
    checkCudaError(cudaMemcpy(d_input, h_input, sizeof(Color)*h_rows*h_cols, cudaMemcpyHostToDevice));

    clut_distributePixels<<<grid, block>>>(d_pixelGroup);
    checkCudaError(cudaMemcpy(h_pixelGroup, d_pixelGroup, sizeof(int)*h_rows*h_cols, cudaMemcpyDeviceToHost));
    timer2.Stop();
    std::cout << "Phase 1: " << timer2.Elapsed() << std::endl;

    std::cout << h_pixelGroup[0] << ","
                << h_pixelGroup[3] << ","
                << h_pixelGroup[4] << ","
                << h_pixelGroup[7] << ","
                << h_pixelGroup[8] << std::endl;

    //Do the STEP 2 and STEP 3 as long as there is at least one change of representative in a group
    do {
        //      STEP 2
        //Set the representative value to the average colour of all pixels in the same set
        timer1.Start();
        for (int ng = 0; ng < h_numColors; ng++) {
            int r = 0, g = 0, b = 0;
            int elem = 0;
            for (int i = 0; i < h_rows; i++) {
                for (int j = 0; j < h_cols; j++) {
                    if (h_pixelGroup[i*h_cols+j] == ng) {
                        r += h_input[i*h_cols+j].r;
                        g += h_input[i*h_cols+j].g;
                        b += h_input[i*h_cols+j].b;
                        elem++;
                    }
                }
            }
            if (elem == 0) {
                h_groupRep[ng].r = 255;
                h_groupRep[ng].g = 255;
                h_groupRep[ng].b = 255;
            }else{
                h_groupRep[ng].r = r/elem;
                h_groupRep[ng].g = g/elem;
                h_groupRep[ng].b = b/elem;
            }
        }
        timer1.Stop();
        timeStep2 += timer1.Elapsed();

        //      STEP 3
        //For each pixel in the image, compute Euclidean's distance to each representative
        //and assign it to the set which is closest
        h_change = 0;

        timer2.Start();
        checkCudaError(cudaMemcpyToSymbol(d_change, &h_change, sizeof(int)));
        checkCudaError(cudaMemcpy(d_groupRep, h_groupRep, sizeof(Color)*h_numColors, cudaMemcpyHostToDevice));

        clut_checkDistances<<<grid, block>>>(d_input, d_pixelGroup, d_groupRep);
        checkCudaError(cudaMemcpy(h_pixelGroup, d_pixelGroup, sizeof(int)*h_rows*h_cols, cudaMemcpyDeviceToHost));
        checkCudaError(cudaMemcpyFromSymbol(&h_change, d_change, sizeof(int)));
        timer2.Stop();
        timeStep3 += timer2.Elapsed();

        std::cout << "Chunche" << std::endl;

    } while (h_change == 1);

    std::cout << "Phase 2: " << timeStep2 << std::endl;
    std::cout << "Phase 3: " << timeStep3 << std::endl;

    //      STEP 4
    //Create the new image with the resulting color lookup table
    timer2.Start();
    clut_createImage<<<grid, block>>>(d_output, d_pixelGroup, d_groupRep);
    checkCudaError(cudaMemcpy(h_output, d_output, sizeof(Color)*h_rows*h_cols, cudaMemcpyDeviceToHost));
    timer2.Stop();
    std::cout << "Phase 4: " << timer2.Elapsed() << std::endl;

    checkCudaError(cudaFree(d_input));
    checkCudaError(cudaFree(d_pixelGroup));
    checkCudaError(cudaFree(d_groupRep));
    checkCudaError(cudaFree(d_output));

    timer1.Start();
    exit_code = writeRGB2Text(argv[2], h_input, h_rows, h_cols);
    timer1.Stop();
    std::cout << "Writing: " << timer1.Elapsed() << std::endl;

    delete[] h_pixelGroup;
    delete[] h_groupRep;
    delete[] h_output;

    return SUCCESS;
}

when I print from within the kernel I get zeros for the three values

__global__
void clut_distributePixels(int *pixelGroup){
    int i = blockDim.y * blockIdx.y + threadIdx.y;
    int j = blockDim.x * blockIdx.x + threadIdx.x;

    if(i == 0 && j == 0){
        printf("a: %d\n", c_rows);
        printf("b: %d\n", c_cols);
        printf("c: %d\n", c_numColors);
    }

    while (i < c_rows) {
        while (j < c_cols) {
            pixelGroup[i*c_cols+j] = (i*c_cols+j)/c_numColors;
            j += gridDim.x * blockDim.x;
        }
        j = blockDim.x * blockIdx.x + threadIdx.x;
        i += gridDim.y * blockDim.y;
    }

}

Either I am not copying correctly to constant memory or ... I don't know what could be wrong. Any advise !? I posted the entire host code probably something else is messing with the constant copies.

UPDATE

Main.cu

#include "Imageproc.cuh"
int main(){
  int h_change = 0;
  int h_rows = 512;
  cudaMemcpyToSymbol(c_rows, &h_rows, sizeof(int));
  chunche<<<1,1>>>();
  cudaMemcpyFromSymbol(&h_change, d_change, sizeof(int));

  std::cout << "H = " << h_change << std::endl;
  return 0
}

Imageproc.cuh

#ifndef _IMAGEPROC_CUH_
#define _IMAGEPROC_CUH_

#include "Utilities.cuh"

#define B_WIDTH     16
#define B_HEIGHT    16

__constant__ int c_rows;
__constant__ int c_cols;
__constant__ int c_numColors;

__device__ int d_change;

    #ifdef __cplusplus
        extern "C"
        {
    #endif
        __global__
        void chunche();
        __global__
        void clut_distributePixels(int *pixelGroup);
        __global__
        void clut_checkDistances(Color *input, int *pixelGroup, Color *groupRep);
        __global__
        void clut_createImage(Color *clutImage, int *pixelGroup, Color *groupRep);
    #ifdef __cplusplus
        }
    #endif

#endif

Imageproc.cu

#include "Imageproc.cuh"

__global__
void chunche(){
    d_change = c_rows + 1;
}

__global__
void clut_distributePixels(int *pixelGroup){
    int i = blockDim.y * blockIdx.y + threadIdx.y;
    int j = blockDim.x * blockIdx.x + threadIdx.x;

    while (i < c_rows) {
        while (j < c_cols) {
            pixelGroup[i*c_cols+j] = (i*c_cols+j)/c_numColors;
            j += gridDim.x * blockDim.x;
        }
        j = blockDim.x * blockIdx.x + threadIdx.x;
        i += gridDim.y * blockDim.y;
    }

}

__global__
void clut_checkDistances(Color *input, int *pixelGroup, Color *groupRep){
    int i = blockDim.y * blockIdx.y + threadIdx.y;
    int j = blockDim.x * blockIdx.x + threadIdx.x;
    int newGroup;

    while (i < c_rows) {
        while (j < c_cols) {
            newGroup = 0;
            for (int ng = 1; ng < c_numColors; ng++) {
                if (
                    /*If distance from color to group ng is less than distance from color to group idx
                     then color should belong to ng*/
                    (groupRep[ng].r-input[i*c_cols+j].r)*(groupRep[ng].r-input[i*c_cols+j].r) +
                    (groupRep[ng].g-input[i*c_cols+j].g)*(groupRep[ng].g-input[i*c_cols+j].g) +
                    (groupRep[ng].b-input[i*c_cols+j].b)*(groupRep[ng].b-input[i*c_cols+j].b)
                    <
                    (groupRep[newGroup].r-input[i*c_cols+j].r)*(groupRep[newGroup].r-input[i*c_cols+j].r)+
                    (groupRep[newGroup].g-input[i*c_cols+j].g)*(groupRep[newGroup].g-input[i*c_cols+j].g)+
                    (groupRep[newGroup].b-input[i*c_cols+j].b)*(groupRep[newGroup].b-input[i*c_cols+j].b)
                    )
                {
                    newGroup = ng;
                }
            }

            if (pixelGroup[i*c_cols+j] != newGroup) {
                pixelGroup[i*c_cols+j] = newGroup;
                d_change = 1;
            }

            j += gridDim.x * blockDim.x;
        }
        j = blockDim.x * blockIdx.x + threadIdx.x;
        i += gridDim.y * blockDim.y;
    }

}

__global__
void clut_createImage(Color *clutImage, int *pixelGroup, Color *groupRep){
    int i = blockDim.y * blockIdx.y + threadIdx.y;
    int j = blockDim.x * blockIdx.x + threadIdx.x;

    while (i < c_rows) {
        while (j < c_cols) {
            clutImage[i*c_cols+j].r = groupRep[pixelGroup[i*c_cols+j]].r;
            clutImage[i*c_cols+j].g = groupRep[pixelGroup[i*c_cols+j]].g;
            clutImage[i*c_cols+j].b = groupRep[pixelGroup[i*c_cols+j]].b;
            j += gridDim.x * blockDim.x;
        }
        j = blockDim.x * blockIdx.x + threadIdx.x;
        i += gridDim.y * blockDim.y;
    }
}

Utilities.cuh

#ifndef _UTILITIES_CUH_
#define _UTILITIES_CUH_

#include <iostream>
#include <fstream>
#include <string>

#define SUCCESS     1
#define FAILURE     0

#define checkCudaError(val) check( (val), #val, __FILE__, __LINE__)

typedef struct {
    int r;
    int g;
    int b;
} vec3u;

typedef vec3u Color;
typedef unsigned char uchar;
typedef uchar Grayscale;

struct GpuTimer{
    cudaEvent_t start;
    cudaEvent_t stop;
    GpuTimer(){
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
    }
    ~GpuTimer(){
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
    }
    void Start(){
        cudaEventRecord(start, 0);
    }
    void Stop(){
        cudaEventRecord(stop, 0);
    }
    float Elapsed(){
        float elapsed;
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&elapsed, start, stop);
        return elapsed;
    }
};

template<typename T>
void check(T err, const char* const func, const char* const file, const int line) {
    if (err != cudaSuccess) {
        std::cerr << "CUDA error at: " << file << ":" << line << std::endl;
        std::cerr << cudaGetErrorString(err) << " " << func << std::endl;
        exit(1);
    }
}

int writeGrayscale2Text(const std::string filename, const Grayscale *image, const int rows, const int cols);
int readText2Grayscale(const std::string filename, Grayscale **image, int *rows, int *cols);

int writeRGB2Text(const std::string filename, const Color *image, const int rows, const int cols);
int readText2RGB(const std::string filename, Color **image, int *rows, int *cols);

struct CpuTimer{
    clock_t start;
    clock_t stop;
    void Start(){
        start = clock();
    }
    void Stop(){
        stop = clock();
    }
    float Elapsed(){
        return ((float)stop-start)/CLOCKS_PER_SEC * 1000.0f;
    }
};

#endif

Utilities.cu

#include "Utilities.cuh"

int writeGrayscale2Text(const std::string filename, const Grayscale *image, const int rows, const int cols){    
    std::ofstream fileWriter(filename.c_str());
    if (!fileWriter.is_open()) {
        std::cerr << "** writeGrayscale2Text() ** : Unable to open file." << std::endl;
        return FAILURE;
    }
    fileWriter << rows << "\n";
    fileWriter << cols << "\n";
    for (int i = 0; i < rows; i++) {
        for (int j = 0; j < cols; j++) {
            fileWriter << (int)image[i*cols+j] << "\n";
        }
    }
    fileWriter.close();
    return SUCCESS;
}

int readText2Grayscale(const std::string filename, Grayscale **image, int *rows, int *cols){
    std::ifstream fileReader(filename.c_str());
    if (!fileReader.is_open()) {
        std::cerr << "** readText2Grayscale() ** : Unable to open file." << std::endl;
        return FAILURE;
    }
    fileReader >> *rows;
    fileReader >> *cols;
    *image = new Grayscale[(*rows)*(*cols)];
    int value;
    for (int i = 0; i < *rows; i++) {
        for (int j = 0; j < *cols; j++) {
            fileReader >> value;
            (*image)[i*(*cols)+j] = (Grayscale)value;
        }
    }
    fileReader.close();
    return SUCCESS;
}

int writeRGB2Text(const std::string filename, const Color *image, const int rows, const int cols){
    std::ofstream fileWriter(filename.c_str());
    if (!fileWriter.is_open()) {
        std::cerr << "** writeRGB2Text() ** : Unable to open file." << std::endl;
        return FAILURE;
    }
    fileWriter << rows << "\n";
    fileWriter << cols << "\n";
    for (int k = 0; k < 3; k++) {
        for (int i = 0; i < rows; i++) {
            for (int j = 0; j < cols; j++) {
                switch (k) {
                    case 0:
                        fileWriter << image[i*cols+j].r << "\n";
                        break;
                    case 1:
                        fileWriter << image[i*cols+j].g << "\n";
                        break;
                    case 2:
                        fileWriter << image[i*cols+j].b << "\n";
                        break;
                }
            }
        }
    }
    fileWriter.close();
    return SUCCESS;
}

int readText2RGB(const std::string filename, Color **image, int *rows, int *cols){
    std::ifstream fileReader(filename.c_str());
    if (!fileReader.is_open()) {
        std::cerr << "** readText2Grayscale() ** : Unable to open file." << std::endl;
        return FAILURE;
    }
    fileReader >> *rows;
    fileReader >> *cols;
    *image = new Color[(*rows)*(*cols)];
    for (int k = 0; k < 3; k++) {
        for (int i = 0; i < *rows; i++) {
            for (int j = 0; j < *cols; j++) {
                switch (k) {
                    case 0:
                        fileReader >> (*image)[i*(*cols)+j].r;
                        break;
                    case 1:
                        fileReader >> (*image)[i*(*cols)+j].g;
                        break;
                    case 2:
                        fileReader >> (*image)[i*(*cols)+j].b;
                        break;
                }
            }
        }
    }
    fileReader.close();
    return SUCCESS;
}
einpoklum
  • 118,144
  • 57
  • 340
  • 684
BRabbit27
  • 6,333
  • 17
  • 90
  • 161
  • Where and how are `c_rows`, `c_cols`, `c_numColors`, `d_change` declared? – sgarizvi Jun 08 '13 at 13:23
  • They are in a header file that is included in where the main function is. The declaration is `__constant__ int c_rows` `__constant__ int c_cols` `__constant__ int c_numColors`. Actually, the problem is not at compile time but at runtime. – BRabbit27 Jun 08 '13 at 13:26
  • Can't seem to find any problem with `cudaMemcpyToSymbol`. Are these `__constant__` integers used in any of the other kernels? – sgarizvi Jun 08 '13 at 13:38
  • I don't seem to find any error either. Yes, this `__constant__` values are used in other kernels. – BRabbit27 Jun 08 '13 at 13:40
  • Check if `readText2RGB` is returning correct values in `h_rows` and `h_cols`. – sgarizvi Jun 08 '13 at 13:49
  • Yes it does. In fact, check the update in the question and see that not even with a simpler code works. – BRabbit27 Jun 08 '13 at 13:53
  • For what it is worth I posted the complete code I'm sure there's something messing with the `constant` values and the `device` variable, but I cannot see what's wrong. I tried the simpler code but still is showing 0 as the result. The same simpler code I tried it in a new project and worked nicely. – BRabbit27 Jun 08 '13 at 14:02
  • @talonmies This **IS** actual code and actual filenames. I made a mistake writing the name of the file. If you are not going to help, just avoid posting. – BRabbit27 Jun 08 '13 at 14:56

1 Answers1

6

Constant memory has implicit local scope linkage - answer to this on stack overflow. This means that the cudaMemcpyToSymbol have to be in the same generated .obj file of the kernel where you want to use it. You do your memcopy in Main.cu, but the kernel where you use your canstant memory is in Imageproc.cu. So for the constant values are unknown for the kernel chunche.

A option to solve you're problem can be, to implement a wrapper. Just add a function in Imagepro.cu where you do the cudaMemcpyToSymbol and call the wrapper in Main.cu and pass your desired values for the constant memory in there.

Community
  • 1
  • 1
hubs
  • 1,779
  • 13
  • 19