-2
#include "opencv2\opencv.hpp"
#include <stdint.h>
#include <stdio.h>
#include <cuda.h>

using namespace cv;
using namespace std;
#define count 200000 


__global__
void SubArrays(int * a, int * b, int size)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;

    if (id < size)
    {
        a[id] -= b[id];
    }

}


int image1[count];
int image2[count];

int main(int argv, char** argc)
{

    Mat im1 = imread("1.jpg", CV_LOAD_IMAGE_GRAYSCALE);
    Mat im2 = imread("2.jpg", CV_LOAD_IMAGE_GRAYSCALE);
    int size = (im1.rows*im1.cols);

    printf("size: %d \n\n\n\n", size);

    int i = 0;

    for (int r = 0; r < im1.rows; r++)
    {
        for (int c =0; c< im1.cols; c++,i++)
        {
            image1[i] = im1.at<uint8_t>(r, c);
            image2[i] = im2.at<uint8_t>(r, c);
        }
    }


    printf("This is first image array's first 5 elements\n\n");
    for (int b = 0; b < 5; b++)
    {
        printf("%d\n",image1[b]);
    }

    printf("This is second image array's first 5 elements\n\n");
    for (int b = 0; b < 5; b++)
    {
        printf("%d\n", image2[b]);
    }

    int * h_a = image1;
    int * h_b = image2;

    int * d_a;
    int * d_b;

    cudaMalloc(&d_a, sizeof(char)*size);
    cudaMalloc(&d_b, sizeof(char)*size);

    cudaMemcpy(d_a, h_a, sizeof(char)*size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, sizeof(char)*size, cudaMemcpyHostToDevice);

    SubArrays << <1, 1024 >> >(d_a, d_b, size);

    cudaMemcpy(h_a, d_a, sizeof(char)*size, cudaMemcpyDeviceToHost);
    printf("calculating result\n\n");

    for (int check = 0; check < size ; check++)
    {
        printf("%d \n", h_a[check]);
    }

    cudaFree(d_a);
    cudaFree(d_b);

    return 0;
}

when my kernel runs it gives first few value i.e the difference correct and then it starts showing old values ? what am I doing wrong ? I am noob although i think these are the numbers of blocks and threads that I have chosen wrong. my gpu CC is 3.2

too honest for this site
  • 12,050
  • 4
  • 30
  • 52

1 Answers1

1

There are at least 2 errors in your code.

  1. As already pointed out, your kernel assumes that there will be 1 thread per pixel. You must launch enough threads to cover all the pixels in your image. We can fix this by increasing the block count.

  2. You are working with an incorrect size for your copy operations. Both your image1 and image2 arrays are specified as int arrays, and your kernel correspondingly accepts int * parameters. You've loaded (apparently) a unsigned char image, but in the process of loading that image, you have converted each pixel from an 8-bit quantity to a 32-bit quantity here:

    int size = (im1.rows*im1.cols);
    
    printf("size: %d \n\n\n\n", size);
    
    int i = 0;
    
    for (int r = 0; r < im1.rows; r++)
    {
        for (int c =0; c< im1.cols; c++,i++)
        {
            image1[i] = im1.at<uint8_t>(r, c);  // this converts 8bit to 32bit
    

    In the above code your size variable is correctly computed, but it now refers to a size of 32-bit (int) quantities, rather than a size of 8-bit (uint8_t) quantities. Therefore, when you do your copy operations between host and device:

    cudaMemcpy(d_a, h_a, sizeof(char)*size, cudaMemcpyHostToDevice);
                         ^^^^^^^^^^^
    

    the use of sizeof(char) is incorrect. You are now handling int quantities, you should be using sizeof(int) everywhere.

The following worked example has those problems fixed, and seems to work correctly, with the openCV dependency removed:

$ cat t1222.cu
#include <stdint.h>
#include <stdio.h>

using namespace std;
#define count 200000


__global__
void SubArrays(int * a, int * b, int size)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;

    if (id < size)
    {
        a[id] -= b[id];
    }

}


int image1[count];
int image2[count];

int main(int argv, char** argc)
{

    int i = 0;
    for (i = 0; i < count; i++)
        {
            image1[i] = 3;
            image2[i] = 1;
        }

    int size = count;
    printf("This is first image array's first 5 elements\n\n");
    for (int b = 0; b < 5; b++)
    {
        printf("%d\n",image1[b]);
    }

    printf("This is second image array's first 5 elements\n\n");
    for (int b = 0; b < 5; b++)
    {
        printf("%d\n", image2[b]);
    }

    int * h_a = image1;
    int * h_b = image2;

    int * d_a;
    int * d_b;

    cudaMalloc(&d_a, sizeof(int)*size);
    cudaMalloc(&d_b, sizeof(int)*size);

    cudaMemcpy(d_a, h_a, sizeof(int)*size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, sizeof(int)*size, cudaMemcpyHostToDevice);

    SubArrays << <(count + 1023)/1024, 1024 >> >(d_a, d_b, size);

    cudaMemcpy(h_a, d_a, sizeof(int)*size, cudaMemcpyDeviceToHost);
    printf("calculating result\n\n");

    for (int check = 0; check < size ; check++)
    {
        if (h_a[check] != 2){printf("mismatch at %d, was: %d  should be 2\n", check, h_a[check]); return -1;}
    }
    printf("Success!\n");
    cudaFree(d_a);
    cudaFree(d_b);

    return 0;
}
$ nvcc -o t1222 t1222.cu
$ cuda-memcheck ./t1222
========= CUDA-MEMCHECK
This is first image array's first 5 elements

3
3
3
3
3
This is second image array's first 5 elements

1
1
1
1
1
calculating result

Success!
========= ERROR SUMMARY: 0 errors
$

I always recommend you use proper cuda error checking any time you are having trouble with a CUDA code (although I did not add it here) and also run your codes with cuda-memcheck (as I have demonstrated above).

Community
  • 1
  • 1
Robert Crovella
  • 143,785
  • 11
  • 213
  • 257