Why are only the first few thousand results of my CUDA kernel correct, while the rest still show the old values?

Question

#include "opencv2\opencv.hpp"
#include <stdint.h>
#include <stdio.h>
#include <cuda.h>

using namespace cv;
using namespace std;
#define count 200000 


// In-place element-wise subtraction: a[k] = a[k] - b[k] for every k < size.
// Each thread handles exactly one element, chosen by its flat 1D global
// index, so the launch has to provide at least `size` threads in total.
// Threads whose index is at or beyond `size` return without touching memory.
__global__
void SubArrays(int * a, int * b, int size)
{
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;

    if (gid >= size)
    {
        return;
    }

    a[gid] = a[gid] - b[gid];
}


// Host-side pixel buffers, one int per pixel; capacity is fixed at `count`
// elements. image1 also receives the result copied back from the device.
int image1[count];
int image2[count];

// Loads two grayscale images, flattens them into image1/image2, subtracts
// the second from the first on the GPU via SubArrays, and prints every
// element of the result. Always returns 0.
// NOTE(review): none of the CUDA runtime return codes are checked.
int main(int argv, char** argc)
{

    Mat im1 = imread("1.jpg", CV_LOAD_IMAGE_GRAYSCALE);
    Mat im2 = imread("2.jpg", CV_LOAD_IMAGE_GRAYSCALE);
    // Number of pixels (elements) in im1, not a byte count.
    int size = (im1.rows*im1.cols);

    printf("size: %d \n\n\n\n", size);

    int i = 0;

    // Flatten both images row by row; each uint8_t pixel is widened to int
    // when it is stored. im2 is indexed with im1's dimensions.
    // NOTE(review): nothing here checks that the images loaded, that im2
    // has the same dimensions as im1, or that size <= count.
    for (int r = 0; r < im1.rows; r++)
    {
        for (int c =0; c< im1.cols; c++,i++)
        {
            image1[i] = im1.at<uint8_t>(r, c);
            image2[i] = im2.at<uint8_t>(r, c);
        }
    }


    printf("This is first image array's first 5 elements\n\n");
    for (int b = 0; b < 5; b++)
    {
        printf("%d\n",image1[b]);
    }

    printf("This is second image array's first 5 elements\n\n");
    for (int b = 0; b < 5; b++)
    {
        printf("%d\n", image2[b]);
    }

    int * h_a = image1;
    int * h_b = image2;

    int * d_a;
    int * d_b;

    // NOTE(review): the host arrays hold `size` ints, but the allocations and
    // copies below use sizeof(char)*size bytes, which is fewer bytes than
    // `size` ints occupy whenever sizeof(int) > 1. The copy back on the
    // device-to-host side uses the same expression.
    cudaMalloc(&d_a, sizeof(char)*size);
    cudaMalloc(&d_b, sizeof(char)*size);

    cudaMemcpy(d_a, h_a, sizeof(char)*size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, sizeof(char)*size, cudaMemcpyHostToDevice);

    // NOTE(review): one block of 1024 threads. SubArrays processes one
    // element per thread, so only indices 0..1023 are subtracted here,
    // regardless of `size`.
    SubArrays << <1, 1024 >> >(d_a, d_b, size);

    cudaMemcpy(h_a, d_a, sizeof(char)*size, cudaMemcpyDeviceToHost);
    printf("calculating result\n\n");

    // Print every element of the result, one per line.
    for (int check = 0; check < size ; check++)
    {
        printf("%d \n", h_a[check]);
    }

    cudaFree(d_a);
    cudaFree(d_b);

    return 0;
}

When my kernel runs, the first few values (i.e. the differences) are correct, and then it starts showing the old values. What am I doing wrong? I am a beginner, but I think I have chosen the wrong number of blocks and threads. My GPU's compute capability is 3.2.

Why are you only running 1 block of 1024 threads? If the images have more than 1024 pixels, then a large amount of the image would be unprocessed — talonmies, Aug 02 '16 at 14:55
@talonmies How many blocks and threads should I use then? My image size is about 340*507, i.e. roughly 172,000 pixels (nearly 2 lakh). — Muhammad Abu Bakr, Aug 02 '16 at 15:00
Your kernel requires that you run one thread for every input pixel. — talonmies, Aug 02 '16 at 15:06
@talonmies Sir, I have now selected 196 blocks of 1024 threads, and again the answer becomes wrong after several thousand values. Please tell me what I should do; it would be a great favor. — Muhammad Abu Bakr, Aug 02 '16 at 16:01

score 1 · Accepted Answer · edited May 23 '17 at 12:22

There are at least 2 errors in your code.

1. As already pointed out, your kernel assumes that there will be 1 thread per pixel. You must launch enough threads to cover all the pixels in your image. We can fix this by increasing the block count.
2. You are working with an incorrect size for your copy operations. Both your image1 and image2 arrays are specified as int arrays, and your kernel correspondingly accepts int * parameters. You've loaded (apparently) an unsigned char image, but in the process of loading that image, you have converted each pixel from an 8-bit quantity to a 32-bit quantity here:
```
int size = (im1.rows*im1.cols);

printf("size: %d \n\n\n\n", size);

int i = 0;

for (int r = 0; r < im1.rows; r++)
{
    for (int c =0; c< im1.cols; c++,i++)
    {
        image1[i] = im1.at<uint8_t>(r, c);  // this converts 8bit to 32bit
```
In the above code your size variable is correctly computed as a number of elements, but those elements are now 32-bit (int) quantities rather than 8-bit (uint8_t) quantities. Therefore, when you do your copy operations between host and device:
```
cudaMemcpy(d_a, h_a, sizeof(char)*size, cudaMemcpyHostToDevice);
                     ^^^^^^^^^^^
```
the use of sizeof(char) is incorrect. You are now handling int quantities, so you should be using sizeof(int) everywhere (in the cudaMalloc calls as well as the cudaMemcpy calls).

The following worked example has those problems fixed, and seems to work correctly, with the openCV dependency removed:

$ cat t1222.cu
#include <stdint.h>
#include <stdio.h>

using namespace std;
#define count 200000


// In-place element-wise subtraction: a[k] = a[k] - b[k] for every k < size.
// Each thread handles exactly one element, chosen by its flat 1D global
// index, so the launch has to provide at least `size` threads in total.
// Threads whose index is at or beyond `size` return without touching memory.
__global__
void SubArrays(int * a, int * b, int size)
{
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;

    if (gid >= size)
    {
        return;
    }

    a[gid] = a[gid] - b[gid];
}


// Host-side operand buffers, `count` ints each. image1 is also the
// destination that the device result is copied back into.
int image1[count];
int image2[count];

// Self-contained check of the SubArrays kernel with the OpenCV dependency
// removed: fills image1 with 3 and image2 with 1, subtracts them on the
// device, and verifies that every element of the result equals 2.
//
// Returns 0 when all elements match, or -1 after reporting the first
// mismatch. The device buffers are released on both paths.
// Return codes of the CUDA runtime calls are not checked in this example.
int main(int argv, char** argc)
{
    const int size = count;
    // Buffers hold ints, so every allocation/copy is sized in sizeof(int).
    const size_t bytes = sizeof(int) * size;
    const int threadsPerBlock = 1024;
    // Ceiling division: the kernel needs one thread per element, including
    // the tail that does not fill a whole block.
    const int blocks = (size + threadsPerBlock - 1) / threadsPerBlock;

    for (int i = 0; i < size; i++)
    {
        image1[i] = 3;
        image2[i] = 1;
    }

    printf("This is first image array's first 5 elements\n\n");
    for (int b = 0; b < 5; b++)
    {
        printf("%d\n",image1[b]);
    }

    printf("This is second image array's first 5 elements\n\n");
    for (int b = 0; b < 5; b++)
    {
        printf("%d\n", image2[b]);
    }

    int * h_a = image1;
    int * h_b = image2;

    int * d_a;
    int * d_b;

    cudaMalloc(&d_a, bytes);
    cudaMalloc(&d_b, bytes);

    cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice);

    SubArrays<<<blocks, threadsPerBlock>>>(d_a, d_b, size);

    // Blocking copy on the default stream: it runs after the kernel above.
    cudaMemcpy(h_a, d_a, bytes, cudaMemcpyDeviceToHost);
    printf("calculating result\n\n");

    int status = 0;
    for (int check = 0; check < size; check++)
    {
        if (h_a[check] != 2)
        {
            printf("mismatch at %d, was: %d  should be 2\n", check, h_a[check]);
            status = -1;
            break;  // report only the first mismatch
        }
    }
    if (status == 0)
    {
        printf("Success!\n");
    }

    // Reached on the mismatch path too, so the device buffers never leak.
    cudaFree(d_a);
    cudaFree(d_b);

    return status;
}
$ nvcc -o t1222 t1222.cu
$ cuda-memcheck ./t1222
========= CUDA-MEMCHECK
This is first image array's first 5 elements

3
3
3
3
3
This is second image array's first 5 elements

1
1
1
1
1
calculating result

Success!
========= ERROR SUMMARY: 0 errors
$

I always recommend that you use proper CUDA error checking any time you are having trouble with CUDA code (although I did not add it here), and also that you run your code with cuda-memcheck (as I have demonstrated above).

Sir, it really helped me! It was a very thorough explanation! Thank you so much. — Muhammad Abu Bakr, Aug 02 '16 at 17:22

Why are only the first few thousand results of my CUDA kernel correct, while the rest still show the old values?

1 Answer