0

This is my kernel function and it does simple work: & every item with blockIdx.x to filter even and odd:

__global__
void g_compact(const unsigned int* v_array, unsigned int* compact_array, int size)
{
    const int p_index = blockIdx.x * size + threadIdx.x;

    if ((v_array[threadIdx.x] & 1) == blockIdx.x)
    {
        compact_array[p_index]= 1;
    }
    else
    {
        compact_array[p_index]= 0;
    }
}

However, it produce random result every time I run the program, like

1 0 1625730008 32767 1625730024 32767 4197775 0 0 0 4197470 0 0 0 2525809656 32630 1 0 1625729712 32767

What confuse me is that the result is not 0 or 1 since my if and else should cover every situation.

Could someone help me out of this?

Total program:

#include <iostream>

void print_array(const unsigned int* v_array, int size)
{
    for (int i = 0; i < size; ++i)
    {
        std::cout<<v_array[i]<<"  ";
    }
    std::cout<<std::endl;
}

__global__
void g_compact(const unsigned int* v_array, unsigned int* compact_array, int size)
{
    const int p_index = blockIdx.x * size + threadIdx.x;

    if (true)
    {
        compact_array[p_index]= 1;
    }
    else
    {
        compact_array[p_index]= 0;
    }
}

int main(int argc, char const *argv[])
{
    unsigned int *d_in;
    unsigned int *d_out;

    cudaMalloc(&d_in,  sizeof(unsigned int) * 10);
    cudaMalloc(&d_out, sizeof(unsigned int) * 20);

    unsigned int h_array[10] = {
        1, 2, 3, 4,
        5, 6, 7, 8,
        9, 10
    };

    cudaMemcpy(d_in, h_array, sizeof(unsigned int) * 10, cudaMemcpyHostToDevice);

    g_compact<<<2, 10>>>(h_array, d_out, 10);

    unsigned int h_out[20];
    cudaMemcpy(h_out, d_out, sizeof(unsigned int) * 20, cudaMemcpyDeviceToHost);

    print_array(h_out, 20);

    return 0;
}
zbuzch
  • 11
  • 3
  • A good cuda error checking would have helped you giving you clues on what was happening, maybe you could have a look at : http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api for your future CUDA developments – X3liF Dec 22 '16 at 10:14

1 Answers1

1

The problem is not how you write your kernel function but how you call it:

unsigned int h_array[10] = {
    1, 2, 3, 4,
    5, 6, 7, 8,
    9, 10
};

cudaMemcpy(d_in, h_array, sizeof(unsigned int) * 10, cudaMemcpyHostToDevice);

g_compact<<<2, 10>>>(h_array, d_out, 10);

You pass a host pointer (h_array) to a kernel function. How does it suppose to work?

I think you want to change h_array to d_in

g_compact<<<2, 10>>>(d_in, d_out, 10);
Rahn
  • 4,787
  • 4
  • 31
  • 57