
I'm probably doing something incredibly stupid, but I can't seem to make this reduction work (there is probably a library that does this already, but this is for self-learning, so please bear with me). I'm trying to find the median of an array of integers using the median-of-medians approach, which I've coded below:

__global__ void gpuMedOdd(int *entries, int *med) {
        extern __shared__ int sdata[];

        int tid = threadIdx.x;
        int i = blockIdx.x * blockDim.x + threadIdx.x;

        sdata[tid] = entries[i];
        __syncthreads();

        for(int s = blockDim.x / 3; s > 0; s /= 3) {
                if(tid < s) {
                        int list[3];
                        list[0] = sdata[tid], list[1] = sdata[tid + s], list[2] = sdata[tid + 2 * s];
                        if(list[1] < list[0])
                                swapGpu(list[1], list[0]);
                        if(list[2] < list[0])
                                swapGpu(list[2], list[0]);
                        if(list[2] < list[1])
                                swapGpu(list[2], list[1]);

                        sdata[tid] = list[1];
                }

                __syncthreads();
        }

        *med = sdata[0];
}

I invoke this kernel function as:

gpuMedOdd<<<9, numEntries / 9>>>(d_entries, d_med);

I then copy the value in d_med over into med and print out that value. Unfortunately, this value is always 0, regardless of input. What am I doing wrong?

Edit: I forgot to mention that swapGpu(a, b) is defined as follows:

__device__ inline void swapGpu(int a, int b) {
        int dum = a;
        a = b;
        b = dum;
}

Edit2: As suggested below, here is the entirety of the code.

#include <iostream>
#include <fstream>
#include <cstdlib>

#define checkCudaErrors(err) __checkCudaErrors(err, __FILE__, __LINE__)
#define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__)

inline void __checkCudaErrors(cudaError err, const char *file, const int line) {
        if(cudaSuccess != err) {
                std::cout << file << "(" << line << ") : CUDA Runtime API error " << (int) err << ": " << cudaGetErrorString(err) << std::endl;
                exit(3);
        }
}

inline void __getLastCudaError(const char *errorMsg, const char *file, const int line) {
        cudaError_t err = cudaGetLastError();
        if(cudaSuccess != err) {
                std::cout << file << "(" << line << ") : getLastCudaError() CUDA error : " << errorMsg << " : (" << (int) err << ") " << cudaGetErrorString(err) << std::endl;
                exit(3);
        }
}

int cpuMin(int *entries, int numEntries) {
        int minVal = entries[0];

        for(int i = 1; i < numEntries; i++)
                if(entries[i] < minVal)
                        minVal = entries[i];

        return minVal;
}

int cpuMax(int *entries, int numEntries) {
        int maxVal = entries[0];

        for(int i = 1; i < numEntries; i++)
                if(entries[i] > maxVal)
                        maxVal = entries[i];

        return maxVal;
}

inline void swap(int a, int b) {
        int dum = a;
        a = b;
        b = dum;
}

__device__ inline void swapGpu(int a, int b) {
        int dum = a;
        a = b;
        b = dum;
}

__global__ void gpuMedOdd(int *entries, int *med, int numEntries) {
        extern __shared__ int sdata[];

        int tid = threadIdx.x;
        int i = blockIdx.x * (blockDim.x * 3) + threadIdx.x;

        if(i + 2 * blockDim.x < numEntries) {
                int list[3];
                list[0] = entries[i], list[1] = entries[i + blockDim.x], list[2] = entries[i + 2 * blockDim.x];
                if(list[1] < list[0])
                        swapGpu(list[1], list[0]);
                if(list[2] < list[0])
                        swapGpu(list[2], list[0]);
                if(list[2] < list[1])
                        swapGpu(list[2], list[1]);

                sdata[tid] = list[1];
        }

        __syncthreads();

        for(int s = blockDim.x / 3; s > 0; s /= 3) {
                if(tid < s && tid + 2 * s < blockDim.x) {
                        int list[3];
                        list[0] = sdata[tid], list[1] = sdata[tid + s], list[2] = sdata[tid + 2 * s];
                        if(list[1] < list[0])
                                swapGpu(list[1], list[0]);
                        if(list[2] < list[0])
                                swapGpu(list[2], list[0]);
                        if(list[2] < list[1])
                                swapGpu(list[2], list[1]);

                        sdata[tid] = list[1];
                }

                __syncthreads();
        }

        *med = sdata[0];
}

__global__ void gpuMin(int *entries, int *min, int numEntries) {
        extern __shared__ int sdata[];

        int tid = threadIdx.x;
        int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;

        if(i + blockDim.x < numEntries)
                sdata[tid] = (entries[i] < entries[i + blockDim.x]) ? entries[i] : entries[i + blockDim.x];
        __syncthreads();

        for(int s = blockDim.x / 2; s > 0; s >>= 1) {
                if(tid < s)
                        sdata[tid] = (sdata[tid] < sdata[tid + s]) ? sdata[tid] : sdata[tid + s];

                __syncthreads();
        }

        *min = sdata[0];
}

__global__ void gpuMax(int *entries, int *max, int numEntries) {
        extern __shared__ int sdata[];

        int tid = threadIdx.x;
        int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;

        if(i + blockDim.x < numEntries)
                sdata[tid] = (entries[i] > entries[i + blockDim.x]) ? entries[i] : entries[i + blockDim.x];
        __syncthreads();

        for(int s = blockDim.x / 2; s > 0; s >>= 1) {
                if(tid < s)
                        sdata[tid] = (sdata[tid] > sdata[tid + s]) ? sdata[tid] : sdata[tid + s];

                __syncthreads();
        }

        *max = sdata[0];
}

int partition(int *entries, int left, int right, int pivotIdx) {
        int i, storeIdx = left, pivot = entries[pivotIdx];

        swap(entries[pivotIdx], entries[right]);

        for(i = left; i < right; i++)
                if(entries[i] < pivot) {
                        swap(entries[i], entries[storeIdx]);
                        storeIdx++;
                }

        return storeIdx;
}

int cpuSelect(int *entries, int left, int right, int k) {
        if(left == right)
                return entries[left];

        int pivotIdx = ((left + right) >> 2) + 1, pivotNewIdx, pivotDist;

        pivotNewIdx = partition(entries, left, right, pivotIdx);

        pivotDist = pivotNewIdx - left + 1;

        if(pivotDist == k)
                return entries[pivotNewIdx];

        else if(k < pivotDist)
                return cpuSelect(entries, left, pivotNewIdx - 1, k);

        else
                return cpuSelect(entries, pivotNewIdx + 1, right, k - pivotDist);
}

int main(int argc, char *argv[]) {
        if(argc != 3) {
                std::cout << "ERROR: Incorrect number of input arguments" << std::endl;
                std::cout << "Proper usage: " << argv[0] << " fileName numEntries" << std::endl;
                exit(1);
        }

        std::ifstream inp(argv[1]);

        if(!inp.is_open()) {
                std::cout << "ERROR: File I/O error" << std::endl;
                std::cout << "Could not find file " << argv[1] << std::endl;
                exit(2);
        }

        int numEntries = atoi(argv[2]), i = 0;

        int *entries = new int[numEntries];

        while(inp >> entries[i] && i < numEntries)
                i++;

        if(i < numEntries) {
                std::cout << "ERROR: File I/O error" << std::endl;
                std::cout << "Command-line input suggested " << numEntries << " entries, but only found " << i << " entries" << std::endl;
                exit(2);
        }

        if(inp >> i) {
                std::cout << "ERROR: File I/O error" << std::endl;
                std::cout << "Command-line input suggested " << numEntries << " entries, but file contains more entries" << std::endl;
                exit(2);
        }

        int min, max;
        int *d_entries, *d_min, *d_max;

        checkCudaErrors(cudaMalloc(&d_entries, sizeof(int) * numEntries));
        checkCudaErrors(cudaMalloc(&d_min, sizeof(int)));
        checkCudaErrors(cudaMalloc(&d_max, sizeof(int)));

        checkCudaErrors(cudaMemcpy(d_entries, entries, sizeof(int) * numEntries, cudaMemcpyHostToDevice));

        gpuMin<<<16, numEntries / 16, numEntries / 16 * sizeof(int)>>>(d_entries, d_min, numEntries);
        getLastCudaError("kernel launch failure");
        gpuMax<<<16, numEntries / 16, numEntries / 16 * sizeof(int)>>>(d_entries, d_max, numEntries);
        getLastCudaError("kernel launch failure");

        checkCudaErrors(cudaMemcpy(&min, d_min, sizeof(int), cudaMemcpyDeviceToHost));
        checkCudaErrors(cudaMemcpy(&max, d_max, sizeof(int), cudaMemcpyDeviceToHost));

        std::cout << "The minimum value is: " << min << std::endl;
        std::cout << "The maximum value is: " << max << std::endl;

        if(numEntries % 2) {
                int med, *d_med;
                checkCudaErrors(cudaMalloc(&d_med, sizeof(int)));

                gpuMedOdd<<<16, numEntries / 16, 16 * sizeof(int)>>>(d_entries, d_med, numEntries);
                getLastCudaError("kernel launch failure");

                checkCudaErrors(cudaMemcpy(&med, d_med, sizeof(int), cudaMemcpyDeviceToHost));

                std::cout << "The median value is: " << med << std::endl;
        }
        else {
                int *d_med;
                cudaMalloc(&d_med, sizeof(int));
                gpuMedOdd<<<16, numEntries / 16>>>(d_entries, d_med, numEntries);
        }

        min = cpuMin(entries, numEntries);

        max = cpuMax(entries, numEntries);

        if(numEntries % 2) {
                int median = cpuSelect(entries, 0, numEntries - 1, (numEntries - 1) / 2 + 1);
                std::cout << "The median value is: " << median << std::endl;
        }

        else {
                int med2 = cpuSelect(entries, 0, numEntries - 1, numEntries / 2);
                int med1 = cpuSelect(entries, 0, numEntries - 1, numEntries / 2 + 1);

                float median = 0.5 * (med1 + med2);
                std::cout << "The median value is: " << median << std::endl;
        }

        std::cout << "The minimum value is: " << min << std::endl;
        std::cout << "The maximum value is: " << max << std::endl;

        exit(0);
}
wolfPack88
  • are you doing [error checking](http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api) on all cuda calls and kernel calls? – Robert Crovella Mar 01 '13 at 23:57
  • To be fair, I am not doing error checking on CUDA calls, but as I mentioned in a comment below, I use the same data in a different kernel invocation (one that finds the minimum of the array) and get the correct result. I'm assuming that means the data has been copied over to the device properly. The only problem would then be the kernel itself, right? – wolfPack88 Mar 02 '13 at 00:00
  • I built a simple program around your code, including the changes you discussed to kernel invocation below to account for shared memory. I didn't change any of the code you had written, only added stuff around it to make a program. The program I wrote worked correctly for me (I get a non-zero result). Thus I think there is a problem outside of the code you have posted, and my guess is if you do rigorous error checking, you will discover it. For example, maybe you messed up a parameter to cudaMemcpy. – Robert Crovella Mar 02 '13 at 00:13
  • You are correct. Unfortunately, I still don't know the cause of the error. The line is: cudaMemcpy(&med, d_med, sizeof(int), cudaMemcpyDeviceToHost); (where I have declared int med; above). This does help narrow it down however. – wolfPack88 Mar 02 '13 at 01:03
  • What is the error string that is returned when you parse the error code from the cudaMemcpy call? You would be doing this call after the kernel call. It may be a residual error from the kernel call. What is your value for numEntries? Rather than playing 20 questions, let me give some advice. 1.) This program can't be that long. Post the entire code you are using, and make it simple and compilable. 2.) Rather than posting code lines or segments in these comments, *edit your original question with them*. This makes it much easier for others to read, and more coherent. – Robert Crovella Mar 02 '13 at 01:17
  • Done. Sorry about that, and thanks for all the help. I've got both CUDA and CPU versions of the code in this file, though, so it is a bit longer than you might have expected. Once again, thanks for all the help. Edit to add: Input files are generated randomly, and the results are the same regardless. – wolfPack88 Mar 02 '13 at 01:29
  • Just a couple of comments, in addition to @RobertCrovella's answer, you might already be aware of them: if your numEntries is even, your code in main() is still wrong, and you never free your CUDA-allocated memory, so you have memory leaks. – alrikai Mar 02 '13 at 02:31

2 Answers


One thing that jumps out is that your shared memory size isn't set; that is, you declare your shared memory to be

extern __shared__ int sdata[];

but when you invoke your kernel your launch parameters are

gpuMedOdd<<<9, numEntries / 9>>>(...)

If you declare your __shared__ memory as extern, then the kernel expects the number of bytes of shared memory as the third kernel launch parameter. You should instead have

gpuMedOdd<<<9, numEntries / 9, smem_in_bytes>>>(...)

where smem_in_bytes is the size of the kernel's dynamic shared memory, in bytes. If you don't specify a size, it defaults to 0, so in your current code your __shared__ memory array sdata will be 0 bytes long.

EDIT: here's the link to the relevant part of the CUDA Programming Guide: http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#execution-configuration
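
To make the mechanics concrete, here is a minimal sketch of the pattern (sharedDemo, numBlocks, d_in, and d_out are made-up names for illustration, not from the question's code). Each thread stages one int, so the launch reserves one int of shared memory per thread:

__global__ void sharedDemo(const int *in, int *out) {
        extern __shared__ int sdata[];  // sized by the 3rd launch parameter

        int tid = threadIdx.x;
        sdata[tid] = in[blockIdx.x * blockDim.x + tid];
        __syncthreads();

        if(tid == 0)
                out[blockIdx.x] = sdata[blockDim.x - 1];  // read back a value staged by another thread
}

// Host side: one int of shared memory per thread in the block.
int threadsPerBlock = 128;
sharedDemo<<<numBlocks, threadsPerBlock, threadsPerBlock * sizeof(int)>>>(d_in, d_out);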

alrikai
  • Thank you for the response. I've changed the kernel invocation to gpuMedOdd<<<9, numEntries / 9, numEntries * sizeof(int)>>>, but still get the same result (i.e. 0) – wolfPack88 Mar 01 '13 at 23:30
  • @wolfPack88 the shared memory is per-block, so if you have 1 thread processing 1 element you'd need to have (numEntries / 9)*sizeof(int) – alrikai Mar 01 '13 at 23:33
  • Sorry, typo: I meant to say that I changed the kernel invocation to gpuMedOdd<<<9, numEntries / 9, numEntries / 3 * sizeof(int)>>> and got the same result (each thread processes 3 elements, I think) – wolfPack88 Mar 01 '13 at 23:35
  • @wolfPack88 Ah but you have 1 thread loading 1 element into shared memory, so you'll have numEntries / 9 threads loading that many elements into shared memory. One thing I might recommend (just for debugging) is to launch with a small number of threads with input data that's easily recognizable and print out your shared memory after loading via sdata[tid] = entries[i]; __syncthreads(); to ensure you're getting all of your data. – alrikai Mar 01 '13 at 23:41
  • Apparently, sdata contains nothing but zeros. This is rather interesting, because I use another kernel invocation directly below this one (with the same d_entries) to find the minimum, and that one outputs the right answer... – wolfPack88 Mar 01 '13 at 23:51

A problem I see in your code is that you seem to have your launch parameters reversed:

gpuMedOdd<<<16, numEntries / 16, 16 * sizeof(int)>>>(d_entries, d_med, numEntries);

I think you intended:

gpuMedOdd<<<numEntries / 16, 16, 16 * sizeof(int)>>>(d_entries, d_med, numEntries);

The first launch parameter is blocks per grid; the second is threads per block. Here I'm assuming you wanted to launch 16 threads per block. If your intent was instead to launch a fixed number of blocks (16) and let the threads per block vary with the input size, that is not typical of good CUDA coding, and it will blow up once the input gets large enough to exceed the maximum threads per block. Also, since your shared memory allocation is fixed (64 bytes), I assume you intended a fixed number of threads per block; a minimal sketch of the usual pattern follows.
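
Something like this (numEntries, d_entries, and d_med are from your posted code; the helper variables are names I made up):

int threadsPerBlock = 16;
// Round up so the grid covers every entry even when numEntries isn't a multiple of 16.
int blocksPerGrid = (numEntries + threadsPerBlock - 1) / threadsPerBlock;

gpuMedOdd<<<blocksPerGrid, threadsPerBlock, threadsPerBlock * sizeof(int)>>>(d_entries, d_med, numEntries);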

Another suggestion: rather than just reporting "CUDA Runtime Error", you should parse the error code that is returned. Take a look at the error-checking link I already mentioned in the comments.
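
For example, here is a minimal sketch of the standard runtime-API pattern (blocksPerGrid, threadsPerBlock, and smemBytes are placeholder names, not from your code):

gpuMedOdd<<<blocksPerGrid, threadsPerBlock, smemBytes>>>(d_entries, d_med, numEntries);

cudaError_t err = cudaGetLastError();  // catches launch-configuration errors
if(err != cudaSuccess)
        std::cout << "launch failed: " << cudaGetErrorString(err) << std::endl;

err = cudaDeviceSynchronize();  // catches errors raised while the kernel ran
if(err != cudaSuccess)
        std::cout << "execution failed: " << cudaGetErrorString(err) << std::endl;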

Robert Crovella