1

i'm making a sorting algorithm on CUDA for a bigger project and i decided implementing a Bitonic sorting. The number of elements i'll be sorting will be allways a power of two, in fact will be 512. I need an array which will have the final positions because this method will be used for ordering an array that represents the quality matrix of another solution.

fitness is the array i'll sort, numElements is the number of elements, and orden is initially an empty array with numElements positions which will be filled at the very beginning in this way: orden[i]=i. Actually orden is not relevant for this issue but I kept it.

My problem is that some values aren't sorted properly and until now i've been unable to figure out what problem do I have.

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <ctime>
#include <cuda.h>
#include <curand.h>
#include <curand_kernel.h>
#include <device_functions.h>
#include "float.h"


__global__ void sorting(int * orden, float * fitness, int numElements);

// Populating array with random values for testing purposes
__global__ void populate( curandState * state, float * fitness{

    curandState localState = state[threadIdx.x];
    int a = curand(&localState) % 500;
    fitness[threadIdx.x] = a;
}

//Curand setup for the populate method 
__global__ void setup_cuRand(curandState * state, unsigned long seed)
{
    int id = threadIdx.x;
    curand_init(seed, id, 0, &state[id]);
}

int main()
{
    float * arrayx;
    int numelements = 512;
    int * orden;
    float arrayCPU[512] = { 0 };
    curandState * state;

    cudaDeviceReset();
    cudaSetDevice(0);
    cudaMalloc(&state, numelements * sizeof(curandState));
    cudaMalloc((void **)&arrayx, numelements*sizeof(float));
    cudaMalloc((void **)&orden, numelements*sizeof(int));






    setup_cuRand << <1, numelements >> >(state, unsigned(time(NULL)));

    populate << <1, numelements >> > (state, arrayx);
    cudaMemcpy(&arrayCPU, arrayx, numelements * sizeof(float), cudaMemcpyDeviceToHost);
    for (int i = 0; i < numelements; i++)
        printf("fitness[%i] = %f\n", i, arrayCPU[i]);

    sorting << <1, numelements >> >(orden, arrayx, numelements);
    printf("\n\n");

    cudaMemcpy(&arrayCPU, arrayx, numelements * sizeof(float), cudaMemcpyDeviceToHost);
    for (int i = 0; i < numelements; i++)
        printf("fitness[%i] = %f\n", i, arrayCPU[i]);



    cudaDeviceReset();


    return 0;
}
__device__ bool isValid(float n){
    return !(isnan(n) || isinf(n) || n != n || n <= FLT_MIN || n >= FLT_MAX);

}

__global__ void sorting(int * orden, float * fitness, int numElements){
    int i = 0;
    int j = 0;
    float f = 0.0;
    int aux = 0;

    //initial orden registered (1, 2, 3...)
    orden[threadIdx.x] = threadIdx.x;
    //Logarithm on base 2 of numElements
    for (i = 2; i <= numElements; i = i * 2){
        // descending from i reducing to half each iteration
        for (j = i; j >= 2; j = j / 2){

            if (threadIdx.x % j  < j / 2){
                __syncthreads();
                // ascending or descending consideration using (threadIdx.x % (i*2) < i) 
                if ((threadIdx.x % (i * 2) < i) && (fitness[threadIdx.x] >  fitness[threadIdx.x + j / 2] || !isValid(fitness[threadIdx.x])) ||
                    ((threadIdx.x % (i * 2) >= i) && (fitness[threadIdx.x] <= fitness[threadIdx.x + j / 2] || !isValid(fitness[threadIdx.x + j / 2])))){

                    aux = orden[threadIdx.x];
                    orden[threadIdx.x] = orden[threadIdx.x + j / 2];
                    orden[threadIdx.x + j / 2] = aux;
                    //Se reubican los fitness
                    f = fitness[threadIdx.x];
                    fitness[threadIdx.x] = fitness[threadIdx.x + j / 2];
                    fitness[threadIdx.x + j / 2] = f;
                }
            }
        }
    }
}

For example, an output i got on a random execution:

A random execution

This is a representation of my bitonic sorting:

Bitonic sorting Schema, the arrows point where the worst of the values compared goes to

  • 1
    Your code as posted will not compile. – Robert Crovella Jun 27 '16 at 12:49
  • 1
    I would suggest you check whether each step of the sorting is working as expected by changing the loop end condition here `for (i = 2; i <= numElements; i = i * 2){`. You need only repeat this at most 8 times to fish out what is wrong. – kangshiyin Jun 27 '16 at 12:56

1 Answers1

4

Here are the issues I found:

  1. In your posted code, this does not compile:

    __global__ void populate( curandState * state, float * fitness{
                                                                  ^
                                                       missing close parenthesis
    

    I added a close parenthesis there.

  2. It's not necessary to take the address of the array in these cudaMemcpy statements:

    cudaMemcpy(&arrayCPU, arrayx, numelements * sizeof(float), cudaMemcpyDeviceToHost);
    ....
    cudaMemcpy(&arrayCPU, arrayx, numelements * sizeof(float), cudaMemcpyDeviceToHost);
    

    the array name is already the address of the array, so I removed the ampersands. If you use a dynamically allocated array, such usage would be broken.

  3. Your usage of __syncthreads() here is broken:

    for (j = i; j >= 2; j = j / 2){
    
        if (threadIdx.x % j  < j / 2){
            __syncthreads();
    

    usage of __syncthreads() inside a conditional statement is generally incorrect unless the conditional statement evaluates uniformly across the threadblock. This is covered in the documentation. We can achieve the desired effect with a slight change:

    for (j = i; j >= 2; j = j / 2){
        __syncthreads();
        if (threadIdx.x % j  < j / 2){
    

With the above changes, your code appears to run correctly for me, for most cases. Your usage of FLT_MIN in your validity check is also questionable, if you intend 0 (or any negative values) to be sorted correctly. Speaking generally, FLT_MIN is a number that is very small, close to zero. If you were thinking that this is a large negative number, it is not. As a result, zero is a possible output of your random number generator, and it will not be sorted correctly. I'll leave this one to you to fix, it should be straightforward, but it will depend on what you ultimately want to achieve. (If you only want to sort positive non-zero floating point values, the test may be OK, but in this case your random number generator can return 0.)

Community
  • 1
  • 1
Robert Crovella
  • 143,785
  • 11
  • 213
  • 257
  • Thanks for taking time for people! The parenthesis was forgotten when i modified the code over the web. I didn't know about `FLT_MIN`, i'll use `-FLT_MAX`, and the use of syncthreads could be who caused the problems, i missed that on my implementation. I'll test it and if it works, will give you the answer. Thank you – Mr.PotatusVII Jun 27 '16 at 16:18