Sorting Pixels from opengl using CUDA and Thrust (Windows Port Issues...)

Question

I tried to port this example to WINDOWS with GLFW, since I don't have access to Linux box .. but the only thing I get is the clear color and nothing comes up ..

Did others get this example to work / Did I miss something here?

I do not even get the original image, before the sort either ...

#include <stdio.h> 
#include <stdlib.h> 
#include <string.h> 

#include <thrust/device_vector.h>
#include <thrust/sort.h>

#include <GL/glew.h>

#include <GL/glfw.h>
#include <cuda_gl_interop.h> 


const int WIDTH=800;  
const int HEIGHT=800;
const int DIM = 800;

GLuint  bufferObj;
cudaGraphicsResource *resource;



struct sort_functor
{
    __host__ __device__
        bool operator()(uchar4 left, uchar4 right) const
    {
        return (left.y < right.y);
    }
};



// create a green/black pattern
__global__ void kernel(uchar4 *ptr) {
    // map from threadIdx/BlockIdx to pixel position 
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int offset = x + y * blockDim.x * gridDim.x;

    // now calculate the value at that position 
    float fx = x / (float)DIM - 0.5f;
    float fy = y / (float)DIM - 0.5f;
    unsigned char   green = 128 + 127 * sin(abs(fx * 100) - abs(fy * 100));

    // accessing uchar4 vs unsigned char* 
    ptr[offset].x = 0;
    ptr[offset].y = green;
    ptr[offset].z = 0;
    ptr[offset].w = 255;
}


static void sort_pixels(){
    cudaGraphicsMapResources(1, &resource, NULL);
    uchar4* devPtr;
    size_t  size;

    cudaGraphicsResourceGetMappedPointer((void**)&devPtr, &size, resource);

    thrust::device_ptr<uchar4> tptr = thrust::device_pointer_cast(devPtr);
    thrust::sort(tptr, tptr + (DIM*DIM), sort_functor());
    cudaGraphicsUnmapResources(1, &resource, NULL);

}


void GLFWCALL Keyboard_Callback(int key, int action)
{
    if (key == 32 && action == GLFW_PRESS)
        sort_pixels();

    return;
}



int main ()
{

    if( !glfwInit() )
    {
        fprintf( stderr, "Failed to initialize GLFW\n" );
        return -1;
    }

    glfwOpenWindowHint(GLFW_FSAA_SAMPLES, 4);
    glfwOpenWindowHint(GLFW_OPENGL_VERSION_MAJOR, 3);
    glfwOpenWindowHint(GLFW_OPENGL_VERSION_MINOR, 3);
    glfwOpenWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE);

    if( !glfwOpenWindow( WIDTH, HEIGHT, 0,0,0,0, 32,0, GLFW_WINDOW ) )
    {
        fprintf( stderr, "Failed to open GLFW window. If you have an Intel GPU, they are not 3.3 compatible.\n" );
        glfwTerminate();
        return -1;
    }

    glewExperimental = GL_TRUE; 
    if (glewInit() != GLEW_OK) {
        fprintf(stderr, "Failed to initialize GLEW\n");
        return -1;
    }

    glfwSetWindowTitle( "Sort Test" );
    glfwEnable( GLFW_STICKY_KEYS );

    glEnable(GL_DEPTH_TEST);
    glDepthFunc(GL_LEQUAL);
    glClearColor(0.087, 0.087, 0.087, 1.0);

    glfwSetKeyCallback(Keyboard_Callback);



    // SORT CODE
    glGenBuffers(1, &bufferObj);
    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, bufferObj);
    glBufferData(GL_PIXEL_UNPACK_BUFFER, DIM * DIM * 4, NULL, GL_DYNAMIC_DRAW);

    cudaGraphicsGLRegisterBuffer(&resource, bufferObj, cudaGraphicsMapFlagsNone);

    cudaGraphicsMapResources(1, &resource, NULL);
    uchar4* devPtr;
    size_t  size;
    cudaGraphicsResourceGetMappedPointer((void**)&devPtr, &size, resource);

    dim3    grid(DIM / 16, DIM / 16);
    dim3    threads(16, 16);

    kernel << <grid, threads >> >(devPtr);

    cudaGraphicsUnmapResources(1, &resource, NULL);



    do{ 
        glClear(GL_COLOR_BUFFER_BIT|GL_DEPTH_BUFFER_BIT);
        glEnable (GL_BLEND);
        glBlendFunc (GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA);

        glDrawPixels(DIM, DIM, GL_RGBA, GL_UNSIGNED_BYTE, 0);

        glfwSwapBuffers();


    } // Check if the ESC key was pressed or the window was closed
    while( glfwGetKey( GLFW_KEY_ESC ) != GLFW_PRESS &&
           glfwGetWindowParam( GLFW_OPENED ) );

    // Close OpenGL window and terminate GLFW
    glfwTerminate();

    return 0;

}

Edit:

I added :

static void HandleError(cudaError_t err, int line) {
    if (err != cudaSuccess) {
        printf("%s in %s at line %d\n", cudaGetErrorString(err), "main", line);
        exit(EXIT_FAILURE);
    }
    std::cout << "What happened : " << line << " " << cudaGetErrorString(err)<< std::endl;
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))

with and example call:

HandleError(cudaGraphicsGLRegisterBuffer(&resource, bufferObj, cudaGraphicsMapFlagsNone), 134);

Per the comments, no CUDA errors? so does that mean there is something wrong with the binding of the PBO or draw commands?

I would start by adding [proper cuda error checking](http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api) to all of the CUDA API calls and kernel calls. — Robert Crovella, Jul 30 '14 at 16:05
Followed your advice : no error were returned ... :( So is the binding then? — John Mke, Jul 30 '14 at 16:14

Robert Crovella · Answer 1 · 2014-07-30T22:43:50.633

Here's a version of the program that seems to work for me under windows.

Rather than going through the details of project setup, just drop this code in the simpleGL cuda sample code, in place of the code that is there, and rebuild that project.

windows version:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// OpenGL Graphics includes
#include <GL/glew.h>
#include <GL/freeglut.h>
#include <cuda_gl_interop.h>

#include <thrust/device_ptr.h>
#include <thrust/sort.h>


static void HandleError( cudaError_t err, const char *file,  int line ) {
    if (err != cudaSuccess) {
            printf( "%s in %s at line %d\n", cudaGetErrorString( err ),  file, line );
            exit( EXIT_FAILURE );
    }
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))

#define     DIM    512

GLuint  bufferObj;
cudaGraphicsResource *resource;

struct sort_functor
{
  __host__ __device__
    bool operator()(uchar4 left, uchar4 right) const
    {
      return (left.y < right.y);
    }
};



// create a green/black pattern
__global__ void kernel( uchar4 *ptr ) {
// map from threadIdx/BlockIdx to pixel position
  int x = threadIdx.x + blockIdx.x * blockDim.x;
  int y = threadIdx.y + blockIdx.y * blockDim.y;
  int offset = x + y * blockDim.x * gridDim.x;

// now calculate the value at that position
  float fx = x/(float)DIM - 0.5f;
  float fy = y/(float)DIM - 0.5f;
  unsigned char   green = 128 + 127 * sin( abs(fx*100) - abs(fy*100) );

// accessing uchar4 vs unsigned char*
  ptr[offset].x = 0;
  ptr[offset].y = green;
  ptr[offset].z = 0;
  ptr[offset].w = 255;
}

static void draw_func( void ) {

  glDrawPixels( DIM, DIM, GL_RGBA, GL_UNSIGNED_BYTE, 0 );
  glutSwapBuffers();
}
static void sort_pixels(){
  cudaGraphicsMapResources( 1, &resource, NULL );
  uchar4* devPtr;
  size_t  size;

  cudaGraphicsResourceGetMappedPointer( (void**)&devPtr, &size, resource);

  thrust::device_ptr<uchar4> tptr = thrust::device_pointer_cast(devPtr);
  thrust::sort(tptr, tptr+(DIM*DIM), sort_functor());
  cudaGraphicsUnmapResources( 1, &resource, NULL );
  draw_func();
}

static void key_func( unsigned char key, int x, int y ) {
  switch (key) {
    case 27:
        HANDLE_ERROR( cudaGraphicsUnregisterResource( resource ) );
        glBindBuffer( GL_PIXEL_UNPACK_BUFFER_ARB, 0 );
        glDeleteBuffers( 1, &bufferObj );
        exit(0);
        break;
    case 32:
        sort_pixels();
        break;
    default:
        break;
  }
}

int main(int argc, char *argv[]) {

  cudaGLSetGLDevice( 0 );

  glutInit( &argc, argv );
  glutInitDisplayMode( GLUT_DOUBLE | GLUT_RGBA );
  glutInitWindowSize( DIM, DIM );
  glutCreateWindow( "sort test" );
  glewInit();
  glGenBuffers( 1, &bufferObj );
  glBindBuffer( GL_PIXEL_UNPACK_BUFFER_ARB, bufferObj );
  glBufferData( GL_PIXEL_UNPACK_BUFFER_ARB, DIM * DIM * 4, NULL, GL_DYNAMIC_DRAW_ARB );

  cudaGraphicsGLRegisterBuffer( &resource, bufferObj, cudaGraphicsMapFlagsNone );


  cudaGraphicsMapResources( 1, &resource, NULL );
  uchar4* devPtr;
  size_t  size;

  cudaGraphicsResourceGetMappedPointer( (void**)&devPtr, &size, resource);
  dim3    grid(DIM/16,DIM/16);
  dim3    threads(16,16);
  kernel<<<grid,threads>>>( devPtr );
  cudaGraphicsUnmapResources( 1, &resource, NULL );

// set up GLUT and kick off main loop
  glutKeyboardFunc( key_func );
  glutDisplayFunc( draw_func );
  glutMainLoop();
}

The previous linux version probably has some cruft in it. It wasn't intended to be a paragon of programming, just something to demonstrate the concept, so I picked up a previous example I had laying around. This version is probably a little bit "cleaner".

Also note that the above code works as-is (for me, anyway) on linux if I compile with:

nvcc -arch=sm_20 -o ogltest3 ogltest3.cu -lglut -lGLEW

Here's an updated version, that works on windows or linux, and does not use glDrawPixels:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// OpenGL Graphics includes
#include <GL/glew.h>
#include <GL/freeglut.h>
#include <cuda_gl_interop.h>

#include <thrust/device_ptr.h>
#include <thrust/sort.h>


static void HandleError( cudaError_t err, const char *file,  int line ) {
    if (err != cudaSuccess) {
            printf( "%s in %s at line %d\n", cudaGetErrorString( err ),  file, line );
            exit( EXIT_FAILURE );
    }
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))

#define     DIM    512

GLuint  bufferObj;
GLuint  textureID;
cudaGraphicsResource *resource;

struct sort_functor
{
  __host__ __device__
    bool operator()(uchar4 left, uchar4 right) const
    {
      return (left.y < right.y);
    }
};



// create a green/black pattern
__global__ void kernel( uchar4 *ptr ) {
// map from threadIdx/BlockIdx to pixel position
  int x = threadIdx.x + blockIdx.x * blockDim.x;
  int y = threadIdx.y + blockIdx.y * blockDim.y;
  int offset = x + y * blockDim.x * gridDim.x;

// now calculate the value at that position
  float fx = x/(float)DIM - 0.5f;
  float fy = y/(float)DIM - 0.5f;
  unsigned char   green = 128 + 127 * sin( abs(fx*100) - abs(fy*100) );

// accessing uchar4 vs unsigned char*
  ptr[offset].x = 0;
  ptr[offset].y = green;
  ptr[offset].z = 0;
  ptr[offset].w = 255;
}

static void draw_func( void ) {

//  glDrawPixels( DIM, DIM, GL_RGBA, GL_UNSIGNED_BYTE, 0 );

  glutSwapBuffers();
}
static void sort_pixels(){
  cudaGraphicsMapResources( 1, &resource, NULL );
  uchar4* devPtr;
  size_t  size;

  cudaGraphicsResourceGetMappedPointer( (void**)&devPtr, &size, resource);

  thrust::device_ptr<uchar4> tptr = thrust::device_pointer_cast(devPtr);
  thrust::sort(tptr, tptr+(DIM*DIM), sort_functor());
  cudaGraphicsUnmapResources( 1, &resource, NULL );
  glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, bufferObj);
  glBindTexture(GL_TEXTURE_2D, textureID);
  glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, DIM, DIM, GL_BGRA, GL_UNSIGNED_BYTE, NULL);

  glBegin(GL_QUADS);
  glTexCoord2f( 0, 1.0f);
  glVertex3f(-1.0,1.0f,0);
  glTexCoord2f(0,0);
  glVertex3f(-1.0f,-1.0f,0);
  glTexCoord2f(1.0f,0);
  glVertex3f(1.0f,-1.0f,0);
  glTexCoord2f(1.0f,1.0f);
  glVertex3f(1.0f,1.0f,0);
  glEnd();

  draw_func();
}

static void close_func( void ){
        HANDLE_ERROR( cudaGraphicsUnregisterResource( resource ) );
        glBindBuffer( GL_PIXEL_UNPACK_BUFFER_ARB, 0 );
        glDeleteBuffers( 1, &bufferObj );
        exit(0);
}

static void key_func( unsigned char key, int x, int y ) {
  switch (key) {
    case 27:
        close_func();
        break;
    case 32:
        sort_pixels();
        break;
    default:
        break;
  }
}

int main(int argc, char *argv[]) {

  cudaGLSetGLDevice( 0 );

  glutInit( &argc, argv );
  glutInitDisplayMode( GLUT_DOUBLE | GLUT_RGBA );
  glutInitWindowSize( DIM, DIM );
  glutCreateWindow( "sort test" );
  glewInit();
  glGenBuffers( 1, &bufferObj );
  glBindBuffer( GL_PIXEL_UNPACK_BUFFER_ARB, bufferObj );
  glBufferData( GL_PIXEL_UNPACK_BUFFER_ARB, DIM * DIM * 4, NULL, GL_DYNAMIC_DRAW_ARB );

  cudaGraphicsGLRegisterBuffer( &resource, bufferObj, cudaGraphicsMapFlagsNone );
  glEnable(GL_TEXTURE_2D);
  glGenTextures(1, &textureID);
  glBindTexture(GL_TEXTURE_2D, textureID);
  glTexImage2D( GL_TEXTURE_2D, 0, GL_RGBA8, DIM, DIM, 0, GL_BGRA, GL_UNSIGNED_BYTE, NULL);
  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);


  cudaGraphicsMapResources( 1, &resource, NULL );
  uchar4* devPtr;
  size_t  size;

  cudaGraphicsResourceGetMappedPointer( (void**)&devPtr, &size, resource);
  dim3    grid(DIM/16,DIM/16);
  dim3    threads(16,16);
  kernel<<<grid,threads>>>( devPtr );
  cudaGraphicsUnmapResources( 1, &resource, NULL );

  glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, bufferObj);
  glBindTexture(GL_TEXTURE_2D, textureID);
  glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, DIM, DIM, GL_BGRA, GL_UNSIGNED_BYTE, NULL);

  glBegin(GL_QUADS);
  glTexCoord2f( 0, 1.0f);
  glVertex3f(-1.0,1.0f,0);
  glTexCoord2f(0,0);
  glVertex3f(-1.0f,-1.0f,0);
  glTexCoord2f(1.0f,0);
  glVertex3f(1.0f,-1.0f,0);
  glTexCoord2f(1.0f,1.0f);
  glVertex3f(1.0f,1.0f,0);
  glEnd();
  draw_func();

// set up GLUT and kick off main loop
  glutCloseFunc( close_func );
  glutKeyboardFunc( key_func );
  glutDisplayFunc( draw_func );
  glutMainLoop();
}

Is glDrawPixels depreciated? I just read so is that giving me an issue on windows .. such I have to map it to a texture? — John Mke, Jul 30 '14 at 17:31
It may be either deprecated or removed depending on OGL version. The code above works for me, if you follow the recommendation I made to drop it into the simpleGL sample project and rebuild that project. Anyway, converting from glDrawPixels to some other rendering approach is [an OpenGL topic](http://stackoverflow.com/questions/11164276/how-to-render-draw-buffer-object-to-framebuffer-without-gldrawpixels), right? I don't think it depends on CUDA in any way. — Robert Crovella, Jul 30 '14 at 17:37
I've updated my answer to add an additional version of the code that does not use `glDrawPixels`, using a texture instead. — Robert Crovella, Jul 30 '14 at 22:44

Sorting Pixels from opengl using CUDA and Thrust (Windows Port Issues...)

1 Answers1

Linked

Related