Using Thrust's reduce operator with Pixel uchar4 data error

Question

I have been having trouble converting this example from sort to reduce.

I keep getting

no suitable conversion function from "uchar4" to "OutputType" exists

When I try to compile and run this modified example:

thrust::reduce(tptr, tptr+(DIM*DIM), int(0), reduce_functor());

Is the crux of my issue the modified functor? I was trying to avoid adding chars and instead return the summed int value of the pixels, so that I can later compute the average color of the image.

#include <stdio.h> 
#include <stdlib.h> 
#include <string.h> 
#include <GL/gl.h> 
#include <GL/glut.h> 
#include <cuda_gl_interop.h> 
#include <GL/glext.h> 
#include <GL/glx.h> 
#include <thrust/device_ptr.h>
#include <thrust/reduce.h>
#define GET_PROC_ADDRESS( str ) glXGetProcAddress( (const GLubyte *)str ) 

// Report a failed CUDA runtime call and terminate the program.
//   err  - status code returned by the CUDA call being checked
//   file - source file of the call site (HANDLE_ERROR passes __FILE__)
//   line - source line of the call site (HANDLE_ERROR passes __LINE__)
// Returns normally only when err is cudaSuccess.
static void HandleError( cudaError_t err, const char *file, int line ) {
    // Success: nothing to report.
    if (err == cudaSuccess)
        return;

    const char *message = cudaGetErrorString( err );
    printf( "%s in %s at line %d\n", message, file, line );
    exit( EXIT_FAILURE );
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ )) 



// OpenGL buffer-object entry points. They start out NULL and are assigned in
// main() through GET_PROC_ADDRESS before their first use.
PFNGLBINDBUFFERARBPROC    glBindBuffer     = NULL; 
PFNGLDELETEBUFFERSARBPROC glDeleteBuffers  = NULL; 
PFNGLGENBUFFERSARBPROC    glGenBuffers     = NULL; 
PFNGLBUFFERDATAARBPROC    glBufferData     = NULL; 

// Width and height, in pixels, of the window and of the image buffer.
#define     DIM    512 

// OpenGL buffer object name; generated, bound and sized in main().
GLuint  bufferObj; 
// CUDA graphics resource handle; main() registers bufferObj into it.
cudaGraphicsResource *resource; 

// Binary functor passed to thrust::reduce in sort_pixels(): adds the x, y and
// z channels of two pixels and returns the total as an int. The w channel is
// not read. Note that the operands are uchar4 while the result is int.
struct reduce_functor
{
  __host__ __device__
    int operator()(uchar4 left, uchar4 right) const
    {
      // unsigned char operands are promoted to int before each addition.
      const int sum_x = left.x + right.x;
      const int sum_y = left.y + right.y;
      const int sum_z = left.z + right.z;
      return sum_x + sum_y + sum_z;
    }
};



// create a green/black pattern
// create a green/black pattern
//
// Launch layout: a 2D grid of 2D blocks with one thread per pixel, covering at
// least DIM x DIM threads (main() launches DIM/16 x DIM/16 blocks of 16 x 16).
// ptr must address DIM*DIM uchar4 elements of device memory.
__global__ void kernel( uchar4 *ptr ) {
  // map from threadIdx/BlockIdx to pixel position
  const int x = threadIdx.x + blockIdx.x * blockDim.x;
  const int y = threadIdx.y + blockIdx.y * blockDim.y;

  // Threads that fall outside the image must not write: this keeps the kernel
  // in bounds for launch configurations that overshoot DIM.
  if (x >= DIM || y >= DIM)
    return;

  // Row-major index into the DIM-wide image. This equals the launch-derived
  // pitch (blockDim.x * gridDim.x) for the configuration used by main().
  const int offset = x + y * DIM;

  // now calculate the value at that position
  const float fx = x/(float)DIM - 0.5f;
  const float fy = y/(float)DIM - 0.5f;
  // Explicit single-precision functions and literals: the result no longer
  // depends on which abs/sin overload the compiler selects.
  const unsigned char green =
      (unsigned char)(128.0f + 127.0f * sinf( fabsf(fx*100.0f) - fabsf(fy*100.0f) ));

  // accessing uchar4 vs unsigned char*
  ptr[offset].x = 0;
  ptr[offset].y = green;
  ptr[offset].z = 0;
  ptr[offset].w = 255;
}

// GLUT display callback: draws a DIM x DIM RGBA image of unsigned bytes and
// swaps the front and back buffers.
static void draw_func( void ) {
  const GLsizei side = DIM;

  // The final argument is 0; main() leaves bufferObj bound to
  // GL_PIXEL_UNPACK_BUFFER_ARB, so the pixels are expected to come from that
  // buffer rather than from client memory.
  glDrawPixels( side, side, GL_RGBA, GL_UNSIGNED_BYTE, 0 );
  glutSwapBuffers();
}
// Map the shared pixel buffer into CUDA, run the Thrust reduction over its
// DIM*DIM pixels, unmap it again and redraw.
// Every CUDA runtime call is checked with HANDLE_ERROR, matching key_func().
static void sort_pixels(){
  uchar4* devPtr;
  size_t  size;

  HANDLE_ERROR( cudaGraphicsMapResources( 1, &resource, NULL ) );
  HANDLE_ERROR( cudaGraphicsResourceGetMappedPointer( (void**)&devPtr, &size, resource) );

  thrust::device_ptr<uchar4> tptr = thrust::device_pointer_cast(devPtr);
  // NOTE: this is the call reported to fail to compile with
  //   no suitable conversion function from "uchar4" to "OutputType" exists
  // Its return value is also discarded, so the reduction result is lost.
  thrust::reduce(tptr, tptr+(DIM*DIM), int(0), reduce_functor());

  // The buffer must be unmapped before OpenGL draws from it.
  HANDLE_ERROR( cudaGraphicsUnmapResources( 1, &resource, NULL ) );
  draw_func();
}

// GLUT keyboard callback.
//   key  - character code of the pressed key
//   x, y - pointer position supplied by GLUT (not used here)
// Escape (27) releases the CUDA/GL resources and exits; space (32) runs
// sort_pixels(); every other key is ignored.
static void key_func( unsigned char key, int x, int y ) {
  const unsigned char KEY_ESCAPE = 27;
  const unsigned char KEY_SPACE  = 32;

  if (key == KEY_ESCAPE) {
    // Unregister from CUDA first, then unbind and delete the GL buffer.
    HANDLE_ERROR( cudaGraphicsUnregisterResource( resource ) );
    glBindBuffer( GL_PIXEL_UNPACK_BUFFER_ARB, 0 );
    glDeleteBuffers( 1, &bufferObj );
    exit(0);
  } else if (key == KEY_SPACE) {
    sort_pixels();
  }
}



// Program entry point: creates the GLUT window, allocates a DIM x DIM RGBA
// pixel buffer object, shares it with CUDA, fills it once with kernel(), and
// enters the GLUT main loop.
// Every CUDA runtime call is checked with HANDLE_ERROR, so a failure is
// reported and terminates the program instead of being ignored.
int main(int argc, char *argv[]) {

  // NOTE(review): cudaGLSetGLDevice is marked deprecated in recent CUDA
  // toolkits -- confirm whether it is still needed for the targeted version.
  HANDLE_ERROR( cudaGLSetGLDevice( 0 ) );

  glutInit( &argc, argv );
  glutInitDisplayMode( GLUT_DOUBLE | GLUT_RGBA );
  glutInitWindowSize( DIM, DIM );
  glutCreateWindow( "sort test" );

  glBindBuffer    = (PFNGLBINDBUFFERARBPROC)GET_PROC_ADDRESS("glBindBuffer");
  glDeleteBuffers = (PFNGLDELETEBUFFERSARBPROC)GET_PROC_ADDRESS("glDeleteBuffers");
  glGenBuffers    = (PFNGLGENBUFFERSARBPROC)GET_PROC_ADDRESS("glGenBuffers");
  glBufferData    = (PFNGLBUFFERDATAARBPROC)GET_PROC_ADDRESS("glBufferData");

  // Calling through a NULL entry point would crash; fail with a message instead.
  if (!glBindBuffer || !glDeleteBuffers || !glGenBuffers || !glBufferData) {
    printf( "required OpenGL buffer object functions are not available\n" );
    exit( EXIT_FAILURE );
  }

  // One uchar4 (4 bytes) per pixel.
  glGenBuffers( 1, &bufferObj );
  glBindBuffer( GL_PIXEL_UNPACK_BUFFER_ARB, bufferObj );
  glBufferData( GL_PIXEL_UNPACK_BUFFER_ARB, DIM * DIM * 4, NULL, GL_DYNAMIC_DRAW_ARB );

  HANDLE_ERROR( cudaGraphicsGLRegisterBuffer( &resource, bufferObj, cudaGraphicsMapFlagsNone ) );

  uchar4* devPtr;
  size_t  size;
  HANDLE_ERROR( cudaGraphicsMapResources( 1, &resource, NULL ) );
  HANDLE_ERROR( cudaGraphicsResourceGetMappedPointer( (void**)&devPtr, &size, resource) );

  dim3    grid(DIM/16,DIM/16);
  dim3    threads(16,16);
  kernel<<<grid,threads>>>( devPtr );
  // A launch does not return a status; query it explicitly.
  HANDLE_ERROR( cudaGetLastError() );
  HANDLE_ERROR( cudaGraphicsUnmapResources( 1, &resource, NULL ) );

  // set up GLUT and kick off main loop
  glutKeyboardFunc( key_func );
  glutDisplayFunc( draw_func );
  glutMainLoop();
  return 0;
}

and compile like this

nvcc -arch=sm_20 -o ogltest ogltest.cu -lglut

Robert Crovella · Answer 1 · 2014-07-28T21:13:24.793

You should assign the result of your reduction to a variable, or it will be lost.

You should also be sure to include the proper thrust headers for the functions you are using.

Reviewing the documentation for thrust::reduce, we see that the reduction output type takes its type from the types specified for the input iterators (i.e. the input types must be convertible to the output type). Since your functor is mixing uchar4 and int types, thrust doesn't know how to convert.

There are probably a number of ways to do what you want. It seems evident that you are just wanting to sum every pixel component (R,G,B) of every pixel together.

Since we want all types to match during the reduction (input and output), one approach would be to use transform_reduce and convert the input types to int quantities, before doing the sum reduction (then yielding the desired int result).

This code shows the modifications, and compiles cleanly for me:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <GL/gl.h>
#include <GL/glut.h>
#include <cuda_gl_interop.h>
#include <GL/glext.h>
#include <GL/glx.h>
#include <thrust/device_ptr.h>
#include <thrust/transform_reduce.h>
#include <thrust/functional.h>

#define GET_PROC_ADDRESS( str ) glXGetProcAddress( (const GLubyte *)str )

// Report a failed CUDA runtime call and terminate the program.
//   err  - status code returned by the CUDA call being checked
//   file - source file of the call site (HANDLE_ERROR passes __FILE__)
//   line - source line of the call site (HANDLE_ERROR passes __LINE__)
// Returns normally only when err is cudaSuccess.
static void HandleError( cudaError_t err, const char *file, int line ) {
    // Success: nothing to report.
    if (err == cudaSuccess)
        return;

    const char *message = cudaGetErrorString( err );
    printf( "%s in %s at line %d\n", message, file, line );
    exit( EXIT_FAILURE );
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))



// OpenGL buffer-object entry points. They start out NULL and are assigned in
// main() through GET_PROC_ADDRESS before their first use.
PFNGLBINDBUFFERARBPROC    glBindBuffer     = NULL;
PFNGLDELETEBUFFERSARBPROC glDeleteBuffers  = NULL;
PFNGLGENBUFFERSARBPROC    glGenBuffers     = NULL;
PFNGLBUFFERDATAARBPROC    glBufferData     = NULL;

// Width and height, in pixels, of the window and of the image buffer.
#define     DIM    512

// OpenGL buffer object name; generated, bound and sized in main().
GLuint  bufferObj;
// CUDA graphics resource handle; main() registers bufferObj into it.
cudaGraphicsResource *resource;


// Unary functor for thrust::transform_reduce: maps one pixel to the int sum
// of its x, y and z channels. The w channel is not read.
struct transform_functor
{
  __host__ __device__
  int operator()(uchar4 data) const
  {
    // Accumulate in int so the channel values are widened before adding.
    int total = data.x;
    total += data.y;
    total += data.z;
    return total;
  }
};

// create a green/black pattern
// create a green/black pattern
//
// Launch layout: a 2D grid of 2D blocks with one thread per pixel, covering at
// least DIM x DIM threads (main() launches DIM/16 x DIM/16 blocks of 16 x 16).
// ptr must address DIM*DIM uchar4 elements of device memory.
__global__ void kernel( uchar4 *ptr ) {
  // map from threadIdx/BlockIdx to pixel position
  const int x = threadIdx.x + blockIdx.x * blockDim.x;
  const int y = threadIdx.y + blockIdx.y * blockDim.y;

  // Threads that fall outside the image must not write: this keeps the kernel
  // in bounds for launch configurations that overshoot DIM.
  if (x >= DIM || y >= DIM)
    return;

  // Row-major index into the DIM-wide image. This equals the launch-derived
  // pitch (blockDim.x * gridDim.x) for the configuration used by main().
  const int offset = x + y * DIM;

  // now calculate the value at that position
  const float fx = x/(float)DIM - 0.5f;
  const float fy = y/(float)DIM - 0.5f;
  // Explicit single-precision functions and literals: the result no longer
  // depends on which abs/sin overload the compiler selects.
  const unsigned char green =
      (unsigned char)(128.0f + 127.0f * sinf( fabsf(fx*100.0f) - fabsf(fy*100.0f) ));

  // accessing uchar4 vs unsigned char*
  ptr[offset].x = 0;
  ptr[offset].y = green;
  ptr[offset].z = 0;
  ptr[offset].w = 255;
}

// GLUT display callback: draws a DIM x DIM RGBA image of unsigned bytes and
// swaps the front and back buffers.
static void draw_func( void ) {
  const GLsizei side = DIM;

  // The final argument is 0; main() leaves bufferObj bound to
  // GL_PIXEL_UNPACK_BUFFER_ARB, so the pixels are expected to come from that
  // buffer rather than from client memory.
  glDrawPixels( side, side, GL_RGBA, GL_UNSIGNED_BYTE, 0 );
  glutSwapBuffers();
}
// Map the shared pixel buffer into CUDA, sum the x, y and z channels of all
// DIM*DIM pixels with thrust::transform_reduce, print the sum, unmap the
// buffer and redraw.
// Every CUDA runtime call is checked with HANDLE_ERROR, matching key_func().
static void sort_pixels(){
  uchar4* devPtr;
  size_t  size;

  HANDLE_ERROR( cudaGraphicsMapResources( 1, &resource, NULL ) );
  HANDLE_ERROR( cudaGraphicsResourceGetMappedPointer( (void**)&devPtr, &size, resource) );

  thrust::device_ptr<uchar4> tptr = thrust::device_pointer_cast(devPtr);
  // Each uchar4 is first converted to an int by transform_functor, so the
  // reduction itself only ever adds ints.
  int pix_sum = thrust::transform_reduce(tptr, tptr+(DIM*DIM), transform_functor(), int(0), thrust::plus<int>());
  printf("sum = %d\n", pix_sum);

  // The buffer must be unmapped before OpenGL draws from it.
  HANDLE_ERROR( cudaGraphicsUnmapResources( 1, &resource, NULL ) );
  draw_func();
}

// GLUT keyboard callback.
//   key  - character code of the pressed key
//   x, y - pointer position supplied by GLUT (not used here)
// Escape (27) releases the CUDA/GL resources and exits; space (32) runs
// sort_pixels(); every other key is ignored.
static void key_func( unsigned char key, int x, int y ) {
  const unsigned char KEY_ESCAPE = 27;
  const unsigned char KEY_SPACE  = 32;

  if (key == KEY_ESCAPE) {
    // Unregister from CUDA first, then unbind and delete the GL buffer.
    HANDLE_ERROR( cudaGraphicsUnregisterResource( resource ) );
    glBindBuffer( GL_PIXEL_UNPACK_BUFFER_ARB, 0 );
    glDeleteBuffers( 1, &bufferObj );
    exit(0);
  } else if (key == KEY_SPACE) {
    sort_pixels();
  }
}



// Program entry point: creates the GLUT window, allocates a DIM x DIM RGBA
// pixel buffer object, shares it with CUDA, fills it once with kernel(), and
// enters the GLUT main loop.
// Every CUDA runtime call is checked with HANDLE_ERROR, so a failure is
// reported and terminates the program instead of being ignored.
int main(int argc, char *argv[]) {

  // NOTE(review): cudaGLSetGLDevice is marked deprecated in recent CUDA
  // toolkits -- confirm whether it is still needed for the targeted version.
  HANDLE_ERROR( cudaGLSetGLDevice( 0 ) );

  glutInit( &argc, argv );
  glutInitDisplayMode( GLUT_DOUBLE | GLUT_RGBA );
  glutInitWindowSize( DIM, DIM );
  glutCreateWindow( "sort test" );

  glBindBuffer    = (PFNGLBINDBUFFERARBPROC)GET_PROC_ADDRESS("glBindBuffer");
  glDeleteBuffers = (PFNGLDELETEBUFFERSARBPROC)GET_PROC_ADDRESS("glDeleteBuffers");
  glGenBuffers    = (PFNGLGENBUFFERSARBPROC)GET_PROC_ADDRESS("glGenBuffers");
  glBufferData    = (PFNGLBUFFERDATAARBPROC)GET_PROC_ADDRESS("glBufferData");

  // Calling through a NULL entry point would crash; fail with a message instead.
  if (!glBindBuffer || !glDeleteBuffers || !glGenBuffers || !glBufferData) {
    printf( "required OpenGL buffer object functions are not available\n" );
    exit( EXIT_FAILURE );
  }

  // One uchar4 (4 bytes) per pixel.
  glGenBuffers( 1, &bufferObj );
  glBindBuffer( GL_PIXEL_UNPACK_BUFFER_ARB, bufferObj );
  glBufferData( GL_PIXEL_UNPACK_BUFFER_ARB, DIM * DIM * 4, NULL, GL_DYNAMIC_DRAW_ARB );

  HANDLE_ERROR( cudaGraphicsGLRegisterBuffer( &resource, bufferObj, cudaGraphicsMapFlagsNone ) );

  uchar4* devPtr;
  size_t  size;
  HANDLE_ERROR( cudaGraphicsMapResources( 1, &resource, NULL ) );
  HANDLE_ERROR( cudaGraphicsResourceGetMappedPointer( (void**)&devPtr, &size, resource) );

  dim3    grid(DIM/16,DIM/16);
  dim3    threads(16,16);
  kernel<<<grid,threads>>>( devPtr );
  // A launch does not return a status; query it explicitly.
  HANDLE_ERROR( cudaGetLastError() );
  HANDLE_ERROR( cudaGraphicsUnmapResources( 1, &resource, NULL ) );

  // set up GLUT and kick off main loop
  glutKeyboardFunc( key_func );
  glutDisplayFunc( draw_func );
  glutMainLoop();
  return 0;
}

The reduce functor you wrote simply sums everything together (R,G,B components of all pixels). If that is what you want, fine. I think you could compute something like "average pixel intensity" with that. If you want the "average color", then you probably want to average the individual components separately. For this, we would probably use transform_reduce, and the transform functor would convert the input uchar4 into a 3-tuple of int, and the reduce functor would sum the individual components of the 3-tuples together, producing a 3-tuple result of the sum of the R,G, and B components, one in each int of the 3-tuple. Here's a modified version of the above code that does this:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <GL/gl.h>
#include <GL/glut.h>
#include <cuda_gl_interop.h>
#include <GL/glext.h>
#include <GL/glx.h>
#include <thrust/device_ptr.h>
#include <thrust/transform_reduce.h>
#include <thrust/functional.h>
#include <thrust/tuple.h>


#define GET_PROC_ADDRESS( str ) glXGetProcAddress( (const GLubyte *)str )

// Report a failed CUDA runtime call and terminate the program.
//   err  - status code returned by the CUDA call being checked
//   file - source file of the call site (HANDLE_ERROR passes __FILE__)
//   line - source line of the call site (HANDLE_ERROR passes __LINE__)
// Returns normally only when err is cudaSuccess.
static void HandleError( cudaError_t err, const char *file, int line ) {
    // Success: nothing to report.
    if (err == cudaSuccess)
        return;

    const char *message = cudaGetErrorString( err );
    printf( "%s in %s at line %d\n", message, file, line );
    exit( EXIT_FAILURE );
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))



// OpenGL buffer-object entry points. They start out NULL and are assigned in
// main() through GET_PROC_ADDRESS before their first use.
PFNGLBINDBUFFERARBPROC    glBindBuffer     = NULL;
PFNGLDELETEBUFFERSARBPROC glDeleteBuffers  = NULL;
PFNGLGENBUFFERSARBPROC    glGenBuffers     = NULL;
PFNGLBUFFERDATAARBPROC    glBufferData     = NULL;

// Width and height, in pixels, of the window and of the image buffer.
#define     DIM    512

// OpenGL buffer object name; generated, bound and sized in main().
GLuint  bufferObj;
// CUDA graphics resource handle; main() registers bufferObj into it.
cudaGraphicsResource *resource;

// Three int accumulators, one per colour channel taken from uchar4 x, y, z.
typedef thrust::tuple<int, int, int> tpl3;

// Unary functor for thrust::transform_reduce: widens one pixel's x, y and z
// channels to int and returns them as a tpl3. The w channel is not read.
struct transform_functor
{
  __host__ __device__
  tpl3 operator()(uchar4 data) const
  {
    // Construct the tuple directly instead of assigning through the member
    // tuple.get<N>() accessor, which (as far as I know) is not provided by
    // every Thrust release; direct construction avoids depending on it.
    return tpl3((int)data.x, (int)data.y, (int)data.z);
  }
};

// Binary functor for thrust::transform_reduce: adds two tpl3 values element by
// element, so each of the three ints accumulates its own channel.
struct reduce_functor
{
  __host__ __device__
  tpl3 operator()(tpl3 left, tpl3 right) const
  {
    // Use the free function thrust::get<N>() rather than the member
    // tuple.get<N>() accessor, which (as far as I know) is not provided by
    // every Thrust release.
    return tpl3(thrust::get<0>(left) + thrust::get<0>(right),
                thrust::get<1>(left) + thrust::get<1>(right),
                thrust::get<2>(left) + thrust::get<2>(right));
  }
};

// create a green/black pattern
// create a green/black pattern
//
// Launch layout: a 2D grid of 2D blocks with one thread per pixel, covering at
// least DIM x DIM threads (main() launches DIM/16 x DIM/16 blocks of 16 x 16).
// ptr must address DIM*DIM uchar4 elements of device memory.
__global__ void kernel( uchar4 *ptr ) {
  // map from threadIdx/BlockIdx to pixel position
  const int x = threadIdx.x + blockIdx.x * blockDim.x;
  const int y = threadIdx.y + blockIdx.y * blockDim.y;

  // Threads that fall outside the image must not write: this keeps the kernel
  // in bounds for launch configurations that overshoot DIM.
  if (x >= DIM || y >= DIM)
    return;

  // Row-major index into the DIM-wide image. This equals the launch-derived
  // pitch (blockDim.x * gridDim.x) for the configuration used by main().
  const int offset = x + y * DIM;

  // now calculate the value at that position
  const float fx = x/(float)DIM - 0.5f;
  const float fy = y/(float)DIM - 0.5f;
  // Explicit single-precision functions and literals: the result no longer
  // depends on which abs/sin overload the compiler selects.
  const unsigned char green =
      (unsigned char)(128.0f + 127.0f * sinf( fabsf(fx*100.0f) - fabsf(fy*100.0f) ));

  // accessing uchar4 vs unsigned char*
  ptr[offset].x = 0;
  ptr[offset].y = green;
  ptr[offset].z = 0;
  ptr[offset].w = 255;
}

// GLUT display callback: draws a DIM x DIM RGBA image of unsigned bytes and
// swaps the front and back buffers.
static void draw_func( void ) {
  const GLsizei side = DIM;

  // The final argument is 0; main() leaves bufferObj bound to
  // GL_PIXEL_UNPACK_BUFFER_ARB, so the pixels are expected to come from that
  // buffer rather than from client memory.
  glDrawPixels( side, side, GL_RGBA, GL_UNSIGNED_BYTE, 0 );
  glutSwapBuffers();
}
// Map the shared pixel buffer into CUDA, sum each colour channel over all
// DIM*DIM pixels with thrust::transform_reduce, print the per-channel
// averages, unmap the buffer and redraw.
// Every CUDA runtime call is checked with HANDLE_ERROR, matching key_func().
static void sort_pixels(){
  uchar4* devPtr;
  size_t  size;

  HANDLE_ERROR( cudaGraphicsMapResources( 1, &resource, NULL ) );
  HANDLE_ERROR( cudaGraphicsResourceGetMappedPointer( (void**)&devPtr, &size, resource) );

  thrust::device_ptr<uchar4> tptr = thrust::device_pointer_cast(devPtr);

  // All three channel sums start at zero.
  const tpl3 my_init(0, 0, 0);
  tpl3 pix_sum = thrust::transform_reduce(tptr, tptr+(DIM*DIM), transform_functor(), my_init, reduce_functor());

  // Read the sums with the free function thrust::get<N>() rather than the
  // member tuple.get<N>() accessor, which (as far as I know) is not provided
  // by every Thrust release.
  const float pixel_count = (float)(DIM*DIM);
  printf("avg red = %f\n", (float)(thrust::get<0>(pix_sum))/pixel_count);
  printf("avg grn = %f\n", (float)(thrust::get<1>(pix_sum))/pixel_count);
  printf("avg blu = %f\n", (float)(thrust::get<2>(pix_sum))/pixel_count);

  // The buffer must be unmapped before OpenGL draws from it.
  HANDLE_ERROR( cudaGraphicsUnmapResources( 1, &resource, NULL ) );
  draw_func();
}

// GLUT keyboard callback.
//   key  - character code of the pressed key
//   x, y - pointer position supplied by GLUT (not used here)
// Escape (27) releases the CUDA/GL resources and exits; space (32) runs
// sort_pixels(); every other key is ignored.
static void key_func( unsigned char key, int x, int y ) {
  const unsigned char KEY_ESCAPE = 27;
  const unsigned char KEY_SPACE  = 32;

  if (key == KEY_ESCAPE) {
    // Unregister from CUDA first, then unbind and delete the GL buffer.
    HANDLE_ERROR( cudaGraphicsUnregisterResource( resource ) );
    glBindBuffer( GL_PIXEL_UNPACK_BUFFER_ARB, 0 );
    glDeleteBuffers( 1, &bufferObj );
    exit(0);
  } else if (key == KEY_SPACE) {
    sort_pixels();
  }
}



// Program entry point: creates the GLUT window, allocates a DIM x DIM RGBA
// pixel buffer object, shares it with CUDA, fills it once with kernel(), and
// enters the GLUT main loop.
// Every CUDA runtime call is checked with HANDLE_ERROR, so a failure is
// reported and terminates the program instead of being ignored.
int main(int argc, char *argv[]) {

  // NOTE(review): cudaGLSetGLDevice is marked deprecated in recent CUDA
  // toolkits -- confirm whether it is still needed for the targeted version.
  HANDLE_ERROR( cudaGLSetGLDevice( 0 ) );

  glutInit( &argc, argv );
  glutInitDisplayMode( GLUT_DOUBLE | GLUT_RGBA );
  glutInitWindowSize( DIM, DIM );
  glutCreateWindow( "sort test" );

  glBindBuffer    = (PFNGLBINDBUFFERARBPROC)GET_PROC_ADDRESS("glBindBuffer");
  glDeleteBuffers = (PFNGLDELETEBUFFERSARBPROC)GET_PROC_ADDRESS("glDeleteBuffers");
  glGenBuffers    = (PFNGLGENBUFFERSARBPROC)GET_PROC_ADDRESS("glGenBuffers");
  glBufferData    = (PFNGLBUFFERDATAARBPROC)GET_PROC_ADDRESS("glBufferData");

  // Calling through a NULL entry point would crash; fail with a message instead.
  if (!glBindBuffer || !glDeleteBuffers || !glGenBuffers || !glBufferData) {
    printf( "required OpenGL buffer object functions are not available\n" );
    exit( EXIT_FAILURE );
  }

  // One uchar4 (4 bytes) per pixel.
  glGenBuffers( 1, &bufferObj );
  glBindBuffer( GL_PIXEL_UNPACK_BUFFER_ARB, bufferObj );
  glBufferData( GL_PIXEL_UNPACK_BUFFER_ARB, DIM * DIM * 4, NULL, GL_DYNAMIC_DRAW_ARB );

  HANDLE_ERROR( cudaGraphicsGLRegisterBuffer( &resource, bufferObj, cudaGraphicsMapFlagsNone ) );

  uchar4* devPtr;
  size_t  size;
  HANDLE_ERROR( cudaGraphicsMapResources( 1, &resource, NULL ) );
  HANDLE_ERROR( cudaGraphicsResourceGetMappedPointer( (void**)&devPtr, &size, resource) );

  dim3    grid(DIM/16,DIM/16);
  dim3    threads(16,16);
  kernel<<<grid,threads>>>( devPtr );
  // A launch does not return a status; query it explicitly.
  HANDLE_ERROR( cudaGetLastError() );
  HANDLE_ERROR( cudaGraphicsUnmapResources( 1, &resource, NULL ) );

  // set up GLUT and kick off main loop
  glutKeyboardFunc( key_func );
  glutDisplayFunc( draw_func );
  glutMainLoop();
  return 0;
}

Using Thrust's reduce operator with Pixel uchar4 data error

1 Answer

Linked