I'm working on one tutorial for the vector_reduction algorithm implemented using CUDA C++ API and I'm struggling because I really don't understand what I'm doing wrong because the result is (device: 4386.000000 host: 260795.000000)
The code that I'm using is the following (the problem size is fixed at 512).
EDIT: Unfortunately the problem has not been solved and I still get the same result. I have updated the code providing the complete code. The goal is the same, to sum all the elements of an array of float of 512 elements.
#define NUM_ELEMENTS 512
__global__ void reduction(float *g_data, int n)
{
__shared__ float s_data[NUM_ELEMENTS];
int tid = threadIdx.x;
int index = tid + blockIdx.x*blockDim.x;
s_data[tid] = 0.0;
if (index < n){
s_data[tid] = g_data[index];
}
__syncthreads();
for (int s = 2; s <= blockDim.x; s = s * 2){
if ((tid%s) == 0){
s_data[tid] += s_data[tid + s / 2];
}
__syncthreads();
}
if (tid == 0){
g_data[blockIdx.x] = s_data[tid];
}
}
// includes, system
#include <cuda_runtime.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <float.h>
// includes, kernels
#include "vector_reduction_kernel.cu"
// For simplicity, just to get the idea in this MP, we're fixing the problem size to 512 elements.
#define NUM_ELEMENTS 512
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
void runTest( int argc, char** argv);
float computeOnDevice(float* h_data, int array_mem_size);
extern "C"
void computeGold( float* reference, float* idata, const unsigned int len);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main( int argc, char** argv)
{
cudaSetDevice(0);
runTest( argc, argv);
return EXIT_SUCCESS;
}
////////////////////////////////////////////////////////////////////////////////
//! Run naive scan test
////////////////////////////////////////////////////////////////////////////////
void runTest( int argc, char** argv)
{
int num_elements = NUM_ELEMENTS;
const unsigned int array_mem_size = sizeof( float) * num_elements;
// Allocate host memory to store the input data
float* h_data = (float*) malloc( array_mem_size);
// initialize the input data on the host to be integer values
// between 0 and 1000
for( unsigned int i = 0; i < num_elements; ++i)
h_data[i] = floorf(1000*(rand()/(float)RAND_MAX));
// Function to compute the reference solution on CPU using a C sequential version of the algorithm
// It is written in the file "vector_reduction_gold.cpp". The Makefile compiles this file too.
float reference = 0.0f;
computeGold(&reference , h_data, num_elements);
// Function to compute the solution on GPU using a call to a CUDA kernel (see body below)
// The kernel is written in the file "vector_reduction_kernel.cu". The Makefile also compiles this file.
float result = computeOnDevice(h_data, num_elements);
// We can use an epsilon of 0 since values are integral and in a range that can be exactly represented
float epsilon = 0.0f;
unsigned int result_regtest = (abs(result - reference) <= epsilon);
printf( "Test %s\n", (1 == result_regtest) ? "Ok." : "No.");
printf( "device: %f host: %f\n", result, reference);
// cleanup memory
free( h_data);
}
// Function to call the CUDA kernel on the GPU.
// Take h_data from host, copies it to device, setup grid and thread
// dimensions, excutes kernel function, and copy result of scan back
// to h_data.
// Note: float* h_data is both the input and the output of this function.
float computeOnDevice(float* h_data, int num_elements)
{
float* d_data = NULL;
float result;
// Memory allocation on device side
cudaMalloc((void**)&d_data, sizeof(float)*num_elements);
// Copy from host memory to device memory
cudaMemcpy((void**)&d_data, h_data, num_elements * sizeof(float), cudaMemcpyHostToDevice );
//int threads = (num_elements/2) + num_elements%2;
int threads = (num_elements);
// Invoke the kernel
reduction<<< 1 ,threads >>>(d_data,num_elements);
// Copy from device memory back to host memory
cudaMemcpy(&result, d_data, sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(d_data);
cudaDeviceReset();
return result;
}
float computeOnDevice(float* h_data, int num_elements)
{
float* d_data = NULL;
float result;
// Memory allocation on device side
cudaMalloc((void**)&d_data, sizeof(float)*num_elements);
// Copy from host memory to device memory
cudaMemcpy(d_data, h_data, num_elements * sizeof(float), cudaMemcpyHostToDevice );
int threads = (num_elements);
// Invoke the kernel
reduction<<< 1 ,threads >>>(d_data,num_elements);
// Copy from device memory back to host memory
cudaMemcpy(&result, d_data, sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(d_data);
cudaDeviceReset();
return result;
}