I have two functions: the add_cpu function works fine, but the add_gpu function does not.
I tried checking some options in my GPU driver software and read my code over and over again. I tried the exact same code on another machine and it worked fine. The checkError result on the current machine is 1, which it shouldn't be, while the checkError result on my laptop is 0, which is correct. Does anyone have a suggestion as to what the problem with the graphics card or the system might be? I have no clue what the problem is here. Did I miss some sort of option?
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <iostream>
#include <math.h>
// Shorthand for starting a stream-output statement: `out x;` expands to `std::cout << x;`.
#define out std::cout <<
// Shorthand for finishing a stream-output statement: a trailing `end` expands to `<< std::endl`.
// NOTE(review): both macros textually replace every later use of the identifiers
// `out` and `end` in this file, so those names cannot be used for anything else.
#define end << std::endl
/**
 * GPU kernel: element-wise in-place addition, y[i] = x[i] + y[i] for 0 <= i < n.
 *
 * Launch layout: any 1D grid of 1D blocks. Each thread starts at its flat
 * global index and advances by the total number of launched threads, so every
 * element is processed exactly once regardless of the launch configuration
 * (a <<<1, 1>>> launch still walks the whole array sequentially).
 * No shared memory is used.
 *
 * @param n number of elements to process; nothing is done when n <= 0
 * @param x addends, readable on the device, at least n elements
 * @param y accumulated in place, readable/writable on the device, at least n elements
 */
__global__
void add_gpu( int n, float* x, float* y ) {
    // 64-bit index so that `i += stride` cannot overflow when n is close to INT_MAX.
    const long long stride = (long long)gridDim.x * blockDim.x;
    for ( long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x; i < n; i += stride ) {
        y[i] = x[i] + y[i];
    }
}
/**
 * Host reference implementation of the element-wise in-place addition:
 * y[i] = x[i] + y[i] for 0 <= i < n, visited in ascending index order.
 *
 * @param n number of elements to process; nothing is done when n <= 0
 * @param x addends, at least n elements
 * @param y accumulated in place, at least n elements
 */
void add_cpu( int n, float* x, float* y ) {
    int idx = 0;
    while ( idx < n ) {
        const float sum = x[idx] + y[idx];
        y[idx] = sum;
        ++idx;
    }
}
/**
 * Fills the first n elements of x with 1.0f and of y with 2.0f.
 * Elements are written pairwise in ascending order (x[i] first, then y[i]).
 *
 * @param n number of elements to fill; nothing is written when n <= 0
 * @param x destination for the 1.0f values, at least n elements
 * @param y destination for the 2.0f values, at least n elements
 */
void init( int n, float* x, float* y ) {
    float* xCursor = x;
    float* yCursor = y;
    for ( int remaining = n; remaining > 0; --remaining ) {
        *xCursor++ = 1.0f;
        *yCursor++ = 2.0f;
    }
}
/**
 * Returns the largest absolute deviation |y[i] - f| over the first n elements of y.
 *
 * The result is returned as float. Returning it as int truncated the value
 * toward zero, so any deviation smaller than 1.0 was reported as 0 ("no error").
 * Callers that store the result in an int still compile and see the truncated value.
 *
 * Note: fmaxf ignores a NaN operand, so NaN entries in y do not raise the result.
 *
 * @param n number of elements to inspect; the result is 0.0f when n <= 0
 * @param f expected value of every element
 * @param y values to check, at least n elements, readable on the host
 * @return maximum of |y[i] - f| for 0 <= i < n
 */
float checkError( int n, float f, float* y ) {
    float worst = 0.0f;
    // Single-precision variants keep the whole computation in float.
    for ( int i = 0; i < n; i++ ) worst = fmaxf( worst, fabsf( y[i] - f ) );
    return worst;
}
/**
 * Writes a label followed by the first n elements of obj, separated by ", ",
 * to std::cout and finishes the line with std::endl.
 *
 * The label is taken as const char*: binding a string literal (including the
 * default "obj: ") to a non-const char* is not valid C++11 and later.
 * Existing callers passing char* or literals are unaffected.
 *
 * @param n   number of elements to print; when n <= 0 only the label and the
 *            line break are written (no element is read)
 * @param obj values to print, at least n elements, readable on the host
 * @param str label written before the values
 */
void print( int n, float* obj, const char* str = "obj: " ) {
    std::cout << str;
    for ( int i = 0; i < n; i++ ) {
        if ( i > 0 ) std::cout << ", ";
        std::cout << obj[i];
    }
    std::cout << std::endl;
}
// Reports a failed CUDA runtime call on std::cerr; returns true when status is cudaSuccess.
static bool checkCuda( cudaError_t status, const char* what ) {
    if ( status == cudaSuccess ) return true;
    std::cerr << "CUDA error in " << what << ": " << cudaGetErrorString( status ) << std::endl;
    return false;
}

/**
 * Demo driver: allocates two managed arrays of 32 floats, fills them with
 * 1.0f / 2.0f, adds them on the GPU and prints the maximum deviation of the
 * result from the expected 3.0f.
 *
 * Every CUDA runtime call and the kernel launch are checked. Without these
 * checks a failing allocation, launch or execution goes unnoticed and the
 * program merely reports a non-zero "error" for the untouched data.
 *
 * @return 0 on success, 1 when any CUDA call failed
 */
int main( ) {
    const int n = 1 << 5;
    float* x = 0;
    float* y = 0;
    float error = 0.0f;

    if ( !checkCuda( cudaMallocManaged( &x, n * sizeof( float ) ), "cudaMallocManaged(x)" ) ) return 1;
    if ( !checkCuda( cudaMallocManaged( &y, n * sizeof( float ) ), "cudaMallocManaged(y)" ) ) {
        cudaFree( x );
        return 1;
    }

    init( n, x, y );
    print( n, x, "x" );
    print( n, y, "y" );

    add_gpu<<<1, 1>>>( n, x, y );
    //add_cpu(n, x, y);

    // A kernel launch returns no status: launch errors are fetched with
    // cudaGetLastError(), execution errors surface at the synchronizing call.
    // The synchronize is also what makes y safe to read on the host afterwards.
    const bool ok = checkCuda( cudaGetLastError( ), "add_gpu launch" )
                 && checkCuda( cudaDeviceSynchronize( ), "cudaDeviceSynchronize" );

    if ( ok ) {
        print( n, y, "y" );
        error = checkError( n, 3.0f, y );
        std::cout << "error: " << error << std::endl;
    }

    cudaFree( x );
    cudaFree( y );
    return ok ? 0 : 1;
}