I recently started CUDA programming. I followed tutorials and guides about it, and made a first program copied from the original example ( here pg. 25 ), which basically use the GPU to add two vectors. I can compile it but the result I get isn't the one I expected.
Here is my code :
#include <iostream>
#include <random>
using namespace std;
//Size of vector
#define N 16
//Kernel for adding vector
__global__ void add(int* a, int* b, int* c){
c[blockIdx.x] = a[blockIdx.x] + b[blockIdx.x];
}
//Functions
void printArray(int* a){
cout << "[ ";
for(int i = 0; i < N; i++){
cout << *(a+i) << " ";
} cout << "]" << endl;
}
void fillRandomValue(int* a, int size, int range){
for(int i = 0; i < size; i++){
*(a+i) = rand() % range;
}
}
int main(void){
//Declaration of vector for host and device : a for host ; d_a for device a ;
int* a,* b,* c;
int* d_a,* d_b,*d_c;
int size = N * sizeof(int);
//Allocate memory into the device
cudaMalloc((void** )&d_a, size);
cudaMalloc((void** )&d_b, size);
cudaMalloc((void** )&d_c, size);
//Declaring 3 array
a = new int[size];
b = new int[size];
c = new int[size];
//Randomn array
//Initialize randomn seed
srand (time(NULL));
fillRandomValue(a, N, 10);
fillRandomValue(b, N, 10);
//Copy host to device
cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice);
//Adding vector
add<<<N,1>>>(d_a, d_b, d_c);
//Get back the result
cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);
//Output to console
cout << "a = "; printArray(a); cout << endl;
cout << "b = "; printArray(b); cout << endl;
cout << "a + b = "; printArray(c); cout << endl;
//Free memory
free(a); free(b); free(c);
cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
return 0;
}
The problem is I get the following result:
a = [ 5 5 7 2 9 5 3 5 4 5 2 9 6 7 4 9 ]
b = [ 9 6 9 8 9 7 5 6 6 6 2 8 7 3 1 2 ]
a + b = [ -971240544 44196 -2117203120 54617 1031799296 65038 1031800320 65038 -2117199280 54617 -1009853744 44196 -1048817168 44196 -1307968288 54865 ]
I tried to see if those weren't address, but no. I also tried to copy the straight code from the link above but I got no luck.
I can reproduce this behavior as much as I want .