I have a simple task, that I can't seem to solve. I got 2 unidimensional arrays (called vectors) consisting of 10 elements. Each element of the array contains a random positive number. The goal is to use CUDA to calculate the sum of those 2 arrays of each index number (in other words: Vector Sum[0] = Vector A[0] + Vector B[0], then the same with 1,2...10)
Here is my code (kernel.cu). I know I am using the "float-anything" variable names for integer data types. That's because I initially planned to do it on float data types but I could not get the project working at all as a result of data type incompatibilities. Correct me if it's actually possible using float data types for this.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
__global__ void vecAdd_kernel(int *floatAr1gpu, int *floatAr2gpu, int *floatSumGPU, int The_N){
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < The_N) floatSumGPU[i] = floatAr1gpu[i] + floatAr2gpu[i];
}
int main()
{
const unsigned int arraySize = 10;
int floatArray1[arraySize];
int floatArray2[arraySize];
int *floatAr1gpu = 0;
int *floatAr2gpu = 0;
int floatSum[arraySize];
int *floatSumGPU = 0;
for (int c = 0; c < arraySize; c++) {
floatArray1[c] = (rand() % 10)+1;
floatArray2[c] = (rand() % 10)+1;
}
//Put the data into the GPU now
// V--- This is allocating GPU memory under that name and Variable
cudaMalloc((void **)&floatArray1, sizeof(float)*arraySize);
cudaMalloc((void **)&floatArray2, sizeof(float)*arraySize);
cudaMalloc((void **)&floatSum, sizeof(float)*arraySize);
// CPU Memory GPU Mem Array size Method
cudaMemcpy(floatArray1, floatAr1gpu, sizeof(float)*arraySize, cudaMemcpyHostToDevice);
cudaMemcpy(floatArray2, floatAr2gpu, sizeof(float)*arraySize, cudaMemcpyHostToDevice);
// execute
// grid size, block size
vecAdd_kernel << < 1, arraySize >> > (floatArray1, floatArray2, floatSum, arraySize);
//Copy data back from GPU to RAM
// GPU Memory CPU Mem Array size Method
cudaMemcpy(floatSumGPU, floatSum, sizeof(float)*arraySize, cudaMemcpyDeviceToHost);
// clean up
cudaFree(floatArray1);
cudaFree(floatArray2);
cudaFree(floatSum);
for (int cc = 0; cc < arraySize; cc++) {
std::cout << "Result of array number " << cc << " = " << floatSum[cc] << std::endl;
}
std::cout << "Done. Press any key to exit." << std::endl;
char key = std::cin.get();
return 0;
}
This is what I get as a result: Program result
This is what I want to achieve (using CUDA): Program result
What's wrong with the code? I placed a break-point to check that array here: array contents