I wrote two programs that add vectors of length 1024, and ran the addition 1024 times in each to see which was faster. One of them is CUDA-based and the other is not. I expected the CUDA version to do better, but it didn't. Both programs are below. Am I doing something wrong here? Do I need longer arrays? Any longer array in the CUDA version produces a cudaError.
Non-cuda code:
#include <iostream>
#include <cstdlib>
#include <chrono>
#include <fstream>
using namespace std;
// CPU baseline: repeats a 1024-element vector add `addLen` times,
// timing each iteration (fill + add) and dumping the per-iteration
// tick counts to outdata.dat.
int main()
{
const int addLen = 1024;              // number of timed repetitions
const unsigned long addCount = 1024;  // elements per vector
double *timeStops = new double[addLen];
int *arr1 = new int[addCount];
int *arr2 = new int[addCount];
int *arr3 = new int[addCount];
int max = 100;  // rand() values are in [0, max)
for (int j = 0; j < addLen; j++) {
auto tstart = chrono::high_resolution_clock::now();
for (unsigned long i = 0; i < addCount; i++) {
arr1[i] = rand() % max;
arr2[i] = rand() % max;
}
for (unsigned long i = 0; i < addCount; i++) {
arr3[i] = arr1[i] + arr2[i];
}
auto tend = chrono::high_resolution_clock::now();
// count() is in clock ticks (typically nanoseconds); stored as double.
timeStops[j] = (tend - tstart).count();
}
// BUG FIX: `delete[] arr1, arr2, arr3;` is a comma expression and
// only freed arr1 — each array needs its own delete[].
delete[] arr1;
delete[] arr2;
delete[] arr3;
ofstream outdata;
outdata.open("outdata.dat");
for (int j = 0; j < addLen; j++) {
outdata << timeStops[j] << endl;
}
outdata.close();
delete[] timeStops;
}
CUDA code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <chrono>
#include <fstream>
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);
// Element-wise vector add: c[i] = a[i] + b[i], one thread per element.
//
// Uses the full global index instead of bare threadIdx.x, so the kernel
// also works when launched with more than one block.  Under the current
// <<<1, size>>> launch blockIdx.x is always 0, so behavior is unchanged.
// The element count is not passed in, so there is no bounds check: the
// caller must launch exactly one thread per element.
__global__ void addKernel(int *c, const int *a, const int *b)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
c[i] = a[i] + b[i];
}
using namespace std;
// GPU benchmark driver: repeats a 10-element vector add `addLen` times,
// timing each full addWithCuda() call and dumping the per-iteration tick
// counts to outcudadata.dat.
int main()
{
const int addLen = 1024;             // number of timed repetitions
const unsigned long arraySize = 10;  // elements per vector
double* timeStops = new double[addLen];
int* a = new int[arraySize];
int* b = new int[arraySize];
int* c = new int[arraySize];
for (int j = 0; j < addLen; j++) {
for (unsigned long i = 0; i < arraySize; i++) {
a[i] = rand() % 100;
b[i] = rand() % 100;
}
auto tstart = chrono::high_resolution_clock::now();
// NOTE: this times the entire helper, which does cudaMalloc,
// two host->device copies, the launch, and a device->host copy
// on EVERY call.  For a 10-element add that per-call setup and
// PCIe-transfer overhead dwarfs the actual compute, which is why
// the CPU version wins at these sizes.  To benchmark the GPU
// fairly, allocate once, use much larger arrays, and time with
// cudaEvent_t around the kernel.
cudaError_t cudaStatus = addWithCuda(c, a, b, arraySize);
auto tend = chrono::high_resolution_clock::now();
// Don't silently discard failures — this is how a too-large
// launch would have announced itself.
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "addWithCuda failed: %s\n", cudaGetErrorString(cudaStatus));
}
timeStops[j] = (tend - tstart).count();
}
cudaDeviceReset();
// BUG FIX: `delete[] a, b, c;` is a comma expression and only freed
// `a` — each array needs its own delete[].
delete[] a;
delete[] b;
delete[] c;
ofstream outdata;
outdata.open("outcudadata.dat");
for (int j = 0; j < addLen; j++) {
outdata << timeStops[j] << endl;
}
outdata.close();
delete[] timeStops;
}
// Helper function for using CUDA to add vectors in parallel.
//
// Allocates device buffers, copies a and b to the device, launches
// addKernel with one thread per element, and copies the result back
// into c.  Returns the first CUDA error encountered (cudaSuccess on a
// clean run); device buffers are always freed via the Error: path.
//
// NOTE: the kernel is launched as a SINGLE block of `size` threads, so
// `size` must not exceed the device's max threads per block (1024 on
// current hardware).  Larger sizes make the launch itself fail — that
// is the cudaError seen with longer arrays.  The fix is a multi-block
// launch (ceil-div grid) paired with a bounds-checked kernel.
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
int *dev_a = 0;
int *dev_b = 0;
int *dev_c = 0;
cudaError_t cudaStatus;
// Every CUDA runtime call returns a status; the original ignored all
// of them and left the Error: label dead.  Check each call and jump
// to the shared cleanup path on failure.
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) goto Error;
cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
if (cudaStatus != cudaSuccess) goto Error;
cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
if (cudaStatus != cudaSuccess) goto Error;
cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
if (cudaStatus != cudaSuccess) goto Error;
cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) goto Error;
cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) goto Error;
addKernel<<<1, size>>>(dev_c, dev_a, dev_b);
// Kernel launches return no status directly: launch-configuration
// errors (e.g. size > max threads per block) must be fetched with
// cudaGetLastError(), and in-kernel faults surface at the sync.
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) goto Error;
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) goto Error;
cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
Error:
// cudaFree(nullptr) is a no-op, so this is safe on partial failure.
cudaFree(dev_c);
cudaFree(dev_a);
cudaFree(dev_b);
return cudaStatus;
}