My program creates 3 Matrices, A, B, and C. it then stores the dot product of A dot B in C. I'm trying to compare the time it takes to do this on the CPU vs GPU. I've written two functions CPUDot and GPUDot. CPUDot computes the dot product in that function and GPUDot calls a kernel function.
The program works fine for relativly small values of size, but once size is around 650-675 it doesn't work. The kernal still executes and it still prints the time that it took, but it crashes in subtract(C, D, diff);
When it does this I added a printf statement printing the index to see where it stops working and it can't access the first index of one of the arrays.
All of the matricies are the same size m x n.
I'm not sure if this is due to my specific GPU's memory size. I'm using a GTX 965m, and I'm pretty sure it has 1024 CUDA cores and 2gb VRam.
#include <stdio.h>
#define MAX_THREADS 1024
#define MAX_BLOCKS 65535
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
__global__ void Dot(double *A, double *B, double *C, int m, int n, int h){
int fi = blockIdx.x;
int fm = gridDim.x;
int si = threadIdx.x;
int sm = blockDim.x;
int index = fi * sm + si;
int stride = fm * sm;
//printf("Index: %d\nStride: %d\n", index, stride);
for(int i = index; i < m*n; i += stride){
// Pretend this is a two dimensional array. Get [x][y] parameters for C
int x = i % n;
int y = i / n;
double counter = 0;
for(int j = 0; j < h; ++j){
// Pretending this is a t dimensional array get [x][y] parameters for A and B
int ax = j;
int ay = y;
int bx = x;
int by = j;
// Convert [x][y] -> [i] since data is stored as single dimensional array for A and B
int ai = ay * h + ax;
int bi = by * n + bx;
//printf("ai, bi, ci, %d, %d\n", ai, bi);
double a = A[ai];
double b = B[bi];
counter += a * b;
}
C[i] = counter;
}
}
class Matrix{
public:
int width;
int height;
double *elements;
Matrix(int m, int n){
this->width = n;
this->height = m;
gpuErrchk(cudaMallocManaged(&this->elements, m * n * sizeof(double)));
for(int i = 0; i < m*n; ++i){
this->elements[i] = i;
}
}
~Matrix(){
gpuErrchk(cudaFree(this->elements));
}
void populate(double value){
for(int i = 0; i < this->width * this->height; ++i){
this->elements[i] = value;
}
}
void print(){
int i = 0;
for(int y = 0; y < this->height; ++y){
for(int x = 0; x < this->width; ++x){
printf("%lf ", this->elements[i]);
++i;
}
printf("\n");
}
}
};
void GPUDot(Matrix &A, Matrix &B, Matrix &C){
double *aData = A.elements;
double *bData = B.elements;
double *cData = C.elements;
int threads = A.height;
if(threads > MAX_THREADS){
threads = MAX_THREADS;
}
int blocks = A.width;
if(blocks > MAX_BLOCKS){
blocks = MAX_BLOCKS;
}
Dot<<<1, 1>>>(aData, bData, cData, A.height, B.width, A.width);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );
}
void CPUDot(const Matrix &A, const Matrix &B, Matrix &C){
// Rectangle matricies are stored as single dimension array
// Pretending it is stored as a two dimensional array x, y would be the values array[y][x]
for(int y = 0; y < C.height; ++y){
for(int x = 0; x < C.width; ++x){
// Calculate the actual index given two dimensional parameters
int ci = y * C.width + x;
// Get new x, y parameters for the A matrix and B matrix
int ay = y;
int bx = x;
C.elements[ci] = 0;
for(int c = 0; c < A.width; ++c){
int ax = c;
int by = c;
// Calculate the actual index for A and B given x, y
int ai = ay * A.height + ax;
int bi = by * B.width + bx;
C.elements[ci] += A.elements[ai] * B.elements[bi];
}
}
}
}
void subtract(const Matrix &A, const Matrix &B, Matrix &C){
for(int i = 0; i < A.height * A.width; ++i){
C.elements[i] = A.elements[i] - B.elements[i];
}
}
double sum(const Matrix &A){
double sum = 0;
for(int i = 0; i < A.height * A.width; ++i){
sum += A.elements[i];
}
return sum;
}
int main(){
int size = 100;
printf("Dotting two matricies of size: %d\n", size);
Matrix A(size, size);
Matrix B(size, size);
Matrix C(size, size);
Matrix D(size, size);
Matrix diff(size, size);
A.populate(1);
B.populate(2);
C.populate(0);
time_t t;
//A.print();
//B.print();
t = clock();
CPUDot(A, B, C);
t = clock() - t;
printf("CPU Dotting took %lf ms\n", ((float)t / CLOCKS_PER_SEC) * 1000);
//C.print();
//A.populate(3);
t = clock();
GPUDot(A, B, D);
t = clock() - t;
printf("GPU Dotting took %lf ms\n", ((float)t / CLOCKS_PER_SEC) * 1000);
//D.print();
subtract(C, D, diff);
printf("Subtracted the matricies\n");
double error = sum(diff);
printf("Error: %lf\n", error);
return 0;
}
I know this isn't the best way to go about this problem - I should be utilizing the first parameter in Dot<<<1, threads>>>(parameters); but I was just trying to see if this worked and I don't know why it doesn't.
edit: I added proper cuda error checking I updated the code to show the changes I've made
I not get a gpuassert which reads
GPUassert: unspecified launch failure Matrix.cu 98
I also read on another post that a "unspecified launch failure" is almost always a segfault, but I would assume that if I had an indexing issue it would always be present not just when the size of the matrix is too large. I also changed threads to 1 so the numbers inside the brackets are always <<<1, 1>>> so the index and stride are always 0, 1 respectively