Hi i am making my first steps in CUDA technology but i think i do not get it right.
I am trying to multiply two dimensional array by vector but something is not working
Here is the code I am trying to figure out:
#include <stdio.h>
#include <stdlib.h>
#define N 2
__global__ void Multiply(int A[N][N], int B[N], int C[N]){
int i = threadIdx.x;
int j = threadIdx.y;
int sum = A[i][j] * B[j];
C[i]= sum;
printf("%d,%d ", sum, C[i]);
}
int main(){
int A[N][N] ={ {1,1},
{1,1}
};
int B[N] = {4,6};
int C[N] = {0,0};
int (*aA)[N], (*aB), (*aC);
cudaMalloc((void**)&aA, (N*N)*sizeof(int));
cudaMalloc((void**)&aB, (N)*sizeof(int));
cudaMalloc((void**)&aC, (N)*sizeof(int));
cudaMemcpy(aA, A, (N*N)*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(aB, B, (N)*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(aC, C, (N)*sizeof(int), cudaMemcpyHostToDevice);
int numBlocks = 1;
dim3 threadsPerBlock(N,N);
Multiply<<<numBlocks,threadsPerBlock>>>(aA,aB,aC);
cudaMemcpy(C, aC, (N)*sizeof(int), cudaMemcpyDeviceToHost);
cudaFree(aA);
cudaFree(aB);
cudaFree(aC);
printf("\n");
system("pause");
}
in this case the Output is : 4,6 4,6 6,6 6,6 so basically the sum i giving the right values but C[i] is returning always 6 although there is sum value assigned to it.
What am I doing wrong?