I'm writing a program in CUDA that given a static matrix, it is filled with a given value, but I don't know why it gives me segfault... I think the line that gives it is when I try to copy the matrix back on the host, but I can't figure out a different way to do it.
#include <cuda.h>
#include <iostream>
using namespace std;
__global__ void initKernel(float A[][65536], int n, int m, float value){
int i = blockDim.x*blockIdx.x + threadIdx.x;
int x,y;
if(i<n*m){
x=i/m;
y=i%m;
}
A[x][y]=value;
}
void matrixInit(float A[][65536], int n, int m, float value){
int size=(n*m)*sizeof(float);
int block_size = 32, number_of_blocks = ceil((n*m)/block_size);
float (*d_A)[65536];
cudaMalloc((void**)&d_A, size);
cudaMemcpy(d_A, A, (n*m)*sizeof(float), cudaMemcpyHostToDevice);
initKernel<<<number_of_blocks, block_size>>>(d_A, n,m,value);
cudaMemcpy(A,d_A,size,cudaMemcpyDeviceToHost);
}
int main(){
int n=4096;
int m=65536;
float A[4096][65536];
matrixInit(A,n,m,1.0);
}