I wrote a pretty simple Cuda Program. I would like to assign values to a matrix in device memory. Then I want to copy the values to the host and display them. The program I wrote does not work. But I don't know why. I tried to figure out what I do wrong by displaying the status with cout, but even this does not work, so I am thinking that the main function is not starting.
Does anyone know what's the problem?
Here is my code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdio.h>
const int N = 1024;
__global__ void matrix(float *d_A)
{
int col = blockIdx.x * blockDim.x + threadIdx.x;
int row = blockIdx.y * blockDim.y + threadIdx.y;
int index = col + row * N;
if (col < N && row < N)
{
d_A[index] = 255;
}
}
int main()
{
std::cout << "Programm begins";
float A[N * N];
float d_A[N * N];
cudaMalloc((void**)&d_A, (N * N)*sizeof(float));
std::cout << "Matrizes allocated";
std::cout << A[0] << " , " << A[1] << " , " << A[2] << " , " << A[3] << " , " << A[4] << " , " << A[5] << "\n";
std::cout << A[1024] << " , " << A[1025] << " , " << A[1026] << " , " << A[1027] << " , " << A[1028] << " , " << A[1029] << "\n";
matrix << <1024, 1024 >> >(d_A);
std::cout << "Wrote Matrix to local device memory";
std::cout << d_A[0] << " , " << d_A[1] << " , " << d_A[2] << " , " << d_A[3] << " , " << d_A[4] << " , " << d_A[5] << "\n";
std::cout << d_A[1024] << " , " << d_A[1025] << " , " << d_A[1026] << " , " << d_A[1027] << " , " << d_A[1028] << " , " << d_A[1029] << "\n";
cudaMemcpy(A, d_A, N * N * sizeof(float), cudaMemcpyDeviceToHost);
std::cout << "Wrote Matrix to host memory";
std::cout << A[0] << " , " << A[1] << " , " << A[2] << " , " << A[3] << " , " << A[4] << " , " << A[5] << "\n";
std::cout << A[1024] << " , " << A[1025] << " , " << A[1026] << " , " << A[1027] << " , " << A[1028] << " , " << A[1029] << "\n";
return 0;
}