I am learning some basic CUDA programming. I am trying to initialize an array on the Host with host_a[i] = i
. This array consists of N = 128 integers. I am launching a kernel with 1 block and 128 threads per block, in which I want to square the integer at index i
.
My questions are:
How do I come to know whether the kernel gets launched or not? Can I use
printf
within the kernel?The expected output for my program is a space-separated list of squares of integers -
1 4 9 16 ...
.
What's wrong with my code, since it outputs 1 2 3 4 5 ...
Code:
#include <iostream>
#include <numeric>
#include <stdlib.h>
#include <cuda.h>
const int N = 128;
__global__ void f(int *dev_a) {
unsigned int tid = threadIdx.x;
if(tid < N) {
dev_a[tid] = tid * tid;
}
}
int main(void) {
int host_a[N];
int *dev_a;
cudaMalloc((void**)&dev_a, N * sizeof(int));
for(int i = 0 ; i < N ; i++) {
host_a[i] = i;
}
cudaMemcpy(dev_a, host_a, N * sizeof(int), cudaMemcpyHostToDevice);
f<<<1, N>>>(dev_a);
cudaMemcpy(host_a, dev_a, N * sizeof(int), cudaMemcpyDeviceToHost);
for(int i = 0 ; i < N ; i++) {
printf("%d ", host_a[i]);
}
}