I am trying to get a better grasp of memory management in CUDA. Something is only now occurring to me as a major gap in my understanding: how do kernels access values that, as I understand it, should be in host memory?
When vectorAdd() is called, it runs on the device. But only the elements are stored in device memory; the lengths of the vectors are stored on the host. How is it that the kernel does not exit with an error when it tries to access foo.length, something that should be on the host?
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
/* A length-tagged array of floats. */
typedef struct
{
    float* elements;  /* pointer to the element storage */
    int    length;    /* number of floats reachable through `elements` */
} vector;
/*
 * Element-wise in-place add: foo.elements[i] += bar.elements[i].
 *
 * Launch layout: 1D grid of 1D blocks; each thread handles at most one
 * element, so gridDim.x * blockDim.x must be >= the number of elements.
 *
 * Both structs are passed BY VALUE. The launch copies the argument bytes
 * (the `elements` pointer and the `length` int) to the device, so reading
 * foo.length here reads the kernel's own copy, not host memory. Only what
 * `elements` points to has to live in device memory.
 */
__global__ void vectorAdd(vector foo, vector bar){
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    /* Guard on both lengths: bar.elements is read as well, so a shorter
       bar must not be indexed past its end. */
    if(idx < foo.length && idx < bar.length){
        foo.elements[idx] += bar.elements[idx];
    }
}
/* Prints a diagnostic for a failed CUDA runtime call; returns 1 on failure, 0 on success. */
static int cudaFailed(cudaError_t status, const char *what){
    if(status == cudaSuccess){
        return 0;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return 1;
}

/*
 * Allocates two 50-element device vectors, zero-fills them, adds bar into
 * foo on the device, and releases the buffers.
 * Returns 0 on success, EXIT_FAILURE if any CUDA call reported an error.
 */
int main(void){
    const int length = 50;
    const size_t bytes = sizeof(float) * length;
    const int threads_per_block = 5;
    /* ceil-div so the grid always covers every element */
    const int blocks_per_grid = (length + threads_per_block - 1) / threads_per_block;
    int failed;

    vector foo, bar;
    foo.elements = NULL;
    bar.elements = NULL;
    foo.length = bar.length = length;

    failed = cudaFailed(cudaMalloc(&(foo.elements), bytes), "cudaMalloc(foo)")
          || cudaFailed(cudaMalloc(&(bar.elements), bytes), "cudaMalloc(bar)");

    /* cudaMalloc does not clear the allocation; zero it explicitly so the
       add really is 0.0 += 0.0 (an all-zero bit pattern is 0.0f). */
    if(!failed){
        failed = cudaFailed(cudaMemset(foo.elements, 0, bytes), "cudaMemset(foo)")
              || cudaFailed(cudaMemset(bar.elements, 0, bytes), "cudaMemset(bar)");
    }

    if(!failed){
        /* foo and bar are copied by value into the kernel's arguments */
        vectorAdd<<<blocks_per_grid, threads_per_block>>>(foo, bar);
        /* launch-configuration errors show up here; faults raised while the
           kernel runs show up at the synchronize */
        failed = cudaFailed(cudaGetLastError(), "vectorAdd launch")
              || cudaFailed(cudaDeviceSynchronize(), "vectorAdd execution");
    }

    /* cudaFree accepts a null pointer, so this is safe after a failed malloc */
    if(cudaFailed(cudaFree(foo.elements), "cudaFree(foo)")){
        failed = 1;
    }
    if(cudaFailed(cudaFree(bar.elements), "cudaFree(bar)")){
        failed = 1;
    }

    return failed ? EXIT_FAILURE : 0;
}