I recently have been playing around with CUDA, and was hoping to try out the unified memory model. I tried playing with sample code, and strangely, when launching the kernel, no values seemed to be updating. Modifying unified data from the host works fine, yet kernels launched simply won't modify the unified data.
My card is a GTX 770 with 4GB of memory. I'm running Arch Linux, kernel 3.14-2, using GCC 4.8 to compile my samples. I'm setting the compute arch to sm_30, and activative -m64 flag
Here's one sample that I was playing with. X[0] and X[1] always evaluate to 0, even when the kernel launches.
#include<stdio.h>
#include <cuda.h>
__global__ void kernel(int* x){
x[threadIdx.x] = 2;
}
int main(){
int* x;
cudaMallocManaged(&x, sizeof(int) * 2);
cudaError_t error = cudaGetLastError();
printf("%s\n", error);
x[0] = 0;
x[1] = 0;
kernel<<<1, 2>>>(x);
cudaDeviceSynchronize();
printf("result = %d\n", x[1]);
cudaFree(x);
return 0;
}
Another sample is this:
__global__ void adjacency_map_init_gpu(adjacency_map_t* map){
int row = threadIdx.y + blockIdx.y * blockDim.y;
int col = threadIdx.x + blockIdx.x * blockDim.x;
int i = row * map->width + col;
max(i, 0);
min(i, map->width * map->height);
map->connections[i] = 0;
}
__global__ void adjacency_map_connect_gpu(edge_t* edges, int num_edges, adjacency_map_t* map){
int i = threadIdx.x + (((gridDim.x * blockIdx.y) + blockIdx.x)*blockDim.x);
max(i, 0);
min(i, num_edges);
int n_start = edges[i].n_start;
int n_end = edges[i].n_end;
int map_index = n_start * map->width + n_end;
map->connections[map_index] = 1;
printf("%d new value: %d\n", map_index, map->connections[map_index]);
}
adjacency_map_t* adjacency_map_init(int num_nodes, edge_t* edges, int num_edges){
adjacency_map_t *map;// = (adjacency_map_t*)malloc(sizeof(adjacency_map_t));
cudaMallocManaged(&map, sizeof(adjacency_map_t));
cudaMallocManaged(&(map->connections), num_nodes * num_nodes * sizeof(int));
//map->connections = (int*)malloc(sizeof(int) * num_nodes * num_nodes);
map->width = num_nodes;
map->height = num_nodes;
map->stride = 0;
//GPU stuff
// adjacency_map_t *d_map;
// int* d_connections;
// cudaMalloc((void**) &d_map, sizeof(adjacency_map_t));
// cudaMalloc((void**) &d_connections, num_nodes * num_nodes * sizeof(int));
// cudaMemcpy(d_map, map, sizeof(adjacency_map_t), cudaMemcpyHostToDevice);
// cudaMemcpy(d_connections, map->connections, num_nodes * num_nodes, cudaMemcpyHostToDevice);
//cudaMemcpy(&(d_map->connections), &d_connections, sizeof(int*), cudaMemcpyHostToDevice);
// edge_t* d_edges;
// cudaMalloc((void**) &d_edges, num_edges * sizeof(edge_t));
// cudaMemcpy(d_edges, edges, num_edges * sizeof(edge_t), cudaMemcpyHostToDevice);
adjacency_map_init_gpu<<<1, 3>>>(map);
cudaDeviceSynchronize();
//adjacency_map_connect_gpu<<<1, 3>>>(edges, num_edges, map);
cudaDeviceSynchronize();
// cudaMemcpy(map, d_map, sizeof(adjacency_map_t), cudaMemcpyDeviceToHost);
//Synchronize everything
// cudaFree(map);
// cudaFree(edges);
return map;
}
Basically, I can access all the elements in the original structure on the host for the second snippet of code. Once I try to launch a kernel function, however, the pointer becomes inaccessible (at least, tested from gdb), and the entire object's data is inaccessible. The only portion of the edges and the map pointer I can still see after the first kernel launch are their respective locations.
Any help would be greatly appreciated! Thanks so much!