I have the following problem (keep in mind that I am fairly new to programming with CUDA).
I have a class called vec3f
that is just like the float3
data type but with overloaded operators, and other vector functions. These functions are prefixed with __device__ __host__
. Then, in my kernel I do a nested for loop over block_x
and block_y
indices and do something like,
//set up shared memory block
extern __shared__ vec3f share[];
vec3f *sh_pos = share;
vec3f *sh_velocity = &sh_pos[blockDim.x*blockDim.y];
sh_pos[blockDim.x * threadIdx.x + threadIdx.y] = oldParticles[index].position();
sh_velocity[blockDim.x * threadIdx.x + threadIdx.y] = oldParticles[index].velocity();
__syncthreads();
In the above code, oldParticles
is a pointer to a class called particles
that is being passed to the kernel. OldParticles
is actually an underlying pointer of a thrust::device_vector<particle>
(I'm not sure if this has something to do with it). Everything compiles okay but when I run I get the error
libc++abi.dylib: terminate called throwing an exception
Abort trap: 6
Thanks for the replies. I think the error had to do with me not allocating room for the arguments being passed to my kernel. Doing the following in my host code fixed this error,
particle* particle_ptrs[2];
particle_ptrs[0] = thrust::raw_pointer_cast(&d_old_particles[0]);
particle_ptrs[1] = thrust::raw_pointer_cast(&d_new_particles[0]);
CUDA_SAFE_CALL( cudaMalloc( (void**)&particle_ptrs[0], max_particles * sizeof(particle) ) );
CUDA_SAFE_CALL( cudaMalloc( (void**)&particle_ptrs[1], max_particles * sizeof(particle) ) );
The kernel call is then,
force_kernel<<< grid,block,sharedMemSize >>>(particle_ptrs[0],particle_ptrs[1],time_step);
The issue that I am having now seems to be that I can't get data copied back to the host from the device. I think this has to do with me not being familiar with Thrust.
I'm doing a series of copies as follows,
//make a host vector assume this is initialized
thrust::host_vector<particle> h_particles;
thrust::device_vector<particle> d_old_particles, d_new_particles;
d_old_particles = h_particles;
//launch kernel as shown above
//with thrust vectors having been casted into their underlying pointers
//particle_ptrs[1] gets modified and so shouldnt d_new_particles?
//copy back
h_particles = d_new_particles;
So I guess my question is, can I modify a Thrust device_vector
in a kernel (in this case particle_pters[0]
) save the modification to another Thrust device_vector
in the kernel (in this case particle_pters[1]
) and then once I exit from the kernel, copy that to a host_vector
?
I still can't get this to work. I made a shorter example where I am having the same problem,
#include <iostream>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include "vec3f.h"
const int BLOCK_SIZE = 8;
const int max_particles = 64;
const float dt = 0.01;
using namespace std;
//particle class
class particle {
public:
particle() :
_velocity(vec3f(0,0,0)), _position(vec3f(0,0,0)), _density(0.0) {
};
particle(const vec3f& pos, const vec3f& vel) :
_position(pos), _velocity(vel), _density(0.0) {
};
vec3f _velocity;
vec3f _position;
float _density;
};
//forward declaration of kernel func
__global__ void kernel_func(particle* old_parts, particle* new_parts, float dt);
//global thrust vectors
thrust::host_vector<particle> h_parts;
thrust::device_vector<particle> old_parts, new_parts;
particle* particle_ptrs[2];
int main() {
//load host vector
for (int i =0; i<max_particles; i++) {
h_parts.push_back(particle(vec3f(0.5,0.5,0.5),vec3f(10,10,10)));
}
particle_ptrs[0] = thrust::raw_pointer_cast(&old_parts[0]);
particle_ptrs[1] = thrust::raw_pointer_cast(&new_parts[0]);
cudaMalloc( (void**)&particle_ptrs[0], max_particles * sizeof(particle) );
cudaMalloc( (void**)&particle_ptrs[1], max_particles * sizeof(particle) );
//copy host particles to old device particles...
old_parts = h_parts;
//kernel block and grid dimensions
dim3 block(BLOCK_SIZE,BLOCK_SIZE,1);
dim3 grid(int(sqrt(float(max_particles) / (float(block.x*block.y)))), int(sqrt(float(max_particles) / (float(block.x*block.y)))), 1);
kernel_func<<<block,grid>>>(particle_ptrs[0],particle_ptrs[1],dt);
//copy new device particles back to host particles
h_parts = new_parts;
for (int i =0; i<max_particles; i++) {
particle temp1 = h_parts[i];
cout << temp1._position << endl;
}
//delete thrust device vectors
old_parts.clear();
old_parts.shrink_to_fit();
new_parts.clear();
new_parts.shrink_to_fit();
return 0;
}
//kernel function
__global__ void kernel_func(particle* old_parts, particle* new_parts, float dt) {
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
//get array position for 2d grid...
unsigned int arr_pos = y*blockDim.x*gridDim.x + x;
new_parts[arr_pos]._velocity = old_parts[arr_pos]._velocity * 10.0 * dt;
new_parts[arr_pos]._position = old_parts[arr_pos]._position * 10.0 * dt;
new_parts[arr_pos]._density = old_parts[arr_pos]._density * 10.0 * dt;
}
So the host_vector
has an initial position of (0.5,0.5,0.5)
for all 64 particles. Then the kernel attempts to multiply that by 10 to give (5,5,5)
as the position for all particles. But I don't see this when I cout
the data. It is still just (0.5,0.5,0.5)
. Is there a problem with how I am allocating memory? Is there a problem with the lines:
//copy new device particles back to host particles
h_parts = new_parts;
What could be the issue? Thank you.