I have a Python code (for implementing RayTracing) that I'm running in parallel with PyCuda.
import pycuda.driver as drv
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy as np
from stl import mesh
import time
my_mesh = mesh.Mesh.from_file('test_solid_py.stl')
n = my_mesh.normals
v0 = my_mesh.v0
v1 = my_mesh.v1
v2 = my_mesh.v2
v0_x = v0[:,0]
v0_x = np.ascontiguousarray(v0_x)
v0_y = v0[:,1]
v0_y = np.ascontiguousarray(v0_y)
v0_z = v0[:,2]
v0_z = np.ascontiguousarray(v0_z)
v1_x = v1[:,0]
v1_x = np.ascontiguousarray(v1_x)
v1_y = v1[:,1]
v1_y = np.ascontiguousarray(v1_y)
v1_z = v1[:,2]
v1_z = np.ascontiguousarray(v1_z)
v2_x = v2[:,0]
v2_x = np.ascontiguousarray(v2_x)
v2_y = v2[:,1]
v2_y = np.ascontiguousarray(v2_y)
v2_z = v2[:,2]
v2_z = np.ascontiguousarray(v2_z)
mod = SourceModule("""
#include <math.h>
__global__ void intersect(float *origin,float *dir_x,float *dir_y,float *dir_z,float *v0_x,float *v0_y,float *v0_z,float *v1_x,float *v1_y,float *v1_z,float *v2_x,float *v2_y,float *v2_z,float *int_point_real_x, float *int_point_real_y,float *int_point_real_z)
{
using namespace std;
unsigned int idx = blockDim.x*blockIdx.x + threadIdx.x;
int count = 0;
float v0_current[3];
float v1_current[3];
float v2_current[3];
float dir_current[3] = {dir_x[idx],dir_y[idx],dir_z[idx]};
float int_point[3];
float int_pointS[2][3];
int int_faces[2];
float dist[2];
dist[0] = -999;
int n_tri = 105500;
for(int i = 0; i<n_tri; i++) {
v0_current[0] = v0_x[i];
v0_current[1] = v0_y[i];
v0_current[2] = v0_z[i];
v1_current[0] = v1_x[i];
v1_current[1] = v1_y[i];
v1_current[2] = v1_z[i];
v2_current[0] = v2_x[i];
v2_current[1] = v2_y[i];
v2_current[2] = v2_z[i];
double eps = 0.0000001;
float E1[3];
float E2[3];
float s[3];
for (int j = 0; j < 3; j++) {
E1[j] = v1_current[j] - v0_current[j];
E2[j] = v2_current[j] - v0_current[j];
s[j] = origin[j] - v0_current[j];
}
float h[3];
h[0] = dir_current[1] * E2[2] - dir_current[2] * E2[1];
h[1] = -(dir_current[0] * E2[2] - dir_current[2] * E2[0]);
h[2] = dir_current[0] * E2[1] - dir_current[1] * E2[0];
float a;
a = E1[0] * h[0] + E1[1] * h[1] + E1[2] * h[2];
if (a > -eps && a < eps) {
int_point[0] = false;
}
else {
double f = 1 / a;
float u;
u = f * (s[0] * h[0] + s[1] * h[1] + s[2] * h[2]);
if (u < 0 || u > 1) {
int_point[0] = false;
}
else {
float q[3];
q[0] = s[1] * E1[2] - s[2] * E1[1];
q[1] = -(s[0] * E1[2] - s[2] * E1[0]);
q[2] = s[0] * E1[1] - s[1] * E1[0];
float v;
v = f * (dir_current[0] * q[0] + dir_current[1] * q[1] + dir_current[2] * q[2]);
if (v < 0 || (u + v)>1) {
int_point[0] = false;
}
else {
float t;
t = f * (E2[0] * q[0] + E2[1] * q[1] + E2[2] * q[2]);
if (t > eps) {
for (int j = 0; j < 3; j++) {
int_point[j] = origin[j] + dir_current[j] * t;
}
//return t;
}
}
}
}
if (int_point[0] != false) {
count = count+1;
int_faces[count-1] = i;
dist[count-1] = sqrt(pow((origin[0] - int_point[0]), 2) + pow((origin[1] - int_point[1]), 2) + pow((origin[2] - int_point[2]), 2));
for (int j = 0; j<3; j++) {
int_pointS[count-1][j] = int_point[j];
}
}
}
double min = dist[0];
int ind_min = 0;
for (int i = 0; i < 2; i++){
if (min > dist[i]) {
min = dist[i];
ind_min = i;
}
}
if (dist[0] < -998){
int_point_real_x[idx] = -999;
int_point_real_y[idx] = -999;
int_point_real_z[idx] = -999;
}
else{
int_point_real_x[idx] = int_pointS[ind_min][0];
int_point_real_y[idx] = int_pointS[ind_min][1];
int_point_real_z[idx] = int_pointS[ind_min][2];
}
}
""")
n_rays = 20000
num_threads = 1024
num_blocks = int(n_rays/num_threads)
origin = np.asarray([-2, -2, -2]).astype(np.float32)
origin = np.ascontiguousarray(origin)
rand_x = np.random.randn(n_rays)
rand_y = np.random.randn(n_rays)
rand_z = np.random.randn(n_rays)
direction_x = np.ones((n_rays, 1)) * 3
direction_x = direction_x.astype(np.float32)
direction_x = np.ascontiguousarray(direction_x)
direction_y = np.ones((n_rays, 1)) * 4
direction_y = direction_y.astype(np.float32)
direction_y = np.ascontiguousarray(direction_y)
direction_z = np.ones((n_rays, 1)) * 5
direction_z = direction_z.astype(np.float32)
direction_z = np.ascontiguousarray(direction_z)
int_point_real_x = np.zeros((n_rays, 1)).astype(np.float32)
int_point_real_x = np.ascontiguousarray(int_point_real_x)
int_point_real_y = np.zeros((n_rays, 1)).astype(np.float32)
int_point_real_y = np.ascontiguousarray(int_point_real_y)
int_point_real_z = np.zeros((n_rays, 1)).astype(np.float32)
int_point_real_z = np.ascontiguousarray(int_point_real_z)
intersect = mod.get_function("intersect")
start = time.time()
intersect(drv.In(origin), drv.In(direction_x),drv.In(direction_y),drv.In(direction_z),drv.In(v0_x),drv.In(v0_y),drv.In(v0_z), drv.In(v1_x),drv.In(v1_y),drv.In(v1_z), drv.In(v2_x), drv.In(v2_y), drv.In(v2_z), drv.Out(int_point_real_x),drv.Out(int_point_real_y),drv.Out(int_point_real_z), block=(num_threads, 1, 1), grid=((num_blocks+0), 1, 1))
finish = time.time()
print(finish-start)
I give as input some arrays whose size is 20k (dir_x
, dir_y
, dir_z
) and I have as output 3 arrays (int_point_real_x
,int_point_real_y
,int_point_real_z
) that have the same size as the above mentioned arrays (20k).
If n_rays
is a multiple of num_threads
, e.g. n_rays=19456
and num_threads=1024
, then int_point_real_x_y_z
are correctly filled by the kernel.
Otherwise, if n_rays
is NOT a multiple of num_threads
, e.g. n_rays=20000
(what I really need) and num_threads=1024
, then int_point_real_x_y_z
are filled by the kernel up to position 19455 and the 544 spots left in the array are not filled.
Does anyone know if this is a rule of CUDA?
If it's not, how could I modify my code in order to use an arbitrary size of input array (and not only multiple of num_threads
)?
Thanks