CUDA: does size of input/output data have to be a multiple of the number of threads per block?

Question

I have a Python code (for implementing RayTracing) that I'm running in parallel with PyCuda.

import pycuda.driver as drv
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy as np
from stl import mesh
import time
my_mesh = mesh.Mesh.from_file('test_solid_py.stl')
n = my_mesh.normals
v0 = my_mesh.v0
v1 = my_mesh.v1
v2 = my_mesh.v2
v0_x = v0[:,0]
v0_x = np.ascontiguousarray(v0_x)
v0_y = v0[:,1]
v0_y = np.ascontiguousarray(v0_y)
v0_z = v0[:,2]
v0_z = np.ascontiguousarray(v0_z)
v1_x = v1[:,0]
v1_x = np.ascontiguousarray(v1_x)
v1_y = v1[:,1]
v1_y = np.ascontiguousarray(v1_y)
v1_z = v1[:,2]
v1_z = np.ascontiguousarray(v1_z)
v2_x = v2[:,0]
v2_x = np.ascontiguousarray(v2_x)
v2_y = v2[:,1]
v2_y = np.ascontiguousarray(v2_y)
v2_z = v2[:,2]
v2_z = np.ascontiguousarray(v2_z)

mod = SourceModule("""
    #include <math.h>
  __global__ void intersect(float *origin,float *dir_x,float *dir_y,float *dir_z,float *v0_x,float *v0_y,float *v0_z,float *v1_x,float *v1_y,float *v1_z,float *v2_x,float *v2_y,float *v2_z,float *int_point_real_x, float *int_point_real_y,float *int_point_real_z)
  {
    using namespace std;
    unsigned int idx = blockDim.x*blockIdx.x + threadIdx.x;
    int count = 0;

    float v0_current[3];
    float v1_current[3];
    float v2_current[3];
    float dir_current[3] = {dir_x[idx],dir_y[idx],dir_z[idx]}; 
    float int_point[3];
    float int_pointS[2][3];
    int int_faces[2];
    float dist[2];
    dist[0] = -999;
    int n_tri = 105500;

    for(int i = 0; i<n_tri; i++) {
        v0_current[0] = v0_x[i];
        v0_current[1] = v0_y[i];
        v0_current[2] = v0_z[i];
        v1_current[0] = v1_x[i];
        v1_current[1] = v1_y[i];
        v1_current[2] = v1_z[i];
        v2_current[0] = v2_x[i];
        v2_current[1] = v2_y[i];
        v2_current[2] = v2_z[i];
        double eps = 0.0000001;
        float E1[3];
        float E2[3];
        float s[3];
        for (int j = 0; j < 3; j++) {
            E1[j] = v1_current[j] - v0_current[j];
            E2[j] = v2_current[j] - v0_current[j];
            s[j] = origin[j] - v0_current[j];
        }
        float h[3];
        h[0] = dir_current[1] * E2[2] - dir_current[2] * E2[1];
        h[1] = -(dir_current[0] * E2[2] - dir_current[2] * E2[0]);
        h[2] = dir_current[0] * E2[1] - dir_current[1] * E2[0];
        float a;
        a = E1[0] * h[0] + E1[1] * h[1] + E1[2] * h[2];
        if (a > -eps && a < eps) {
            int_point[0] = false;
        }
        else {
            double f = 1 / a;
            float u;
            u = f * (s[0] * h[0] + s[1] * h[1] + s[2] * h[2]);
            if (u < 0 || u > 1) {
                int_point[0] = false;
            }
            else {
                float q[3];
                q[0] = s[1] * E1[2] - s[2] * E1[1];
                q[1] = -(s[0] * E1[2] - s[2] * E1[0]);
                q[2] = s[0] * E1[1] - s[1] * E1[0];
                float v;
                v = f * (dir_current[0] * q[0] + dir_current[1] * q[1] + dir_current[2] * q[2]);
                if (v < 0 || (u + v)>1) {
                    int_point[0] = false;
                }
                else {
                    float t;
                    t = f * (E2[0] * q[0] + E2[1] * q[1] + E2[2] * q[2]);
                    if (t > eps) {
                        for (int j = 0; j < 3; j++) {
                            int_point[j] = origin[j] + dir_current[j] * t;
                        }
                        //return t;
                    }
                }
            }
        }
        if (int_point[0] != false) {
            count = count+1;
            int_faces[count-1] = i;
            dist[count-1] = sqrt(pow((origin[0] - int_point[0]), 2) + pow((origin[1] - int_point[1]), 2) + pow((origin[2] - int_point[2]), 2));
            for (int j = 0; j<3; j++) {
                int_pointS[count-1][j] = int_point[j];
            }
        }
    }
    double min = dist[0];
    int ind_min = 0;
    for (int i = 0; i < 2; i++){
        if (min > dist[i]) {
            min = dist[i];
            ind_min = i;
        }
    }
    if (dist[0] < -998){
        int_point_real_x[idx] = -999;
        int_point_real_y[idx] = -999;
        int_point_real_z[idx] = -999;
    }
    else{
        int_point_real_x[idx] = int_pointS[ind_min][0];
        int_point_real_y[idx] = int_pointS[ind_min][1];
        int_point_real_z[idx] = int_pointS[ind_min][2];
    }

}
  """)
n_rays = 20000
num_threads = 1024
num_blocks = int(n_rays/num_threads)
origin = np.asarray([-2, -2, -2]).astype(np.float32)
origin = np.ascontiguousarray(origin)
rand_x = np.random.randn(n_rays)
rand_y = np.random.randn(n_rays)
rand_z = np.random.randn(n_rays)
direction_x = np.ones((n_rays, 1)) * 3
direction_x = direction_x.astype(np.float32)
direction_x = np.ascontiguousarray(direction_x)
direction_y = np.ones((n_rays, 1)) * 4
direction_y = direction_y.astype(np.float32)
direction_y = np.ascontiguousarray(direction_y)
direction_z = np.ones((n_rays, 1)) * 5
direction_z = direction_z.astype(np.float32)
direction_z = np.ascontiguousarray(direction_z)
int_point_real_x = np.zeros((n_rays, 1)).astype(np.float32)
int_point_real_x = np.ascontiguousarray(int_point_real_x)
int_point_real_y = np.zeros((n_rays, 1)).astype(np.float32)
int_point_real_y = np.ascontiguousarray(int_point_real_y)
int_point_real_z = np.zeros((n_rays, 1)).astype(np.float32)
int_point_real_z = np.ascontiguousarray(int_point_real_z)

intersect = mod.get_function("intersect")
start = time.time()
intersect(drv.In(origin), drv.In(direction_x),drv.In(direction_y),drv.In(direction_z),drv.In(v0_x),drv.In(v0_y),drv.In(v0_z), drv.In(v1_x),drv.In(v1_y),drv.In(v1_z), drv.In(v2_x), drv.In(v2_y), drv.In(v2_z), drv.Out(int_point_real_x),drv.Out(int_point_real_y),drv.Out(int_point_real_z), block=(num_threads, 1, 1), grid=((num_blocks+0), 1, 1))
finish = time.time()
print(finish-start)

I give as input some arrays whose size is 20k (dir_x, dir_y, dir_z) and I have as output 3 arrays (int_point_real_x,int_point_real_y,int_point_real_z) that have the same size as the above mentioned arrays (20k).

If n_rays is a multiple of num_threads, e.g. n_rays=19456 and num_threads=1024, then int_point_real_x_y_z are correctly filled by the kernel.

Otherwise, if n_rays is NOT a multiple of num_threads, e.g. n_rays=20000 (what I really need) and num_threads=1024, then int_point_real_x_y_z are filled by the kernel up to position 19455 and the 544 spots left in the array are not filled.

Does anyone know if this is a rule of CUDA? If it's not, how could I modify my code in order to use an arbitrary size of input array (and not only multiple of num_threads)?

Thanks

Fully one third of the kernel code you have posted is commented out and does nothing. I am not sure what your interpretation on an [MCVE] is, but mine wouldn't include a kernel with 30+ lines of irrelevant commented out code in it — talonmies, Dec 14 '18 at 11:50
I edited the question and removed all the commented lines. If you need the 'test_solid_py.stl' file I can provide it in some way — Davide Bassano, Dec 14 '18 at 14:08

score 1 · Answer 1 · answered Dec 14 '18 at 12:18

1

your int(n_rays/num_threads) is rounding down

to fix this, you need to round up and then put a condition into the kernel to enforce that idx is valid and "do nothing" if it's not. this will cause some cores to waste time, but your code looks pretty suboptimal anyway so it probably won't matter much

answered Dec 14 '18 at 12:18

Sam Mason

15,216
1
41
60

Thanks for your reply. What do you mean with "enforce that idx is valid"? – Davide Bassano Dec 14 '18 at 14:09
1

here's an example, read [this](https://stackoverflow.com/questions/26217294/should-i-check-the-number-of-threads-in-kernel-code/26217725#26217725) – Robert Crovella Dec 14 '18 at 14:47

CUDA: does size of input/output data have to be a multiple of the number of threads per block?

1 Answers1