Cuda, Distance calculation btw 3d objects, again

Question

I am asking the same question again. https://stackoverflow.com/a/21200781/3208577 Roger Dahl answered the question, but after running the code I got wrong output. I have no idea why is that, from my understanding everything is correct. In main () function output for a distance matrix shows that only (i,0) elements are filled correctly, others are 0. Could anyone explain why is that? Let me repost Rogers' code :

    #include  "cuda_runtime.h"
    #include <iostream>

    using namespace std;

    const int N(20);

    #define check(ans) { _check((ans), __FILE__, __LINE__); }
    inline void _check(cudaError_t code, char *file, int line)
    {
      if (code != cudaSuccess) {
        fprintf(stderr,"CUDA Error: %s %s %d\n", cudaGetErrorString(code), file, line);
        exit(code);
      }
    }

    int div_up(int a, int b) {
      return ((a % b) != 0) ? (a / b + 1) : (a / b);
    }

    __global__ void calc_distances(double* distances,
      double* atoms_x, double* atoms_y, double* atoms_z);

    int main(int argc, char **argv)
    {
      double* atoms_x_h;
      check(cudaMallocHost(&atoms_x_h, N * sizeof(double)));

      double* atoms_y_h;
      check(cudaMallocHost(&atoms_y_h, N * sizeof(double)));

      double* atoms_z_h;
      check(cudaMallocHost(&atoms_z_h, N * sizeof(double)));

      for (int i(0); i < N; ++i) {
        atoms_x_h[i] = i;
        atoms_y_h[i] = i;
        atoms_z_h[i] = i;
      }

      double* atoms_x_d;
      check(cudaMalloc(&atoms_x_d, N * sizeof(double)));

      double* atoms_y_d;
      check(cudaMalloc(&atoms_y_d, N * sizeof(double)));

      double* atoms_z_d;
      check(cudaMalloc(&atoms_z_d, N * sizeof(double)));

      check(cudaMemcpy(atoms_x_d, atoms_x_h, N * sizeof(double), cudaMemcpyHostToDevice));
      check(cudaMemcpy(atoms_y_d, atoms_y_h, N * sizeof(double), cudaMemcpyHostToDevice));
      check(cudaMemcpy(atoms_z_d, atoms_z_h, N * sizeof(double), cudaMemcpyHostToDevice));

      double* distances_d;
      check(cudaMalloc(&distances_d, N * N * sizeof(double)));

      const int threads_per_block(256);
      dim3 n_blocks(div_up(N, threads_per_block));

      calc_distances<<<n_blocks, threads_per_block>>>(distances_d, atoms_x_d, atoms_y_d, atoms_z_d);

      check(cudaPeekAtLastError());
      check(cudaDeviceSynchronize());

      double* distances_h;
      check(cudaMallocHost(&distances_h, N * N * sizeof(double)));

      check(cudaMemcpy(distances_h, distances_d, N * N * sizeof(double), cudaMemcpyDeviceToHost));
//wrong output here
      for (int i(0); i < N; ++i) {
        for (int j(0); j < N; ++j) {
          cout << "(" << i << "," << j << "): " << distances_h[i + N * j] << endl;
        }
      }

      check(cudaFree(distances_d));
      check(cudaFreeHost(distances_h));
      check(cudaFree(atoms_x_d));
      check(cudaFreeHost(atoms_x_h));
      check(cudaFree(atoms_y_d));
      check(cudaFreeHost(atoms_y_h));
      check(cudaFree(atoms_z_d));
      check(cudaFreeHost(atoms_z_h));

      return 0;
    }

    __global__ void calc_distances(double* distances,
      double* atoms_x, double* atoms_y, double* atoms_z)
    {
      int i(threadIdx.x + blockIdx.x * blockDim.x);
      int j(threadIdx.y + blockIdx.y * blockDim.y);

      if (i >= N || j >= N) {
        return;
      }

      distances[i + N * j] =
        (atoms_x[i] - atoms_x[j]) * (atoms_x[i] - atoms_x[j]) +
        (atoms_y[i] - atoms_y[j]) * (atoms_y[i] - atoms_y[j]) +
        (atoms_z[i] - atoms_z[j]) * (atoms_z[i] - atoms_z[j]);
    }

ngimel · Accepted Answer · 2014-01-25T01:10:53.943

1

For your kernel as written, your launch parameters are wrong. They should be

  const int threads_per_dir(16);
  dim3 threads_per_block(threads_per_dir,threads_per_dir);
  dim3 n_blocks(div_up(N, threads_per_block.x), div_up(N,threads_per_block.y));

When you are launching 1D blocks on a 1D grid, as you do in your code, your threadIdx.y and blockIdx.y are always 0, and so j is always 0.

edited Jan 25 '14 at 01:10

answered Jan 25 '14 at 01:04

ngimel

354
1
7

Cuda, Distance calculation btw 3d objects, again

1 Answers1