
I have a "string"(molecule) of connected N objects(atoms) in 3D (each atom has a coordinates). And I need to calculate a distance between each pair of atoms in a molecule (see pseudo code below ). How could it be done with CUDA? Should I pass to a kernel function 2 3D Arrays? Or 3 arrays with coordinates: X[N], Y[N], Z[N]? Thanks.

struct atom { double x, y, z; };

int main()
{
  // N: the number of atoms in the molecule
  double DistanceMatrix[N][N];
  atom Atoms[N];

  for (int i = 0; i < N; i++)
    for (int j = 0; j < N; j++)
      DistanceMatrix[i][j] =
        (Atoms[i].x - Atoms[j].x) * (Atoms[i].x - Atoms[j].x) +
        (Atoms[i].y - Atoms[j].y) * (Atoms[i].y - Atoms[j].y) +
        (Atoms[i].z - Atoms[j].z) * (Atoms[i].z - Atoms[j].z);
}
sable
  • With CUDA you want to access memory locations that are next to each other. So find a data format where all the values you want to check are side by side in memory. – benathon Jan 18 '14 at 01:01
  • How many atoms will the molecules have? – Roger Dahl Jan 18 '14 at 03:14
  • To portforwardpodcast: Thank you, I will try what Roger Dahl proposed. – sable Jan 18 '14 at 15:21
  • To Roger Dahl: The molecule will have 40-3000 atoms. But the distance calculation will be combined with an energy function calculation that depends on the distance. The energy can be very expensive to compute. – sable Jan 18 '14 at 15:27

1 Answer


Unless you're working with very large molecules, there probably won't be enough work to keep the GPU busy, so the calculation will be faster on the CPU.

If you meant to calculate the Euclidean distance, your calculation is not correct. You need the 3D version of the Pythagorean theorem.
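Concretely, the corrected assignment in the question's loop would take a square root over the sum of the squared coordinate differences (a minimal sketch, using sqrt from <cmath>):

// 3D Pythagorean theorem: the Euclidean distance is the square root
// of the sum of the squared coordinate differences.
DistanceMatrix[i][j] = sqrt(
  (Atoms[i].x - Atoms[j].x) * (Atoms[i].x - Atoms[j].x) +
  (Atoms[i].y - Atoms[j].y) * (Atoms[i].y - Atoms[j].y) +
  (Atoms[i].z - Atoms[j].z) * (Atoms[i].z - Atoms[j].z));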

I would use an SoA (structure of arrays) for storing the coordinates.
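To illustrate the difference between the two layouts (a sketch, using the array names from the complete example below):

// AoS (array of structures), as in the question: the x coordinates of
// consecutive atoms are sizeof(atom) = 24 bytes apart in memory.
struct atom { double x, y, z; };
atom atoms[N];

// SoA (structure of arrays): the x coordinates of consecutive atoms are
// adjacent doubles, which suits the access pattern described below.
double atoms_x[N];
double atoms_y[N];
double atoms_z[N];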

You want to generate a memory access pattern with as many coalesced reads and writes as possible. To do that, arrange for addresses or indexes generated by the 32 threads in each warp to be as close to each other as possible (a bit simplified).
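For instance, a minimal sketch of the difference, using the two layouts from above:

int i = threadIdx.x + blockIdx.x * blockDim.x;

// Coalesced: the 32 threads of a warp read 32 consecutive doubles, which
// the hardware can combine into a small number of memory transactions.
double x_soa = atoms_x[i];

// Poorly coalesced: with the AoS layout, the x values read by consecutive
// threads are 24 bytes apart, wasting most of each memory transaction.
double x_aos = atoms[i].x;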

threadIdx designates thread indexes within a block and blockIdx designates block indexes within the grid. blockIdx is always the same for all threads in a warp. Only threadIdx varies between the threads in a block. To visualize how the 3 dimensions of threadIdx are assigned to threads, think of them as nested loops where x is the inner loop and z is the outer loop. So, threads with adjacent x values are the most likely to be within the same warp and, if blockDim.x is divisible by 32, only threads sharing the same x / 32 value are within the same warp.
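A sketch of that linearization, which determines how threads are grouped into warps of 32:

// Linear thread ID within a block: x varies fastest (inner loop),
// z slowest (outer loop).
int tid = threadIdx.x
        + threadIdx.y * blockDim.x
        + threadIdx.z * blockDim.x * blockDim.y;

// Warps are formed from runs of 32 consecutive linear IDs.
int warp_id = tid / 32;
int lane_id = tid % 32;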

I have included a complete example for your algorithm below. In the example, the i index is derived from threadIdx.x so, to check that warps would generate coalesced reads and writes, I would go over the code while inserting a few consecutive values such as 0, 1 and 2 for i and checking that the generated indexes would also be consecutive.

Addresses generated from the j index are less important, as j is derived from threadIdx.y and so is less likely to vary within a warp (and will never vary within a warp if blockDim.x is divisible by 32).

#include  "cuda_runtime.h"
#include <iostream>

using namespace std;

const int N(20);

#define check(ans) { _check((ans), __FILE__, __LINE__); }
inline void _check(cudaError_t code, const char *file, int line)
{
  if (code != cudaSuccess) {
    fprintf(stderr,"CUDA Error: %s %s %d\n", cudaGetErrorString(code), file, line);
    exit(code);
  }
}

int div_up(int a, int b) {
  return ((a % b) != 0) ? (a / b + 1) : (a / b);
}

__global__ void calc_distances(double* distances,
  double* atoms_x, double* atoms_y, double* atoms_z);

int main(int argc, char **argv)
{
  double* atoms_x_h;
  check(cudaMallocHost(&atoms_x_h, N * sizeof(double)));

  double* atoms_y_h;
  check(cudaMallocHost(&atoms_y_h, N * sizeof(double)));

  double* atoms_z_h;
  check(cudaMallocHost(&atoms_z_h, N * sizeof(double)));

  for (int i(0); i < N; ++i) {
    atoms_x_h[i] = i;
    atoms_y_h[i] = i;
    atoms_z_h[i] = i;
  }

  double* atoms_x_d;
  check(cudaMalloc(&atoms_x_d, N * sizeof(double)));

  double* atoms_y_d;
  check(cudaMalloc(&atoms_y_d, N * sizeof(double)));

  double* atoms_z_d;
  check(cudaMalloc(&atoms_z_d, N * sizeof(double)));

  check(cudaMemcpy(atoms_x_d, atoms_x_h, N * sizeof(double), cudaMemcpyHostToDevice));
  check(cudaMemcpy(atoms_y_d, atoms_y_h, N * sizeof(double), cudaMemcpyHostToDevice));
  check(cudaMemcpy(atoms_z_d, atoms_z_h, N * sizeof(double), cudaMemcpyHostToDevice));

  double* distances_d;
  check(cudaMalloc(&distances_d, N * N * sizeof(double)));

  // The kernel indexes threads in two dimensions (i from x, j from y), so a
  // 2D launch configuration is needed; with a 1D launch, blockDim.y == 1 and
  // j would always be 0, leaving only the (i,0) elements filled.
  const dim3 threads_per_block(32, 8);
  dim3 n_blocks(div_up(N, threads_per_block.x), div_up(N, threads_per_block.y));

  calc_distances<<<n_blocks, threads_per_block>>>(distances_d, atoms_x_d, atoms_y_d, atoms_z_d);

  check(cudaPeekAtLastError());
  check(cudaDeviceSynchronize());

  double* distances_h;
  check(cudaMallocHost(&distances_h, N * N * sizeof(double)));

  check(cudaMemcpy(distances_h, distances_d, N * N * sizeof(double), cudaMemcpyDeviceToHost));

  for (int i(0); i < N; ++i) {
    for (int j(0); j < N; ++j) {
      cout << "(" << i << "," << j << "): " << distances_h[i + N * j] << endl;
    }
  }

  check(cudaFree(distances_d));
  check(cudaFreeHost(distances_h));
  check(cudaFree(atoms_x_d));
  check(cudaFreeHost(atoms_x_h));
  check(cudaFree(atoms_y_d));
  check(cudaFreeHost(atoms_y_h));
  check(cudaFree(atoms_z_d));
  check(cudaFreeHost(atoms_z_h));

  return 0;
}

__global__ void calc_distances(double* distances,
  double* atoms_x, double* atoms_y, double* atoms_z)
{
  int i(threadIdx.x + blockIdx.x * blockDim.x);
  int j(threadIdx.y + blockIdx.y * blockDim.y);

  if (i >= N || j >= N) {
    return;
  }

  double dx(atoms_x[i] - atoms_x[j]);
  double dy(atoms_y[i] - atoms_y[j]);
  double dz(atoms_z[i] - atoms_z[j]);

  // sqrt gives the actual Euclidean distance (3D Pythagorean theorem);
  // drop it if the squared distance is sufficient.
  distances[i + N * j] = sqrt(dx * dx + dy * dy + dz * dz);
}
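If you want to try the example, it should build with nvcc along these lines (the file and output names are arbitrary; double precision requires a GPU of compute capability 1.3 or higher):

nvcc -arch=sm_20 calc_distances.cu -o calc_distances
./calc_distances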
Roger Dahl
  • I am a beginner at parallel programming. I will run your code first and probably will have more questions. About SoA: the molecule is represented by a very complicated C++ object. I will need to copy the coordinates of the atoms from that object to an array. Thanks a lot for your help. – sable Jan 18 '14 at 15:34
  • To Roger Dahl: I ran the code, and only the (i,0) elements of the output are filled correctly; the other elements are equal to 0. But probably I have a bug in my code. – sable Jan 23 '14 at 22:44
  • I still cannot figure out why only (i,0) are filled. Just in case, post my qsub job code: – sable Jan 24 '14 at 20:32
  • #!/bin/bash
    #$ -V
    #$ -cwd
    #$ -j y                     # combine the stdout and stderr streams
    #$ -N toy2                  # specify the executable
    #$ -l h_rt=1:30:00          # run time not to exceed one hour and 30 minutes
    #$ -o $JOB_NAME.out$JOB_ID  # specify stdout & stderr output
    #$ -q gpu                   # specify the GPU queue
    #$ -pe 2way 24              # request 2 nodes (the 24)
    module load cuda
    set -x
    ibrun $HOME/GPU_TOY2/toy2
    – sable Jan 24 '14 at 20:33