I want to write a kernel to perform computations that depends on all the unique quartets of indices (ij|kl). The code that generate all the unique quartets on the host is as follows:
#include <iostream>
using namespace std;
int main(int argc, char** argv)
{
unsigned int i,j,k,l,ltop;
unsigned int nao=7;
for(i=0;i<nao;i++)
{
for(j=0;j<=i;j++)
{
for(k=0;k<=i;k++)
{
ltop=k;
if(i==k)
{
ltop=j;
}
for(l=0;l<=ltop; l++)
{
printf("computing the ERI (%d,%d|%d,%d) \n",i,j,k,l);
}
}
}
}
int m = nao*(nao+1)/2;
int eris_size = m*(m+1)/2;
cout<<"The total size of the sack of ERIs is: "<<eris_size<<endl;
return 0;
}
output:
computing the ERI (0,0|0,0)
computing the ERI (1,0|0,0)
computing the ERI (1,0|1,0)
computing the ERI (1,1|0,0)
computing the ERI (1,1|1,0)
computing the ERI (1,1|1,1)
computing the ERI (2,0|0,0)
computing the ERI (2,0|1,0)
computing the ERI (2,0|1,1)
computing the ERI (2,0|2,0)
computing the ERI (2,1|0,0)
computing the ERI (2,1|1,0)
computing the ERI (2,1|1,1)
computing the ERI (2,1|2,0)
computing the ERI (2,1|2,1)
computing the ERI (2,2|0,0)
computing the ERI (2,2|1,0)
computing the ERI (2,2|1,1)
computing the ERI (2,2|2,0)
computing the ERI (2,2|2,1)
computing the ERI (2,2|2,2)
computing the ERI (3,0|0,0)
computing the ERI (3,0|1,0)
computing the ERI (3,0|1,1)
computing the ERI (3,0|2,0)
computing the ERI (3,0|2,1)
computing the ERI (3,0|2,2)
computing the ERI (3,0|3,0)
computing the ERI (3,1|0,0)
computing the ERI (3,1|1,0)
computing the ERI (3,1|1,1)
computing the ERI (3,1|2,0)
computing the ERI (3,1|2,1)
computing the ERI (3,1|2,2)
computing the ERI (3,1|3,0)
computing the ERI (3,1|3,1)
computing the ERI (3,2|0,0)
computing the ERI (3,2|1,0)
computing the ERI (3,2|1,1)
computing the ERI (3,2|2,0)
computing the ERI (3,2|2,1)
computing the ERI (3,2|2,2)
computing the ERI (3,2|3,0)
computing the ERI (3,2|3,1)
computing the ERI (3,2|3,2)
computing the ERI (3,3|0,0)
computing the ERI (3,3|1,0)
computing the ERI (3,3|1,1)
computing the ERI (3,3|2,0)
computing the ERI (3,3|2,1)
computing the ERI (3,3|2,2)
computing the ERI (3,3|3,0)
computing the ERI (3,3|3,1)
computing the ERI (3,3|3,2)
computing the ERI (3,3|3,3)
I want to recover the same set of quartets using a CUDA kernel but I cannot get it. The code for CUDA that I have by now is as follows
#include <iostream>
#include <stdio.h>
using namespace std;
#define ABS(x) (x<0)?-x:x
__global__
void test_kernel(int basis_size)
{
unsigned int i_idx = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int j_idx = threadIdx.y + blockIdx.y * blockDim.y;
// Building the quartets
unsigned int i_orbital, j_orbital, k_orbital, l_orbital, i__1,j__1;
unsigned int i_primitive, j_primitive, k_primitive, l_primitive;
i_orbital = (i_idx + 1)%basis_size /*+1*/;
j_orbital = (i__1 = (i_idx) / basis_size, ABS(i__1)) /*+ 1*/;
k_orbital = (j_idx+1)%basis_size /*+1*/;
l_orbital = (j__1 = (j_idx) / basis_size, ABS(j__1)) /*+ 1*/;
unsigned int ltop;
ltop=k_orbital;
if(i_orbital==k_orbital)
{
ltop=j_orbital;
}
if(i_orbital<basis_size && j_orbital<=i_orbital && k_orbital<=i_orbital && l_orbital<=ltop)
printf("computing the ERI (%d,%d|%d,%d) \n", i_orbital, j_orbital,k_orbital,l_orbital);
}
int main(int argc, char *argv[])
{
int nao = 7;
cudaDeviceReset();
/* partitioning from blocks to grids */
int dimx = 8;
int dimy = 8;
dim3 block(dimx, dimy); // we will try blocks of 8x8 threads
dim3 grid((nao+block.x-1)/block.x, (nao+block.y-1)/block.y); // The grids are shaped accordingly
/* Launch the kernel */
test_kernel<<<grid,block>>>(nao);
cudaDeviceReset();
return 0;
}
output:
computing the ERI (2,0|1,1)
computing the ERI (3,1|3,1)
computing the ERI (3,0|1,1)
computing the ERI (1,1|1,1)
computing the ERI (2,1|1,1)
computing the ERI (3,1|1,1)
computing the ERI (3,0|2,1)
computing the ERI (2,1|2,1)
computing the ERI (3,1|2,1)
computing the ERI (1,0|1,0)
computing the ERI (2,0|1,0)
computing the ERI (3,0|1,0)
computing the ERI (1,1|1,0)
computing the ERI (2,1|1,0)
computing the ERI (3,1|1,0)
computing the ERI (2,0|2,0)
computing the ERI (3,0|3,0)
computing the ERI (3,0|2,0)
computing the ERI (2,1|2,0)
computing the ERI (3,1|3,0)
computing the ERI (3,1|2,0)
computing the ERI (1,0|0,0)
computing the ERI (2,0|0,0)
computing the ERI (3,0|0,0)
computing the ERI (0,0|0,0)
computing the ERI (1,1|0,0)
computing the ERI (2,1|0,0)
computing the ERI (3,1|0,0)
The quartets will drive the computation of repulsion integrals whose input parameters are stored in an array of size nao with 3