0

I wrote this program to practice using cudaMemcpy3D() and texture memory.

Here is my question: when I print out the data fetched with tex3D, it is not the same as the initial data. The values I get are ncrss times the initial values, and there are ncrss values equal to 0 at intervals between them. If I set nsubs to 2 or a larger number, the factor becomes ncrss*nsubs and the interval becomes ncrss*nsubs.

Can you point out where I made the mistake? I think it is probably in make_cudaPitchedPtr at line 61 or in make_cudaExtent at line 56. It may also be related to the way the array is stored in memory. I would appreciate your comments and advice.

  1 #include<stdio.h>
  2 #include<stdlib.h>
  3 #include<cuda_runtime.h>
  4 #include<helper_functions.h>
  5 #include<helper_cuda.h>
  6 #ifndef MIN
  7 #define  MIN(A,B)  ((A) < (B) ?  (A) : (B))
  8 #endif
  9 #ifndef MAX
 10 #define  MAX(A,B)  ((A) > (B) ?  (A) : (B))
 11 #endif
 12 
 13 texture<float,cudaTextureType3D,cudaReadModeElementType> vel_tex;
 14 
 15 __global__ void  mckernel(int ntab) // Debug kernel: each thread fetches one texel from vel_tex and prints it. ntab is not used in the body.
 16 {
 17         const int biy=blockIdx.y;// block index along grid y; the author labels it "sub" (main sizes grid y as nsubs)
 18         const int bix=blockIdx.x;// block index along grid x; the author labels it "crs" (main sizes grid x as ncrss)
 19         const int tid=threadIdx.x; // thread index within the block (main launches n threads per block)
 20 
 21         float test;
 22         test=tex3D(vel_tex,biy,bix,tid); // tex3D takes (x,y,z): here x=biy, y=bix, z=tid. NOTE(review): x is the fastest-varying axis of the CUDA array, so this order must agree with the extent and pitched pointer used for the copy in main.
 23         printf("test=%f,bix=%d,tid=%d\n",test,bix,tid); // device-side printf of the fetched value; biy is not printed
 24 
 25 }
 26 
 27 int main() // Fills a small 3D host array, copies it into a 3D CUDA array, binds it to vel_tex and launches mckernel to print what the texture returns.
 28 {
 29         int n=10;// threads per block along x; the author's note "208" is presumably the intended real size - TODO confirm
 30         int ntab=10; // length of the innermost (fastest-varying) dimension of vel
 31         int submin=1; // "sub" range: first value
 32         int crsmin=1; // "crs" range: first value
 33         int submax=1; // "sub" range: last value
 34         int crsmax=2; // "crs" range: last value
 35         int subinc=1; // "sub" range: step
 36         int crsinc=1; // "crs" range: step
 37 
 38         int ncrss,nsubs;
 39         ncrss=(crsmax-crsmin)/crsinc + 1; // number of crs steps; (2-1)/1+1 = 2 with the values above
 40         nsubs=(submax-submin)/subinc + 1; // number of sub steps; (1-1)/1+1 = 1 with the values above
 41         dim3 BlockPerGrid(ncrss,nsubs,1); // grid: x=ncrss, y=nsubs
 42         dim3 ThreadPerBlock(n,1,1); // block: n threads along x
 43 
 44         float vel[nsubs][ncrss][ntab]; // host data in C row-major order: ntab varies fastest, nsubs slowest (runtime-sized array, a compiler extension in C++)
 45         int i,j,k;
 46         for(i=0;i<nsubs;i++)
 47                 for(j=0;j<ncrss;j++)
 48                         for(k=0;k<ntab;k++)
 49                                 vel[i][j][k]=k; // every element stores its innermost index k
 50         for(i=0;i<nsubs;i++)
 51                 for(j=0;j<ncrss;j++)
 52                         for(k=0;k<ntab;k++)
 53                                 printf("vel[%d][%d][%d]=%f\n",i,j,k,vel[i][j][k]); // host-side dump of the initial data for comparison
 54 
 55         cudaChannelFormatDesc velchannelDesc=cudaCreateChannelDesc<float>(); // single-channel float texel format
 56         cudaExtent velExtent=make_cudaExtent(nsubs,ncrss,ntab); // make_cudaExtent(width,height,depth): here width=nsubs, height=ncrss, depth=ntab. NOTE(review): width is the fastest-varying dimension, but the fastest dimension of vel is ntab, so this order looks reversed; (ntab,ncrss,nsubs) would match the layout of vel.
 57         cudaArray *d_vel;
 58         cudaMalloc3DArray(&d_vel,&velchannelDesc,velExtent); // allocate the 3D CUDA array; the return status is not checked
 59 
 60         cudaMemcpy3DParms velParms = {0}; // zero every field before setting the ones used below
 61         velParms.srcPtr=make_cudaPitchedPtr((void*)vel,sizeof(float)*nsubs,nsubs,ncrss); // make_cudaPitchedPtr(ptr,pitch,xsize,ysize): pitch is the byte length of one source row. NOTE(review): a row of vel is ntab floats, so (sizeof(float)*ntab, ntab, ncrss) would describe vel; as written each row is treated as nsubs floats long.
 62         velParms.dstArray=d_vel;
 63         velParms.extent=velExtent; // the copy uses the same extent as the allocation
 64         velParms.kind=cudaMemcpyHostToDevice;
 65         cudaMemcpy3D(&velParms); // copy vel from the host into d_vel; the return status is not checked
 66 
 67         cudaBindTextureToArray(vel_tex,d_vel); // bind the array to the texture reference vel_tex read by the kernel
 68 
 69         printf("kernel start\n");
 70         cudaDeviceSynchronize(); // NOTE(review): this synchronizes before the launch; there is no explicit synchronization or error check after it
 71         mckernel<<<BlockPerGrid,ThreadPerBlock>>>(ntab); // launch ncrss x nsubs blocks of n threads each
 72         printf("kernel end\n"); // printed once the launch has been issued; in the posted output this line appears before the kernel's own lines
 73 
 74         cudaUnbindTexture(vel_tex); // release the texture binding
 75         cudaFreeArray(d_vel); // free the 3D CUDA array
 76         cudaDeviceReset(); // reset the device before exiting
 77         return 0 ;
 78 }

Here is the printf output with nsubs=1 and ncrss=2:

  1 vel[0][0][0]=0.000000
  2 vel[0][0][1]=1.000000
  3 vel[0][0][2]=2.000000
  4 vel[0][0][3]=3.000000
  5 vel[0][0][4]=4.000000
  6 vel[0][0][5]=5.000000
  7 vel[0][0][6]=6.000000
  8 vel[0][0][7]=7.000000
  9 vel[0][0][8]=8.000000
 10 vel[0][0][9]=9.000000
 11 vel[0][1][0]=0.000000
 12 vel[0][1][1]=1.000000
 13 vel[0][1][2]=2.000000
 14 vel[0][1][3]=3.000000
 15 vel[0][1][4]=4.000000
 16 vel[0][1][5]=5.000000
 17 vel[0][1][6]=6.000000
 18 vel[0][1][7]=7.000000
 19 vel[0][1][8]=8.000000
 20 vel[0][1][9]=9.000000
 21 kernel start
 22 kernel end
 23 test=1.000000,bix=1,tid=0
 24 test=3.000000,bix=1,tid=1
 25 test=5.000000,bix=1,tid=2
 26 test=7.000000,bix=1,tid=3
 27 test=9.000000,bix=1,tid=4
 28 test=1.000000,bix=1,tid=5
 29 test=3.000000,bix=1,tid=6
 30 test=5.000000,bix=1,tid=7
 31 test=7.000000,bix=1,tid=8
 32 test=9.000000,bix=1,tid=9
 33 test=0.000000,bix=0,tid=0
 34 test=2.000000,bix=0,tid=1
 35 test=4.000000,bix=0,tid=2
 36 test=6.000000,bix=0,tid=3
 37 test=8.000000,bix=0,tid=4
 38 test=0.000000,bix=0,tid=5
 39 test=2.000000,bix=0,tid=6
 40 test=4.000000,bix=0,tid=7
 41 test=6.000000,bix=0,tid=8
 42 test=8.000000,bix=0,tid=9
Y.fes
  • 73
  • 9

1 Answers1

1

After thinking about it overnight, I found the problem.

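A CUDA array is addressed as M[fast][mid][slow] (the fastest-varying dimension comes first), while a C array is declared as M[slow][mid][fast] (the fastest-varying dimension comes last).

So the dimension order used in dim3(), make_cudaExtent(), and make_cudaPitchedPtr() has to correspond to the C layout [slow][mid][fast], or at the very least all of them have to be consistent with each other. For example, for vel[nsubs][ncrss][ntab] the matching calls are make_cudaExtent(ntab,ncrss,nsubs), make_cudaPitchedPtr((void*)vel,sizeof(float)*ntab,ntab,ncrss), and tex3D(vel_tex,tid,bix,biy).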

Y.fes
  • 73
  • 9