I make this program to practice cudaMemcpy3D() and Texture Memory.
Here comes the questions,when I print out tex3D
data,it is not same as initial data.The value I get is ncrss
times the initial value, and there are ncrss
interval numbers which equal to 0 between each other. If I set nsubs
to 2 or other bigger one, the time should be ncrss*nsubs
and interval will be ncrss*nsubs
.
Can you piont out where I made the mistakes. I think it probably is make_cudaPitchedPtr
at line 61, or make_cudaExtent
at line 56. And also may related with the way of array storaged.
So I come here for your help,appreciate for your comments and advices.
1 #include<stdio.h>
2 #include<stdlib.h>
3 #include<cuda_runtime.h>
4 #include<helper_functions.h>
5 #include<helper_cuda.h>
6 #ifndef MIN
7 #define MIN(A,B) ((A) < (B) ? (A) : (B))
8 #endif
9 #ifndef MAX
10 #define MAX(A,B) ((A) > (B) ? (A) : (B))
11 #endif
12
13 texture<float,cudaTextureType3D,cudaReadModeElementType> vel_tex;
14
15 __global__ void mckernel(int ntab)
16 {
17 const int biy=blockIdx.y;//sub
18 const int bix=blockIdx.x;//crs
19 const int tid=threadIdx.x;
20
21 float test;
22 test=tex3D(vel_tex,biy,bix,tid);
23 printf("test=%f,bix=%d,tid=%d\n",test,bix,tid);
24
25 }
26
27 int main()
28 {
29 int n=10;//208
30 int ntab=10;
31 int submin=1;
32 int crsmin=1;
33 int submax=1;
34 int crsmax=2;
35 int subinc=1;
36 int crsinc=1;
37
38 int ncrss,nsubs;
39 ncrss=(crsmax-crsmin)/crsinc + 1;
40 nsubs=(submax-submin)/subinc + 1;
41 dim3 BlockPerGrid(ncrss,nsubs,1);
42 dim3 ThreadPerBlock(n,1,1);
43
44 float vel[nsubs][ncrss][ntab];
45 int i,j,k;
46 for(i=0;i<nsubs;i++)
47 for(j=0;j<ncrss;j++)
48 for(k=0;k<ntab;k++)
49 vel[i][j][k]=k;
50 for(i=0;i<nsubs;i++)
51 for(j=0;j<ncrss;j++)
52 for(k=0;k<ntab;k++)
53 printf("vel[%d][%d][%d]=%f\n",i,j,k,vel[i][j][k]);
54
55 cudaChannelFormatDesc velchannelDesc=cudaCreateChannelDesc<float>();
56 cudaExtent velExtent=make_cudaExtent(nsubs,ncrss,ntab);
57 cudaArray *d_vel;
58 cudaMalloc3DArray(&d_vel,&velchannelDesc,velExtent);
59
60 cudaMemcpy3DParms velParms = {0};
61 velParms.srcPtr=make_cudaPitchedPtr((void*)vel,sizeof(float)*nsubs,nsubs,ncrss);
62 velParms.dstArray=d_vel;
63 velParms.extent=velExtent;
64 velParms.kind=cudaMemcpyHostToDevice;
65 cudaMemcpy3D(&velParms);
66
67 cudaBindTextureToArray(vel_tex,d_vel);
68
69 printf("kernel start\n");
70 cudaDeviceSynchronize();
71 mckernel<<<BlockPerGrid,ThreadPerBlock>>>(ntab);
72 printf("kernel end\n");
73
74 cudaUnbindTexture(vel_tex);
75 cudaFreeArray(d_vel);
76 cudaDeviceReset();
77 return 0 ;
78 }
Here comes the printf data,nsubs=1
and ncrss=2
;
1 vel[0][0][0]=0.000000
2 vel[0][0][1]=1.000000
3 vel[0][0][2]=2.000000
4 vel[0][0][3]=3.000000
5 vel[0][0][4]=4.000000
6 vel[0][0][5]=5.000000
7 vel[0][0][6]=6.000000
8 vel[0][0][7]=7.000000
9 vel[0][0][8]=8.000000
10 vel[0][0][9]=9.000000
11 vel[0][1][0]=0.000000
12 vel[0][1][1]=1.000000
13 vel[0][1][2]=2.000000
14 vel[0][1][3]=3.000000
15 vel[0][1][4]=4.000000
16 vel[0][1][5]=5.000000
17 vel[0][1][6]=6.000000
18 vel[0][1][7]=7.000000
19 vel[0][1][8]=8.000000
20 vel[0][1][9]=9.000000
21 kernel start
22 kernel end
23 test=1.000000,bix=1,tid=0
24 test=3.000000,bix=1,tid=1
25 test=5.000000,bix=1,tid=2
26 test=7.000000,bix=1,tid=3
27 test=9.000000,bix=1,tid=4
28 test=1.000000,bix=1,tid=5
29 test=3.000000,bix=1,tid=6
30 test=5.000000,bix=1,tid=7
31 test=7.000000,bix=1,tid=8
32 test=9.000000,bix=1,tid=9
33 test=0.000000,bix=0,tid=0
34 test=2.000000,bix=0,tid=1
35 test=4.000000,bix=0,tid=2
36 test=6.000000,bix=0,tid=3
37 test=8.000000,bix=0,tid=4
38 test=0.000000,bix=0,tid=5
39 test=2.000000,bix=0,tid=6
40 test=4.000000,bix=0,tid=7
41 test=6.000000,bix=0,tid=8
42 test=8.000000,bix=0,tid=9