cudaMemset2D
doesn't create anything. Like memset
it sets an already created memory object to particular byte values.
The API that creates a pitched allocation is cudaMallocPitch
. (You can also use cudaMalloc3D
.)
If you have an allocation created with cudaMallocPitch
, then the correct API to use for copying it is cudaMemcpy2D
(assuming cudaArray
is not involved, which seems to be the case here). (For cudaMalloc3D
you could use cudaMemcpy3D
.)
cudaMemcpy2D
can copy from a pitched or unpitched allocation to a pitched or unpitched allocation. To copy to an unpitched (flat/linear) allocation, you simply set the destination pitch equal to the width of each copied row; both values are in bytes.
There are numerous questions here on the cuda
tag that demonstrate various usages of cudaMallocPitch
/cudaMemcpy2D
. Here is an example that matches your description:
int main(){
  const size_t width = 32;     // elements per row
  const size_t height = 1024;  // number of rows
  // Bytes of payload in one row; also the pitch of the flat destination.
  const size_t rowBytes = width * sizeof(int);

  int *a;        // pitched source allocation
  int *b;        // flat (unpitched) destination allocation
  size_t pitch;  // row stride of `a` in bytes, filled in by cudaMallocPitch

  cudaMallocPitch(&a, &pitch, rowBytes, height);
  cudaMalloc(&b, rowBytes * height);

  // Copy `height` rows of `rowBytes` each: the source advances by `pitch`
  // per row, the destination by exactly `rowBytes` (no padding).
  cudaMemcpy2D(b, rowBytes,
               a, pitch,
               rowBytes, height,
               cudaMemcpyDeviceToDevice);
}
If you were starting with a pre-existing cudaPitchedPtr
, it could be like this:
int main(){
  int *b;             // flat (unpitched) destination allocation
  cudaPitchedPtr a;   // pre-existing pitched source
  // ... (a is assumed to be populated here) ...
  const size_t pitch = a.pitch;    // source row stride in bytes
  // NOTE(review): this treats a.xsize as the row width in ELEMENTS; if your
  // cudaPitchedPtr stores the width in bytes, drop the sizeof factors below.
  const size_t width = a.xsize;
  const size_t height = a.ysize;
  cudaMalloc(&b, width*height*sizeof(b[0]));
  // a.ptr is a void*, so the element size is taken from b rather than from
  // a.ptr[0] (sizeof on a void expression is ill-formed).
  cudaMemcpy2D(b, width*sizeof(b[0]), a.ptr, pitch, width*sizeof(b[0]), height, cudaMemcpyDeviceToDevice);
}
Here is a verification:
$ cat t1861.cu
#include <stdio.h>
const size_t width = 32;
const size_t height = 1024;
// Fills a pitched 2D int allocation so that element (row i, column j) holds
// i*width + j, using the file-level `width` and `height` as the extents.
//   a     - base pointer of the allocation
//   pitch - distance in bytes between the starts of consecutive rows
// The kernel is a plain serial double loop with no thread indexing: every
// thread that executes it walks the entire array.
__global__ void k1(int *a, size_t pitch){
  for (size_t i = 0; i < height; i++){
    // pitch is a byte count, so the row start is computed on a char* and
    // only then viewed as int*.
    int *row = (int *)((char *)a + i * pitch);
    for (size_t j = 0; j < width; j++)
      row[j] = (int)(i * width + j);
  }
}
// Debug kernel: prints the first 5 rows of the flat, row-major int buffer `b`,
// `width` values per row separated by spaces, one row per output line.
__global__ void k2(int *b){
  const int rowsToPrint = 5;
  for (int r = 0; r < rowsToPrint; ++r){
    const int *rowStart = b + r * width;  // rows are contiguous: no pitch here
    for (size_t c = 0; c != width; ++c)
      printf("%d ", rowStart[c]);
    printf("\n");
  }
}
// Reports a failed CUDA runtime call on stderr; returns true when err is cudaSuccess.
static bool cudaOk(cudaError_t err, const char *what){
  if (err == cudaSuccess) return true;
  fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
  return false;
}

// Verification driver: fills a pitched allocation with k1, copies it into a
// flat allocation with cudaMemcpy2D, prints the first rows with k2, and then
// prints the xsize of a cudaPitchedPtr describing the pitched allocation.
// Returns 0 on success, 1 if any CUDA call or kernel launch failed.
int main(){
  int *a = NULL, *b = NULL;   // a: pitched source, b: flat destination
  size_t pitch = 0;           // row stride of a in bytes

  bool ok =
      cudaOk(cudaMallocPitch(&a, &pitch, width*sizeof(a[0]), height), "cudaMallocPitch") &&
      cudaOk(cudaMalloc(&b, width*height*sizeof(b[0])), "cudaMalloc");

  if (ok){
    k1<<<1,1>>>(a, pitch);
    // A launch does not return a status; launch errors surface via cudaGetLastError().
    ok = cudaOk(cudaGetLastError(), "k1 launch") &&
         // Destination pitch equals the row width in bytes because b has no padding.
         cudaOk(cudaMemcpy2D(b, width*sizeof(b[0]), a, pitch, width*sizeof(a[0]), height,
                             cudaMemcpyDeviceToDevice), "cudaMemcpy2D");
  }

  if (ok){
    k2<<<1,1>>>(b);
    ok = cudaOk(cudaGetLastError(), "k2 launch") &&
         // Also flushes device-side printf output and reports in-kernel faults.
         cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
  }

  if (ok){
    // Describe a with its real row stride (the value returned by cudaMallocPitch).
    cudaPitchedPtr c = make_cudaPitchedPtr(a, pitch, width, height);
    printf("%zu\n", c.xsize);   // xsize is a size_t
  }

  // cudaFree accepts a null pointer, so this is safe on every path.
  cudaFree(b);
  cudaFree(a);
  return ok ? 0 : 1;
}
$ nvcc -o t1861 t1861.cu
$ ./t1861
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127
128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159
32
$