While profiling the CUDA kernel below with ncu, I noticed that the amounts of memory read and memory written are unbalanced.
/**
 * In-place copy of the upper half of every 32-int chunk onto its lower half.
 *
 * The buffer is treated as consecutive chunks of 32 ints. For the chunk that
 * starts at `offset`, the owning thread performs
 *     ptr[offset + i] = ptr[offset + i + 16]   for i in [0, 16).
 * Each thread owns one chunk per outer-loop iteration and then advances by
 * blockDim.x * gridDim.x chunks, so any 1D launch configuration covers the
 * whole buffer. Neighbouring threads start 32 ints apart, i.e. the threads of
 * a warp do not access consecutive addresses.
 *
 * Only the x dimension of the launch is used for indexing.
 *
 * @param mem  Pointer dereferenced on the device; reinterpreted as int* and
 *             expected to hold at least `n` ints.
 * @param n    Number of ints in `mem`. If `n` is not a multiple of 32, the
 *             trailing partial chunk copies only the pairs whose source index
 *             is < n; nothing outside [0, n) is read or written.
 */
__global__ void kernel(void* mem, int n) {
    const int kChunk = 32;  // ints owned by one thread per iteration
    const int kHalf = 16;   // ints copied per chunk (upper half -> lower half)

    int* ptr = reinterpret_cast<int*>(mem);

    // 64-bit indexing: (global thread id) * 32 and the stride can exceed the
    // range of a 32-bit integer for large grids or n close to INT_MAX.
    const long long total = n;
    const long long stride =
        static_cast<long long>(blockDim.x) * gridDim.x * kChunk;
    const long long first =
        (static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x) * kChunk;

    for (long long offset = first; offset < total; offset += stride) {
        if (offset + kChunk <= total) {
            // Whole chunk is in bounds: unguarded, fully unrolled copy.
            #pragma unroll
            for (int i = 0; i < kHalf; i++) {
                ptr[offset + i] = ptr[offset + i + kHalf];
            }
        } else {
            // Trailing partial chunk: copy only pairs whose source exists.
            for (int i = 0; i < kHalf && offset + i + kHalf < total; i++) {
                ptr[offset + i] = ptr[offset + i + kHalf];
            }
        }
    }
}
/**
 * Profiling driver: allocates a zero-filled buffer of N = 1024 * 256 * 256
 * ints, launches `kernel` once over it with 8192 blocks of 256 threads, and
 * frees the buffer.
 *
 * With this configuration 8192 * 256 threads * 32 ints per thread equals N,
 * so every thread executes exactly one iteration of the kernel's outer loop.
 *
 * @return 0 on success, 1 if any CUDA runtime call or the kernel fails.
 */
int main() {
    const int N = 1024 * 256 * 256;
    const size_t bytes = sizeof(int) * N;

    int* mem = nullptr;
    if (cudaMalloc((void**)&mem, bytes) != cudaSuccess) {
        return 1;
    }

    bool ok = cudaMemset(mem, 0, bytes) == cudaSuccess;
    if (ok) {
        kernel<<<8192, 256>>>(mem, N);
        // A launch does not return a status: cudaGetLastError() reports bad
        // launch configurations, and the synchronize surfaces faults raised
        // while the kernel was executing.
        ok = cudaGetLastError() == cudaSuccess &&
             cudaDeviceSynchronize() == cudaSuccess;
    }

    // Always release the buffer, even after a failure above.
    ok = (cudaFree(mem) == cudaSuccess) && ok;
    return ok ? 0 : 1;
}
ncu reports 305 MB of memory read but 1.07 GB of memory written. I understand that global memory accesses are coalesced, but shouldn't the memory read and the memory written both be approximately 1 GB, rather than only 305 MB being read? And even if the reads were not coalesced, shouldn't the amount of memory read be around 128 MB?
Thanks.