I wrote a test to illustrate my problem, the code try to copy 16 bytes to an none-4-bytes-aligned memory, but the dest is modified automatically
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>
__global__
void Copy128(char *dest,const char *src)
{
((int*)dest)[0]=((int*)src)[0];
((int*)dest)[1]=((int*)src)[1];
((int*)dest)[2]=((int*)src)[2];
((int*)dest)[3]=((int*)src)[3];
}
__global__
void fill_src(char *src)
{
for(int i=0; i<16; i++)
src[i] = i+1; // starts from 1
}
int main()
{
char* dest;
cudaMalloc(&dest, 17);
char* src;
cudaMalloc(&src, 16);
fill_src<<<1, 1>>>((char*)src); // fill some value for debugging
// copy to dest+1 which is not aligned to 4
Copy128<<<1, 1>>>(dest + 1, src);
getchar();
}
Debugging the code in VS2013 as in the image, the target memory is 0x40A80001, but actually it copies to 0x40A80000.
The problem is the dest would be modified automatically if it's not aligned to 4-byte. And it's modified silently, I spent hours to find this bug.
I know it's best to use well aligned memory, but I'm writing some rar decompression program, decompress some bytes then concat some bytes, it cannot be always aligned.
I guess I would use uint64 in function like Copy256. Is this normal behavior that memory is force aligned? Any compiling flags that can switch off this functionality? Or should I copy bytes one by one?
Environment: CUDA 6.5, Win7-32bit, VS2013