1

I have a problem reading from GPU device memory. When I copy values to __device__ memory, everything is OK. But when I try to get the result back, the answer is sometimes OK and sometimes exactly the initial values of the array!

I have a device array like this:

__device__ array[50];

At the start, I copy some values into it:

cudaStatus = cudaMemcpyToSymbol(dev_state, &CipherState, statesize, 0, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        printf(" \n%s\n", cudaGetErrorString(cudaStatus));
        getchar();
    }

After making some changes in the kernel, I try to read the values back from the array:

Kernel << <8, 16 >> >();

unsigned char CipherState2[50];

cudaStatus = cudaMemcpyFromSymbol(&CipherState2, dev_state, 50*sizeof(unsigned char),0, cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) 
    {
        printf(" \n%s\n", cudaGetErrorString(cudaStatus));
        getchar();
    } 

The results are sometimes correct and sometimes just the initial values of the array.

Here is more of my code:

//before Kernel Function body

__device__ unsigned char dev_state[128];

//////////////////////////////////////

/**
 * Round-trips a 128-byte buffer through the __device__ symbol dev_state.
 *
 * Steps:
 *   1. Fill a host buffer with 0x01 and copy it into dev_state.
 *   2. Print the first 16 bytes of the host buffer.
 *   3. Launch Kernel with 8 blocks of 16 threads (128 threads in total).
 *      Kernel is defined outside this function; according to the original
 *      comment it is expected to set every byte of dev_state to 0x05.
 *   4. Copy dev_state back to the host and print the first 16 bytes.
 *
 * Every CUDA call is checked, including the kernel launch itself. On any
 * failure the error string is printed, the function waits for a key press
 * and then returns, so stale or uninitialised data is never printed as if
 * it were a result.
 *
 * Assumes dev_state holds at least 128 bytes.
 */
void test()
{
    unsigned char CipherState[128];

    for (int i = 0; i < 128; i++)
        CipherState[i] = 0x01;

    cudaError_t cudaStatus;

    cudaStatus = cudaMemcpyToSymbol(dev_state, CipherState, sizeof(CipherState), 0, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        printf(" \n%s\n", cudaGetErrorString(cudaStatus));
        getchar();
        return;
    }

    // NOTE: the outer loop executes once (i == 0 only), so just the first
    // 16 of the 128 bytes are printed.
    printf("\n initialized:\n 0x");
    for (size_t i = 0; i < 16; i += 16)
    {
        if (i % 16 == 0)
            printf("\n0x");
        for (int j = 0; j <= 15; j++)
        {
            printf("%x", CipherState[i + j]);
        }
    }

    // Expected to set all of dev_state to 0x05.
    Kernel<<<8, 16>>>();

    // A kernel launch returns no status; launch-configuration errors must be
    // fetched explicitly.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        printf(" \n%s\n", cudaGetErrorString(cudaStatus));
        getchar();
        return;
    }

    // Wait for the kernel to finish so that execution errors are reported
    // here instead of being attributed to the copy below.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        printf(" \n%s\n", cudaGetErrorString(cudaStatus));
        getchar();
        return;
    }

    unsigned char CipherState2[128];
    cudaStatus = cudaMemcpyFromSymbol(CipherState2, dev_state, sizeof(CipherState2), 0, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        printf(" \n%s\n", cudaGetErrorString(cudaStatus));
        getchar();
        return;  // CipherState2 was not filled; do not print it
    }

    // As above, only the first 16 bytes are printed.
    printf("\n State at the end:\n ");
    for (size_t i = 0; i < 16; i += 16)
    {
        if (i % 16 == 0)
            printf("\n0x");
        for (int j = 0; j <= 15; j++)
            printf("%x", CipherState2[i + j]);
    }
}

Sometimes, printing CipherState2 gives this:

0x55555555555555555......5555555555

and sometimes:

0x11111111111111111.....11111111111;

Robert Crovella
  • 143,785
  • 11
  • 213
  • 257
Rezaeimh7
  • 1,467
  • 2
  • 23
  • 40
  • 1. Don't post additional information as an answer. Instead, you can edit your own question to provide more information. Click the edit link just below the `cuda` tag above. 2. You still haven't provided a complete code. You should provide a complete code - it is missing the `Kernel` for example. 3. The code you have posted doesn't have any problems, so the remaining problems may be in your kernel itself, or in the machine setup. 4. You should use [proper cuda error checking](http://stackoverflow.com/questions/14038589) and also run your code with `cuda-memcheck`. – Robert Crovella Nov 02 '15 at 18:33

1 Answer

2

This is incorrect:

unsigned char CipherState2[50];

cudaStatus = cudaMemcpyFromSymbol(&CipherState2, dev_state, 50*sizeof(unsigned char),0, cudaMemcpyDeviceToHost);
                                  ^

CipherState2 is an array, which already decays to a pointer to its first element when passed as an argument. You should not take its address. Instead, you should make the call like this:

cudaStatus = cudaMemcpyFromSymbol(CipherState2, dev_state, 50*sizeof(unsigned char),0, cudaMemcpyDeviceToHost);

And although you haven't shown what the CipherState variable looks like, it's quite possible you made a similar error here:

cudaStatus = cudaMemcpyToSymbol(dev_state, &CipherState, statesize, 0, cudaMemcpyHostToDevice);
                                           ^

It's quite possible the correct form of that call would be:

cudaStatus = cudaMemcpyToSymbol(dev_state, CipherState, statesize, 0, cudaMemcpyHostToDevice);

In the future, please provide an MCVE for questions like this.

As an example, note that this is not valid code:

__device__ array[50];

Perhaps you meant something like this:

__device__ unsigned char dev_state[50];

EDIT: The code you have now posted (in an answer) is still incomplete, but it appears to be mostly correct. The remaining problem may be in your kernel, which you haven't shown, or it's possible that your CUDA install is not working correctly. Here's a complete worked example built around what you have shown (I added a simple kernel) that demonstrates the expected behavior (note that I don't think your for-loops for the printout are constructed correctly; as written, they print only the first 16 bytes):

$ cat t966.cu
#include <stdio.h>
//before Kernel Function body

__device__ unsigned char dev_state[128];

//////////////////////////////////////

// Each thread stores 0x5 into the dev_state byte selected by its global
// thread index. Threads whose index is 128 or larger do nothing, so the
// write stays inside the 128-element bound used here regardless of the
// launch configuration. Assumes a 1D grid of 1D blocks.
__global__ void Kernel()
{
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid >= 128)
        return;
    dev_state[gid] = 0x5;
}

/**
 * Round-trips a 128-byte buffer through the __device__ symbol dev_state.
 *
 * Steps:
 *   1. Fill a host buffer with 0x01 and copy it into dev_state.
 *   2. Print the first 16 bytes of the host buffer.
 *   3. Launch Kernel with 8 blocks of 16 threads (128 threads in total),
 *      which is expected to overwrite dev_state with 0x05.
 *   4. Copy dev_state back to the host and print the first 16 bytes.
 *
 * Every CUDA call is checked, including the kernel launch itself. On any
 * failure the error string is printed, the function waits for a key press
 * and then returns, so stale or uninitialised data is never printed as if
 * it were a result. Output on the success path is unchanged.
 */
void test()
{
    unsigned char CipherState[128];

    for (int i = 0; i < 128; i++)
        CipherState[i] = 0x01;

    cudaError_t cudaStatus;

    cudaStatus = cudaMemcpyToSymbol(dev_state, CipherState, sizeof(CipherState), 0, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        printf(" \n%s\n", cudaGetErrorString(cudaStatus));
        getchar();
        return;
    }

    // NOTE: the outer loop executes once (i == 0 only), so just the first
    // 16 of the 128 bytes are printed.
    printf("\n initialized:\n 0x");
    for (size_t i = 0; i < 16; i += 16)
    {
        if (i % 16 == 0)
            printf("\n0x");
        for (int j = 0; j <= 15; j++)
        {
            printf("%x", CipherState[i + j]);
        }
    }

    // Set all of dev_state to 0x05.
    Kernel<<<8, 16>>>();

    // A kernel launch returns no status; launch-configuration errors must be
    // fetched explicitly.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        printf(" \n%s\n", cudaGetErrorString(cudaStatus));
        getchar();
        return;
    }

    // Wait for the kernel to finish so that execution errors are reported
    // here instead of being attributed to the copy below.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        printf(" \n%s\n", cudaGetErrorString(cudaStatus));
        getchar();
        return;
    }

    unsigned char CipherState2[128];
    cudaStatus = cudaMemcpyFromSymbol(CipherState2, dev_state, sizeof(CipherState2), 0, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        printf(" \n%s\n", cudaGetErrorString(cudaStatus));
        getchar();
        return;  // CipherState2 was not filled; do not print it
    }

    // As above, only the first 16 bytes are printed.
    printf("\n State at the end:\n ");
    for (size_t i = 0; i < 16; i += 16)
    {
        if (i % 16 == 0)
            printf("\n0x");
        for (int j = 0; j <= 15; j++)
            printf("%x", CipherState2[i + j]);
    }
    printf("\n");
}
// Program entry point: runs the device-symbol round-trip test once and
// exits with status 0.
int main()
{
    test();
    return 0;
}
$ nvcc t966.cu -o t966
$ cuda-memcheck ./t966
========= CUDA-MEMCHECK

 initialized:
 0x
0x1111111111111111
 State at the end:

0x5555555555555555
========= ERROR SUMMARY: 0 errors
$
Community
  • 1
  • 1
Robert Crovella
  • 143,785
  • 11
  • 213
  • 257