0

I have a structure containing some variables and some pointer variables. I want to copy this structure fro host to device in 2 different functions. In first function I have to copy entire structure except one pointer variable and then in second function I have to copy that remaining pointer.

I am able to copy entire structure but unable to copy the remaining pointer variable in the second function.

#include<iostream>

#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

struct MultiSGDKernelParam {
  int count;
  size_t sizes;
  float *weights;
  float *mom; 
  float lrs;
};


__global__ void Launch(MultiSGDKernelParam *param, int N, MultiSGDKernelParam *result)
{
  for(int i=0; i<N; i++)
  {
     result[i] =param[i];     
  }
}

MultiSGDKernelParam *fillStructure(float *temp, const int N)
{       
    MultiSGDKernelParam *param;
        param = (MultiSGDKernelParam*) malloc( N * sizeof(MultiSGDKernelParam));
        for( int i=0; i< N ; i++)
        {
            param[i].count = i;
            param[i].sizes =  i*2;
            param[i].lrs =  param[i].sizes - i;
            param[i].weights = &temp[i];
        }

    std::cout<<"Inside the function"<<"\n"; 
        for(int i=0; i< N; i++)
        {
                std::cout<<param[i].sizes<<" ,"<<param[i].lrs<<"\t";
        }

    std::cout<<std::endl;   
        for(int i =0 ; i<N;i++)
        {
          std::cout<<*(param[i].weights)<<"\t";

        }
        std::cout<<std::endl;
    MultiSGDKernelParam *d_param;
        cudaMalloc((void**)&d_param, N  * sizeof(MultiSGDKernelParam));
        cudaMemcpy(d_param,param,N  * sizeof(MultiSGDKernelParam),cudaMemcpyHostToDevice);

    return d_param;

}

MultiSGDKernelParam * fillFullStructure(float *tweight, float *tmom,  const int N )
{
  MultiSGDKernelParam *param = fillStructure( tweight, N );

 /* float *d_mom;

   cudaMalloc((void**)&d_mom,N*sizeof(float));
   cudaCheckErrors("cudaMalloc1 fail");
   cudaMemcpy(d_mom,tmom,N*sizeof(float), cudaMemcpyHostToDevice);
   cudaCheckErrors("cudaMemcpy1 fail");*/
   for( int i=0; i< N ; i++)
        {
          cudaMemcpy(&(param[i].mom),&(tmom[i]),sizeof(float), cudaMemcpyHostToDevice);
      cudaCheckErrors("cudaMempcpy2 fail");
        }

    std::cout<<"Momentum Values copied"<<"\n";
   /*cudaMemcpy(&(param->mom),tmom,N*sizeof(float), cudaMemcpyHostToDevice);
   cudaCheckErrors("cudaMempcpy1fail");*/
   return param;
}



int main()
{
    static const  int N =5;
    float tempweight [N], tempmom[N] ;
    for(int i=0; i< N; i++)
    {
            tempweight[i] = i*3 +1;
        tempmom[i] = i+3;
    }

    MultiSGDKernelParam *result;
    MultiSGDKernelParam *param = fillFullStructure( tempweight,tempmom, N ); 
     const unsigned blocks = 1;
         const unsigned threadsPerBlock = 4;
    cudaMalloc(&result, N  * sizeof(MultiSGDKernelParam));
    Launch<<<blocks,threadsPerBlock>>>(param, N, result);
        cudaDeviceSynchronize();
    MultiSGDKernelParam *paramresult;
    paramresult = (MultiSGDKernelParam*) malloc( N * sizeof(MultiSGDKernelParam));
    cudaMemcpy(paramresult,result, N * sizeof(MultiSGDKernelParam),cudaMemcpyDeviceToHost);
    std::cout<<"Inside Main"<<"\n";
    for(int i=0; i< N; i++)
        {
           std::cout<<paramresult[i].sizes<<" ,"<<paramresult[i].lrs<<"\t";
        }
    std::cout<<std::endl;
    for(int i =0 ; i<N;i++)
    { 
          std::cout<<*(paramresult[i].weights)<<"\t";
          std::cout<<*(paramresult[i].mom)<<"\t";
    }
         std::cout<<std::endl;

    return 0;
}

The output gives as

Inside the function    
0 ,0    2 ,1    4 ,2    6 ,3    8 ,4    
1   4   7   10  13  
Momentum Values copied
Inside Main
0 ,0    2 ,1    4 ,2    6 ,3    8 ,4    
Segmentation fault (core dumped)

I the code got compiled but giving the segmentation error while printing values.Is the copying is success If not what was the problem.

  • You have tagged this with CUDA, but I see no CUDA code or APIs anywhere in your code – talonmies Oct 13 '19 at 10:13
  • Re-posted the code with cuda . – Poornachandra Oct 13 '19 at 11:26
  • 2
    The runtime error comes from trying to access `param->mom` which is a device pointer and invalid in host code. Beyond that, it is hard to tell you what to fix because the whole idea behind your copy code is broken – talonmies Oct 13 '19 at 18:20
  • you may want to study [this](https://stackoverflow.com/questions/15431365/cudamemcpy-segmentation-fault/15435592#15435592) – Robert Crovella Oct 13 '19 at 18:49
  • As per the suggestions and provided link , I updated the code and posted the result .But still no luck. I am unable to identify the solution. Please help me to resolve the issue. Please provide the solution with full code for best understanding. Thanks in advance. – Poornachandra Oct 14 '19 at 13:55

1 Answers1

1
  • I don't recommend writing CUDA kernels like this:

    __global__ void Launch(MultiSGDKernelParam *param, int N, MultiSGDKernelParam *result)
    {
      for(int i=0; i<N; i++)
      {
         result[i] =param[i];     
      }
    }
    

    Even if it is only for demonstration, you should do one of 2 things: either write the kernel like that (with no specialization for CUDA threads) and only launch 1 block of 1 thread (then it is obvious this is just for demonstration) or else use proper CUDA thread indexing (e.g. int i = threadIdx.x+blockDim.x*blockIdx.x;) and get rid of the for-loop, and launch your block with multiple threads. As it stands you have done neither of those. You have an ordinary for-loop with no specialization, running in multiple threads. Sure, this isn't the focus of your question, perhaps, but this behavior you have now means that threads will be stepping on each other as they attempt to write to result[i]. Even if all the rest of your code is correct, that could obscure understanding whether or not things are functioning correctly. We will fix this by switching your launch configuration to <<<1,1>>>

  • This:

    param[i].weights = &temp[i];
    

    cannot be correct. You are setting a pointer inside the structure to point to something that is in host memory. (The temp item here is pointing to your tempweight host array.) Such a pointer cannot be usable in any way in device code. This is a fundamental CUDA principle. When you copy that structure to the device, the numerical value of that pointer won't be changed in any way, meaning it is still pointing to host memory. If you intend to use this pointer at any point in device code, you are going to have to learn how to work through a CUDA deep copy operation. And this answer walks through that step-by-step. As it happens, you are not actually attempting to dereference that pointer in device code - you are merely copying structures from one place to another. So we don't need to delve into that further, to get the device code you have shown working correctly.

  • The proximal reason for the seg fault is that you have not initialized the mom structure member anywhere in your code, but you are attempting to dereference it here:

    std::cout<<*(paramresult[i].mom)<<"\t";
    

    In C or C++ if you attempt to dereference a pointer that you have not initialized, bad things will probably happen. We can fix this by commenting out that line of code. We could also "fix" it by copying just the numerical pointer value from the weights structure member to the mom structure member, in device code. However we can't use those pointers directly in device code, because they are host pointers as indicated above.

The following code has the first and third items above addressed. It appears to run correctly for me.

$ cat t1529.cu
#include<iostream>
#include <stdio.h>

#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

struct MultiSGDKernelParam {
  int count;
  size_t sizes;
  float *weights;
  float *mom;
  float lrs;
};


__global__ void Launch(MultiSGDKernelParam *param, int N, MultiSGDKernelParam *result)
{
  for(int i=0; i<N; i++)
  {
     result[i] =param[i];
  }
}

MultiSGDKernelParam *fillStructure(float *temp, const int N)
{
    MultiSGDKernelParam *param;
        param = (MultiSGDKernelParam*) malloc( N * sizeof(MultiSGDKernelParam));
        for( int i=0; i< N ; i++)
        {
            param[i].count = i;
            param[i].sizes =  i*2;
            param[i].lrs =  param[i].sizes - i;
            param[i].weights = &temp[i];
        }

    std::cout<<"Inside the function"<<"\n";
        for(int i=0; i< N; i++)
        {
                std::cout<<param[i].sizes<<" ,"<<param[i].lrs<<"\t";
        }

    std::cout<<std::endl;
        for(int i =0 ; i<N;i++)
        {
          std::cout<<*(param[i].weights)<<"\t";

        }
        std::cout<<std::endl;
    MultiSGDKernelParam *d_param;
        cudaMalloc((void**)&d_param, N  * sizeof(MultiSGDKernelParam));
        cudaMemcpy(d_param,param,N  * sizeof(MultiSGDKernelParam),cudaMemcpyHostToDevice);

    return d_param;

}

MultiSGDKernelParam * fillFullStructure(float *tweight, float *tmom,  const int N )
{
  MultiSGDKernelParam *param = fillStructure( tweight, N );

 /* float *d_mom;

   cudaMalloc((void**)&d_mom,N*sizeof(float));
   cudaCheckErrors("cudaMalloc1 fail");
   cudaMemcpy(d_mom,tmom,N*sizeof(float), cudaMemcpyHostToDevice);
   cudaCheckErrors("cudaMemcpy1 fail");*/
   for( int i=0; i< N ; i++)
        {
          cudaMemcpy(&(param[i].mom),&(tmom[i]),sizeof(float), cudaMemcpyHostToDevice);
      cudaCheckErrors("cudaMempcpy2 fail");
        }

    std::cout<<"Momentum Values copied"<<"\n";
   /*cudaMemcpy(&(param->mom),tmom,N*sizeof(float), cudaMemcpyHostToDevice);
   cudaCheckErrors("cudaMempcpy1fail");*/
   return param;
}



int main()
{
    static const  int N =5;
    float tempweight [N], tempmom[N] ;
    for(int i=0; i< N; i++)
    {
            tempweight[i] = i*3 +1;
        tempmom[i] = i+3;
    }

    MultiSGDKernelParam *result;
    MultiSGDKernelParam *param = fillFullStructure( tempweight,tempmom, N );
    const unsigned blocks = 1;
    const unsigned threadsPerBlock = 1;
    cudaMalloc(&result, N  * sizeof(MultiSGDKernelParam));
    Launch<<<blocks,threadsPerBlock>>>(param, N, result);
    cudaDeviceSynchronize();
    MultiSGDKernelParam *paramresult;
    paramresult = (MultiSGDKernelParam*) malloc( N * sizeof(MultiSGDKernelParam));
    cudaMemcpy(paramresult,result, N * sizeof(MultiSGDKernelParam),cudaMemcpyDeviceToHost);
    std::cout<<"Inside Main"<<"\n";
    for(int i=0; i< N; i++)
        {
           std::cout<<paramresult[i].sizes<<" ,"<<paramresult[i].lrs<<"\t";
        }
    std::cout<<std::endl;
    for(int i =0 ; i<N;i++)
    {
          std::cout<<*(paramresult[i].weights)<<"\t";
        //  std::cout<<*(paramresult[i].mom)<<"\t";
    }
         std::cout<<std::endl;

    return 0;
}

$ nvcc -o t1529 t1529.cu
$ cuda-memcheck ./t1529
========= CUDA-MEMCHECK
Inside the function
0 ,0    2 ,1    4 ,2    6 ,3    8 ,4
1       4       7       10      13
Momentum Values copied
Inside Main
0 ,0    2 ,1    4 ,2    6 ,3    8 ,4
1       4       7       10      13
========= ERROR SUMMARY: 0 errors
$

If you want to actually use the weights and mom structure members (pointers) in device code, you will need to start trying to understand a deep copy operation in CUDA. I've already given you the link that spells out the process, step by step, with a worked example. Right now you've shown no indication in your code that you have implemented any of that, and writing the code for you is beyond the scope of what I intend to answer here, since you've made no attempt at it.

Robert Crovella
  • 143,785
  • 11
  • 213
  • 257
  • Thanks for your valuable answer. But you mentioned that I was not initialized the mom values. this was happened in side fillFullStructure function with the following code cudaMemcpy(&(param[i].mom),&(tmom[i]),sizeof(float), cudaMemcpyHostToDevice); But the copying is not happening properly and this may lead to segfault. I will refer the line you shared and try to correct the error. Thanks once again. – Poornachandra Oct 15 '19 at 05:58
  • very well, then that line of code is broken also, and needs to be changed to something like `cudaMemcpy(&(param[i].mom),&tmom,sizeof(float *), cudaMemcpyHostToDevice);` – Robert Crovella Oct 15 '19 at 13:49