I have tried to use a polynomial class in CUDA. The class definition is as follow:
template<int Degree>
class Polynomial{
public:
float coefficients[Degree+1];
};
template<int Degree>
class StartingPolynomial{
public:
Polynomial<Degree> p;
float start;
};
template<int Degree>
class PPolynomial{
public:
size_t polyCount;
StartingPolynomial<Degree>* polys;
};
The output of PPolynomial class is something like:
[-0.750000,-0.250000] 1.5000 x^0 +4.0000 x^1 +2.6667 x^2
[-0.250000,0.250000] 1.0000 x^0 +0.0000 x^1 -5.3333 x^2
[0.250000,0.750000] 1.5000 x^0 -4.0000 x^1 +2.6667 x^2
[0.750000,Infinity] 0.0000 x^0 +0.0000 x^1 +0.0000 x^2
I write a __device__ function try to change the data member in __global__ function. But I failed and got warned that is an illegal access.
ERROR: /home/davidxu/CLionProjects/Practice/main.cu:973,code:700,reason:an illegal memory access was encountered
I write a test program and I still can't figure out how to change the data member.
Program:
#include "cuda.h"
#include "cstdio"
#include "cuda_runtime.h"
template<int Degree>
class Polynomial{
public:
float coefficients[Degree+1];
};
template<int Degree>
class StartingPolynomial{
public:
Polynomial<Degree> p;
float start;
};
template<int Degree>
class PPolynomial{
public:
size_t polyCount;
StartingPolynomial<Degree>* polys;
};
template<int Degree>
__device__ void scale(PPolynomial<Degree> *pp,const float& scale){
for(int i=0;i<pp->polyCount;++i){
printf("change start\n");
printf("start is %f\n",pp->polys[i].start);
atomicExch(&pp->polys[i].start,scale*pp->polys[i].start);
printf("start ok\n");
float s2=1.0;
for(int j=0;j<=Degree;++j){
printf("change polys\n");
pp->polys[i].p.coefficients[j]*=s2;
printf("polys ok\n");
s2/=scale;
}
}
}
__global__ void test(PPolynomial<2> *pp){
scale(pp,0.5);
}
int main(){
PPolynomial<2> pp;
pp.polyCount=2;
pp.polys=(StartingPolynomial<2>*)malloc(sizeof(StartingPolynomial<2>)*pp.polyCount);
pp.polys[0].start=-1;
/* pp.polys.p.coefficients[0]=1; */
pp.polys[0].start=1;
/* pp.polys.p.coefficients[0]=2; */
PPolynomial<2> *pd=NULL;
cudaMalloc((PPolynomial<2>**)&pd,sizeof(pp));
cudaMemcpy(pd,&pp,sizeof(pp),cudaMemcpyHostToDevice);
test<<<1,1>>>(pd);
cudaDeviceSynchronize();
}
output:
change start
I try to pass the pram to __global__ function by input the address of device variable, but the kernel function seems to be dead at last. That makes me really confused.
How can I solve my problem?
Edit 1:
I find out that the pointer pd->polys points to host address(Although pd is a pointer to device address).Is there an elegant way to copy the whole PPolynomial object from host to device?