I am puzzled by the following program (code below). It works fine and gives correct results when the two lines in the kernel defining specsin
and speccos
are given by (note the second term, which is sin(t)
):
specsin+=sin(pi*t/my_tau)*sin(t)*sin(my_omega*(t+my_a0*my_a0/4.0/pi*(2.0*pi*t-my_tau*sin(2*pi*t/my_tau))));
speccos+=sin(pi*t/my_tau)*sin(t)*cos(my_omega*(t+my_a0*my_a0/4.0/pi*(2.0*pi*t-my_tau*sin(2*pi*t/my_tau))));
Once I change this second sin(t)
term to sin(t+0.0*my_a0*my_a0)
, which shouldn't change the result, I get all zeros instead of correct answer.
Can it be that I ran out of kernel memory?
#include <stdio.h>
__global__ void Calculate_Spectrum(float * d_Detector_Data, int numCols, int numRows,
const float omega_min, float dOmega,
const float a0_min, float da0,
const float tau_min, float dtau, float dt)
{
int Global_x = blockIdx.x * blockDim.x + threadIdx.x;
int Global_y = blockIdx.y * blockDim.y + threadIdx.y;
int Position1D = Global_y * numCols + Global_x;
float my_omega=omega_min + Global_x * dOmega;
float my_a0=a0_min + Global_y*da0;
float my_tau=tau_min;
int total_time_steps=int(my_tau/dt);
float specsin=0.0;
float speccos=0.0;
float t=0.0;
float pi=3.14159265359;
for(int n=0; n<total_time_steps; n++)
{
t=n*dt;
specsin+=sin(pi*t/my_tau)*sin(t+0.0*my_a0*my_a0)*sin(my_omega*(t+my_a0*my_a0/4.0/pi*(2.0*pi*t-my_tau*sin(2*pi*t/my_tau))));
speccos+=sin(pi*t/my_tau)*sin(t+0.0*my_a0*my_a0)*cos(my_omega*(t+my_a0*my_a0/4.0/pi*(2.0*pi*t-my_tau*sin(2*pi*t/my_tau))));
}
d_Detector_Data[Position1D]=(specsin*specsin+speccos*speccos)*dt*dt*my_a0*my_a0*my_omega*my_omega/4.0/pi/pi;
}
int main(int argc, char ** argv)
{
const int omega_bins = 1024;
const int a0_bins = 512;
const int tau_bins = 1;
const float omega_min = 0.5;
const float omega_max = 1.1;
const float a0_min = 0.05;
const float a0_max = 1.0;
const float tau_min = 1200;
const float tau_max = 600;
const int steps_per_period=20; // for integrating
float dt=1.0/steps_per_period;
int TotalSize = omega_bins * a0_bins * tau_bins;
float dOmega=(omega_max-omega_min)/(omega_bins-1);
float da0=(a0_max-a0_min)/(a0_bins-1);
float dtau=0.;
float * d_Detector_Data;
int * d_Global_x;
int * d_Global_y;
float h_Detector_Data[TotalSize];
// allocate GPU memory
cudaMalloc((void **) &d_Detector_Data, TotalSize*sizeof(float));
Calculate_Spectrum<<<dim3(1,a0_bins,1), dim3(omega_bins,1,1)>>>(d_Detector_Data, omega_bins, a0_bins, omega_min, dOmega, a0_min, da0, tau_min, dtau, dt);
cudaMemcpy(h_Detector_Data, d_Detector_Data, TotalSize*sizeof(float), cudaMemcpyDeviceToHost);
FILE * SaveFile;
char TempStr[255];
sprintf(TempStr, "result.dat");
SaveFile = fopen(TempStr, "w");
int counter=0;
for(int j=0; j<a0_bins;j++)
{
for(int i=0; i<omega_bins; i++)
{
fprintf(SaveFile,"%e\t", h_Detector_Data[counter]);
counter++;
}
fprintf(SaveFile, "\n");
}
fclose(SaveFile);
// free GPU memory
return 0;
}