I encountered a problem with sin and cos in CUDA compute capability 2.0. It doesn't appear when the code is compiled for CUDA compute capability 1.x. I made a simple code. I test it in GeForce GTX 550 Ti and GeForce GTX 480, both had to the same result. This is the code:
#include <cufft.h>
#include <stdio.h>
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#pragma once
#ifdef __INTELLISENSE__
void __syncthreads();
void atomicAdd(int*, int);
#endif
__global__ void cuftFrequency(float* in_data, float* out_data, int N, int M, int fromM = 1)
{
cuComplex s;
float t = 0;
for (int I = threadIdx.x + blockIdx.x * blockDim.x + fromM; I <= M; I += blockDim.x * gridDim.x)
{
s.x = 0;
s.y = 0;
for (int J = 0; J < N; J++)
{
t = (6.0 * (J - N / 2)) / I;
s.x += in_data[J] * cos(t);
s.y += in_data[J] * sin(t);
}
/************************* if no problem, array return values 500, else - same refuse
out_data[I - fromM] = 500;//s.x * s.x + s.y * s.y;
}
}
extern "C" __declspec(dllexport) void cuftColorQualifierExec(float* data, float *spm, int N, int M, int fromM)
{
float* in_data_dev;
float *furie_dev;
cudaDeviceProp prop;
int N_Dev;
memset(&prop, 0, sizeof(cudaDeviceProp));
prop.major = 2;
prop.minor = 0;
prop.maxThreadsPerBlock = M - fromM;
cudaChooseDevice(&N_Dev, &prop);
cudaSetDevice(N_Dev);
cudaGetDeviceProperties(&prop, N_Dev);
int N_thread = 576;
int N_block = 2;
int *Count_dev;
cudaError_t err = cudaMalloc((void**)&in_data_dev, sizeof(float) * N);
if (err != cudaSuccess)
fprintf(stderr, "ERROR \"%s:%d\n", cudaGetErrorString(err), __FILE__, __LINE__);
err = cudaMemcpy(in_data_dev, data, sizeof(float) * N, cudaMemcpyHostToDevice);
if (err != cudaSuccess)
fprintf(stderr, "ERROR \"%s:%d\n", cudaGetErrorString(err), __FILE__, __LINE__);
err = cudaMalloc((void**)&furie_dev, sizeof(float) * (M - fromM + 1));
if (err != cudaSuccess)
fprintf(stderr, "ERROR \"%s:%d\n", cudaGetErrorString(err), __FILE__, __LINE__);
cuftFrequency<<<N_block, N_thread>>>(in_data_dev, furie_dev, N, M, fromM);
err = cudaDeviceSynchronize();
if (err != cudaSuccess)
fprintf(stderr, "ERROR \"%s:%d\n", cudaGetErrorString(err), __FILE__, __LINE__);
err = cudaMemcpy(spm, furie_dev, sizeof(float) * (M - fromM + 1), cudaMemcpyDeviceToHost);
if (err != cudaSuccess)
fprintf(stderr, "ERROR \"%s:%d\n", cudaGetErrorString(err), __FILE__, __LINE__);
err = cudaFree(furie_dev);
if (err != cudaSuccess)
fprintf(stderr, "ERROR \"%s:%d\n", cudaGetErrorString(err), __FILE__, __LINE__);
}
int main()
{
int M = 1024, fromM = 1, N = 4000;
float* data = new float[4000];
float* spm = new float[M - fromM + 1];
for (int I = 0; I < N; I++)
data[I] = cos(6.0 * I);
for (int I = 0; I < M - fromM + 1; I++)
spm[I] = 0;
cuftColorQualifierExec(data, spm, N, M, fromM);
for (int I = 0; I < M - fromM + 1; I++)
fprintf(stdout, "%d: %f\n", I, spm[I]);
return 0;
}
When the number of threads is more than 576, this code don't work, and doesn't return errors. I specifically set 400 value to an array elements, to convince myself that the program reaches up to this point. When the program return correct value, please, change it and test again.
Why this code work correctly when I compile it with compute capability 1.x, but it not working correctly when compute capability 2.0?