I am new to CUDA programming. I am performing a simple power iteration method to get the dominant eigen vector, but whenever I increase my matrix size to a value greater than the number of threads per block, I get a wrong answers. I am unable to understand why this is happening.
#include <stdio.h>
#include <cuda_runtime.h>
#include <math.h>
#define N 100
#define THREADS_PER_BLOCK 256
__global__ void power_iteration(float *d_A, float *d_x , float *d_temp )
{
int idx = threadIdx.x + blockDim.x * blockIdx.x;
int iterations = 0;
float old = 0.0f;
float result = 0.0f;
float error =0;
float tolerance = 1e-6;
//for (int i = 0; i < iterations; ++i)
while(error>tolerance || iterations ==0)
{
iterations++;
printf("%d\n" ,iterations);
// Matrix-vector multiplication
if (idx < N)
{
//printf("%d\n" , idx);
float sum = 0.0f;
#pragma unroll
for (int j = 0; j < N; j++)
{
sum += d_A[idx * N + j] * d_x[j];
}
d_x[idx] = sum;
__syncthreads();
}
// normalize
if(idx<N)
d_x[idx] = d_x[idx] / d_x[N-1];
__syncthreads();
//calculate eigenvalue
if (idx == 0)
{
float numerator = 0.0f;
float denominator = 0.0f;
#pragma unroll
for (int i = 0; i < N; i++)
{
numerator += d_temp[i] * d_x[i];
denominator += d_x[i] * d_x[i];
}
result = numerator / denominator;
}
error = (float) abs(result - old);
old = result;
if(idx<N)
d_temp[idx] = d_x[idx];
}
}
int main() {
//declare host and device variable
float *h_A , *d_A ;
// allocate memory
cudaMalloc((void **)&d_A, N * N * sizeof(float));
h_A = (float *)malloc(N*N*sizeof(float));
//intialize host matrix
for(int i = 0 ; i<N ; i++)
{
for(int j =0 ; j< N ; j++)
//scanf("%f",&h_A[i*N+j]);
h_A[i*N+j] = (float) i/N;
}
//declare host and device variable
float *h_x , *d_x , *d_temp;
// allocate memory
cudaMalloc((void **)&d_x, N * sizeof(float));
cudaMalloc((void **)&d_temp, N * sizeof(float));
h_x = (float *)malloc(N*sizeof(float));
//storing intial guess
for (int i = 0; i < N; ++i) {
h_x[i] = 1.0;
}
// copy to device memory
cudaMemcpy(d_A, h_A, N*N*sizeof(float) , cudaMemcpyHostToDevice);
cudaMemcpy(d_x, h_x, N*sizeof(float) , cudaMemcpyHostToDevice);
cudaMemcpy(d_temp, h_x, N*sizeof(float) , cudaMemcpyHostToDevice);
// Launch the kernel
int numBlocks = (N+THREADS_PER_BLOCK-1) / (THREADS_PER_BLOCK);
int numThreads=(THREADS_PER_BLOCK);
power_iteration<<<numBlocks, numThreads>>>(d_A,d_x,d_temp);
// Synchronize and handle the result
cudaDeviceSynchronize();
cudaMemcpy(h_x, d_x, N*sizeof(float), cudaMemcpyDeviceToHost);
for(int i = 0 ; i<N ; i++)
{
printf("%f ", h_x[i]);
}
printf("\n");
cudaFree(d_x);
cudaFree(d_temp);
free(h_x);
cudaFree(d_A);
free(h_A);
return 0;
}
Here I have performed matrix-vector multiplication and am running through the loop until the error is less than the tolerance. I get good results for N < THREADS_PER_BLOCK
but when I exceed it, I get wrong results.