Here is the code:
#include <iostream>
#include <chrono>
#include <cstdlib>        // malloc, free, rand, srand
#include <ctime>          // clock
#include <cuda.h>
#include <cuda_runtime.h>

#define N 300000000

typedef std::chrono::high_resolution_clock Clock;

using namespace std;
// GPU version: each thread writes the element indexed by its block,
// so a launch with 32 blocks only computes out[0] .. out[31]
__global__ void vector_add(float *out, float *a, float *b, int n) {
    int i = blockIdx.x;
    out[i] = a[i] + b[i];
}
// CPU reference: sequential loop over all n elements
void vector_add_cpu(float *out, float *a, float *b, int n) {
    for (int i = 0; i < n; i++) {
        out[i] = a[i] + b[i];
    }
}
int main() {
    srand(clock());

    float *a, *b, *out;          // host buffers
    float *d_a, *d_b, *d_out;    // device buffers

    // Host allocations: N = 300,000,000 floats, ~1.2 GB per array
    a = (float*)malloc(sizeof(float) * N);
    b = (float*)malloc(sizeof(float) * N);
    out = (float*)malloc(sizeof(float) * N);

    // Fill the inputs with random values
    for (int i = 0; i < N; i++) {
        a[i] = float(rand());
        b[i] = float(rand());
    }

    // Time the CPU version
    auto cpu_start = Clock::now();
    vector_add_cpu(out, a, b, N);
    auto cpu_end = Clock::now();
    long long cputime = std::chrono::duration_cast<std::chrono::nanoseconds>(cpu_end - cpu_start).count();
    std::cout << "vector_add_cpu: " << cputime << " ns.\n";

    // Device allocations and host-to-device copies
    cudaMalloc((void**)&d_a, sizeof(float) * N);
    cudaMalloc((void**)&d_b, sizeof(float) * N);
    cudaMalloc((void**)&d_out, sizeof(float) * N);
    cudaMemcpy(d_a, a, sizeof(float) * N, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, sizeof(float) * N, cudaMemcpyHostToDevice);

    // Time the GPU version: launched with 32 blocks, N threads per block
    auto gpu_start1 = Clock::now();
    vector_add<<<32, N>>>(d_out, d_a, d_b, N);
    auto gpu_end1 = Clock::now();
    long long best_gpu = std::chrono::duration_cast<std::chrono::nanoseconds>(gpu_end1 - gpu_start1).count();
    std::cout << "vector_add_gpu_finale_version: " << best_gpu << " ns.\n";
    cout << "Gpu is " << (float)cputime / (float)best_gpu << " times faster\n";

    // Cleanup
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_out);
    free(a);
    free(b);
    free(out);
}
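In case the timing method matters: as I understand it, a kernel launch returns to the host before the kernel finishes, so host-side clocks around the <<<...>>> call may only measure the launch itself. This is a minimal sketch of event-based timing, assuming the standard CUDA event API (the start/stop/ms names are mine, not from the program above):

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start);                     // enqueue 'start' before the kernel
vector_add<<<32, N>>>(d_out, d_a, d_b, N);
cudaEventRecord(stop);                      // enqueue 'stop' after the kernel
cudaEventSynchronize(stop);                 // block until 'stop' has been reached

float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);     // elapsed time in milliseconds
std::cout << "kernel time: " << ms << " ms\n";

cudaEventDestroy(start);
cudaEventDestroy(stop);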
I read everywhere on the internet that 1024 is the maximum number of threads per block, and that I should get an error if I compile code that assigns more than that. However, my program runs fine, and the GPU calculation is much faster with this huge number of threads. Compiled with nvcc.
One of my sources: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications
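From what I understand, an oversized block dimension would not be a compile-time error but should be reported at run time through the runtime API. A minimal sketch of the checks I would expect to surface it, assuming cudaGetLastError/cudaDeviceSynchronize behave as documented (the err/sync_err names are mine, not from the program above):

vector_add<<<32, N>>>(d_out, d_a, d_b, N);

// Errors in the launch configuration itself (e.g. too many threads per block)
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
    std::cout << "launch error: " << cudaGetErrorString(err) << "\n";

// Errors that occur while the kernel runs; also waits for it to finish
cudaError_t sync_err = cudaDeviceSynchronize();
if (sync_err != cudaSuccess)
    std::cout << "kernel error: " << cudaGetErrorString(sync_err) << "\n";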