Here is the code:
#include <iostream>
#include <chrono>
#include <cstdlib>        // malloc, free, rand, srand
#include <ctime>          // clock
#include <cuda.h>
#include <cuda_runtime.h>

#define N 300000000

typedef std::chrono::high_resolution_clock Clock;

using namespace std;
// GPU version: each thread writes the element indexed by its block,
// so a launch with 32 blocks only computes out[0] .. out[31]
__global__ void vector_add(float *out, float *a, float *b, int n) {
    int i = blockIdx.x;
    out[i] = a[i] + b[i];
}
// CPU reference: sequential loop over all n elements
void vector_add_cpu(float *out, float *a, float *b, int n) {
    for (int i = 0; i < n; i++) {
        out[i] = a[i] + b[i];
    }
}
int main() {
    srand(clock());

    float *a, *b, *out;          // host buffers
    float *d_a, *d_b, *d_out;    // device buffers

    // Host allocations: N = 300,000,000 floats, ~1.2 GB per array
    a = (float*)malloc(sizeof(float) * N);
    b = (float*)malloc(sizeof(float) * N);
    out = (float*)malloc(sizeof(float) * N);

    // Fill the inputs with random values
    for (int i = 0; i < N; i++) {
        a[i] = float(rand());
        b[i] = float(rand());
    }

    // Time the CPU version
    auto cpu_start = Clock::now();
    vector_add_cpu(out, a, b, N);
    auto cpu_end = Clock::now();
    long long cputime = std::chrono::duration_cast<std::chrono::nanoseconds>(cpu_end - cpu_start).count();
    std::cout << "vector_add_cpu: " << cputime << " ns.\n";

    // Device allocations and host-to-device copies
    cudaMalloc((void**)&d_a, sizeof(float) * N);
    cudaMalloc((void**)&d_b, sizeof(float) * N);
    cudaMalloc((void**)&d_out, sizeof(float) * N);
    cudaMemcpy(d_a, a, sizeof(float) * N, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, sizeof(float) * N, cudaMemcpyHostToDevice);

    // Time the GPU version: launched with 32 blocks, N threads per block
    auto gpu_start1 = Clock::now();
    vector_add<<<32, N>>>(d_out, d_a, d_b, N);
    auto gpu_end1 = Clock::now();
    long long best_gpu = std::chrono::duration_cast<std::chrono::nanoseconds>(gpu_end1 - gpu_start1).count();
    std::cout << "vector_add_gpu_finale_version: " << best_gpu << " ns.\n";
    cout << "Gpu is " << (float)cputime / (float)best_gpu << " times faster\n";

    // Cleanup
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_out);
    free(a);
    free(b);
    free(out);
}
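In case the timing method matters: as I understand it, a kernel launch returns to the host before the kernel finishes, so host-side clocks around the <<<...>>> call may only measure the launch itself. This is a minimal sketch of event-based timing, assuming the standard CUDA event API (the start/stop/ms names are mine, not from the program above):

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start);                     // enqueue 'start' before the kernel
vector_add<<<32, N>>>(d_out, d_a, d_b, N);
cudaEventRecord(stop);                      // enqueue 'stop' after the kernel
cudaEventSynchronize(stop);                 // block until 'stop' has been reached

float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);     // elapsed time in milliseconds
std::cout << "kernel time: " << ms << " ms\n";

cudaEventDestroy(start);
cudaEventDestroy(stop);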
I read everywhere on the internet that 1024 is the maximum number of threads per block, and that I should get an error if I compile code that assigns more than that. However, my program runs fine, and the GPU calculation is much faster with this huge number of threads. Compiled with nvcc.
One of my sources: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications
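From what I understand, an oversized block dimension would not be a compile-time error but should be reported at run time through the runtime API. A minimal sketch of the checks I would expect to surface it, assuming cudaGetLastError/cudaDeviceSynchronize behave as documented (the err/sync_err names are mine, not from the program above):

vector_add<<<32, N>>>(d_out, d_a, d_b, N);

// Errors in the launch configuration itself (e.g. too many threads per block)
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
    std::cout << "launch error: " << cudaGetErrorString(err) << "\n";

// Errors that occur while the kernel runs; also waits for it to finish
cudaError_t sync_err = cudaDeviceSynchronize();
if (sync_err != cudaSuccess)
    std::cout << "kernel error: " << cudaGetErrorString(sync_err) << "\n";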