I am teaching myself some CUDA and I am trying to implement a simple softmax operation on a 2D tensor of size N x M
along the second dimension. I have managed to write a naive solution that works when the column tile size is equal to my column dimension (i.e., when TILE_DIM_X = M
). However, my function breaks when TILE_DIM_X < M
). This makes sense, since the summation is only carried out over the tile and not over the full row.
Now, how can I address this issue to account for higher values of M
? Furthermore, how can I carry out this reduction more efficiently? I've been reading up on parallel reductions and CUB, and I've been trying to implement a solution that makes use of cub::BlockReduce()
but no luck so far.
This is just a tiny part of my kernel, so I am trying to "modularize" operations as much as possible to keep this somewhat readable. As such, I am not interested in already-existing fused kernels for softmax.
#include <cuda_runtime.h>

#include <cmath>
#include <cstdlib>
#include <iostream>
#include <random>
#include <vector>
const int TILE_DIM_Y = 32; // Tile dimension for rows
const int TILE_DIM_X = 32; // Tile dimension for columns
// Row-wise softmax over a row-major N x M tensor, one tile per block.
//
// Expected launch: blockDim = (TILE_DIM_X, TILE_DIM_Y),
// gridDim = (ceil(M / TILE_DIM_X), ceil(N / TILE_DIM_Y)).
//
// PRECONDITION: gridDim.x == 1, i.e. M <= TILE_DIM_X. A block can only sum
// exponentials over its own column tile, so with multiple blocks along x each
// tile would be normalized independently (wrong result). Supporting M larger
// than the tile requires either a per-row accumulator in global memory with a
// second normalization pass, or one block per row looping over the columns.
//
// NOTE(review): no row-max subtraction is performed, so expf() can overflow
// for large inputs; subtract the row max first for a numerically stable softmax.
__global__ void softmaxKernel2D(const float* input, float* output, int N, int M) {
    int row = blockIdx.y * TILE_DIM_Y + threadIdx.y;
    int col = blockIdx.x * TILE_DIM_X + threadIdx.x;

    // Per-row exponential sums for this block's tile. Shared memory is NOT
    // zero-initialized, so it must be cleared before accumulating into it
    // (the original atomicAdd'ed into uninitialized shared memory).
    __shared__ float exp_sums[TILE_DIM_Y];
    if (threadIdx.x == 0) {
        exp_sums[threadIdx.y] = 0.0f;
    }
    __syncthreads();

    // Each in-bounds thread contributes exp(x) to its row's sum; out-of-bounds
    // threads contribute nothing. (The original computed expf() on an
    // unguarded shared-memory read, adding garbage for partial tiles.)
    // The per-thread value is kept in a register for the normalization step,
    // which also removes the need for the staging tile entirely.
    float exp_val = 0.0f;
    if (row < N && col < M) {
        exp_val = expf(input[row * M + col]);
        atomicAdd(&exp_sums[threadIdx.y], exp_val);
    }
    // Barrier is outside any divergent branch: all threads reach it.
    __syncthreads();

    // Normalize: softmax(x_ij) = exp(x_ij) / sum_j exp(x_ij).
    if (row < N && col < M) {
        output[row * M + col] = exp_val / exp_sums[threadIdx.y];
    }
}
// Fill x with pseudo-random floats drawn uniformly from [lo, hi).
// Defaults preserve the original [-1, 1) behavior.
//
// NOTE: std::random_device seeding makes runs non-reproducible; pass a fixed
// seed to the mt19937 instead if deterministic output is needed.
void randomInit(std::vector<float> &x, float lo = -1.0f, float hi = 1.0f) {
    std::random_device rd;
    std::mt19937 gen(rd());
    // Use a float distribution directly instead of the default double
    // distribution narrowed per sample.
    std::uniform_real_distribution<float> unif(lo, hi);
    // size_t index avoids the signed/unsigned comparison with x.size().
    for (size_t i = 0; i < x.size(); i++) {
        x[i] = unif(gen);
    }
}
// Check that every row of a row-major N x M softmax output sums to ~1.
//
// output: pointer to N*M floats, row-major.
// N, M:   row and column counts.
// tol:    absolute tolerance on |row_sum - 1|; default matches the original
//         hard-coded 1e-6.
// Returns true iff all N rows pass; prints the first offending sum otherwise.
//
// Rows are accumulated in double so the check itself does not introduce
// float rounding error that could exceed the tolerance for large M.
bool verifyOutput(const float* output, int N, int M, double tol = 1e-6) {
    for (int i = 0; i < N; i++) {
        double row_sum = 0.0;
        for (int j = 0; j < M; j++) {
            row_sum += output[i * M + j];
        }
        if (std::fabs(row_sum - 1.0) > tol) {
            std::cout << row_sum << std::endl;
            return false;
        }
    }
    return true;
}
// Abort with a diagnostic if a CUDA runtime call failed. Kernel launches do
// not return an error directly, so launch errors are fetched via
// cudaGetLastError() below.
static void checkCuda(cudaError_t err, const char* what) {
    if (err != cudaSuccess) {
        std::cerr << "CUDA error (" << what << "): "
                  << cudaGetErrorString(err) << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

int main() {
    const int N = 2048;
    const int M = 32;
    const size_t bytes = static_cast<size_t>(N) * M * sizeof(float);
    std::vector<float> h_input(N * M);
    std::vector<float> h_output(N * M);

    // Randomly initialize input.
    randomInit(h_input);

    // Allocate device memory (checked: a silent cudaMalloc failure would make
    // every later call fail mysteriously).
    float *d_input = nullptr, *d_output = nullptr;
    checkCuda(cudaMalloc(&d_input, bytes), "cudaMalloc d_input");
    checkCuda(cudaMalloc(&d_output, bytes), "cudaMalloc d_output");

    // Only the input needs to reach the device; d_output is write-only in the
    // kernel, so the original copy of uninitialized h_output was wasted work.
    checkCuda(cudaMemcpy(d_input, h_input.data(), bytes, cudaMemcpyHostToDevice),
              "H2D input");

    // One tile of threads per block, enough blocks to cover the tensor.
    dim3 threads(TILE_DIM_X, TILE_DIM_Y);
    dim3 blocks((M + TILE_DIM_X - 1) / TILE_DIM_X,
                (N + TILE_DIM_Y - 1) / TILE_DIM_Y);

    // The kernel declares its shared memory statically, so no dynamic
    // shared-memory size is passed (the original passed one needlessly).
    softmaxKernel2D<<<blocks, threads>>>(d_input, d_output, N, M);
    checkCuda(cudaGetLastError(), "kernel launch");

    // cudaMemcpy is synchronizing, so the kernel has finished before the copy.
    checkCuda(cudaMemcpy(h_output.data(), d_output, bytes, cudaMemcpyDeviceToHost),
              "D2H output");

    // Check that each row sums to one.
    if (verifyOutput(h_output.data(), N, M)) {
        std::cout << "Success!" << std::endl;
    } else {
        std::cout << "Failure!" << std::endl;
    }

    // Clean up.
    checkCuda(cudaFree(d_input), "cudaFree d_input");
    checkCuda(cudaFree(d_output), "cudaFree d_output");
    return 0;
}