Posts related to this one containing useful answers on the same topic are available at
Reduce matrix rows with CUDA
and
Reduce matrix columns with CUDA.
Here I just want to point out how the approach of reducing the columns of a matrix through the multiplication of a row by the same matrix can be generalized to perform the linear combination of an ensemble of vectors. In other words, if one wants to calculate the following vector basis expansion

where f(x_m)
are samples of the function f(x)
, while the \psi_n
's are basis functions and the c_n
's are expansion coefficients, then the \psi_n
's can be organized in a N x M
matrix and the coefficients c_n
's in a row vector and then compute the vector x matrix multiplication using cublas<t>gemv
.
Below, I'm reporting a fully worked example:
#include <cublas_v2.h>
#include <thrust/device_vector.h>
#include <thrust/random.h>
#include <stdio.h>
#include <iostream>
#include "Utilities.cuh"
/********************************************/
/* LINEAR COMBINATION FUNCTION - FLOAT CASE */
/********************************************/
void linearCombination(const float * __restrict__ d_coeff, const float * __restrict__ d_basis_functions_real, float * __restrict__ d_linear_combination,
const int N_basis_functions, const int N_sampling_points, const cublasHandle_t handle) {
float alpha = 1.f;
float beta = 0.f;
cublasSafeCall(cublasSgemv(handle, CUBLAS_OP_N, N_sampling_points, N_basis_functions, &alpha, d_basis_functions_real, N_sampling_points,
d_coeff, 1, &beta, d_linear_combination, 1));
}
void linearCombination(const double * __restrict__ d_coeff, const double * __restrict__ d_basis_functions_real, double * __restrict__ d_linear_combination,
const int N_basis_functions, const int N_sampling_points, const cublasHandle_t handle) {
double alpha = 1.;
double beta = 0.;
cublasSafeCall(cublasDgemv(handle, CUBLAS_OP_N, N_sampling_points, N_basis_functions, &alpha, d_basis_functions_real, N_sampling_points,
d_coeff, 1, &beta, d_linear_combination, 1));
}
/********/
/* MAIN */
/********/
int main()
{
const int N_basis_functions = 5; // --- Number of rows -> Number of basis functions
const int N_sampling_points = 8; // --- Number of columns -> Number of sampling points of the basis functions
// --- Random uniform integer distribution between 10 and 99
thrust::default_random_engine rng;
thrust::uniform_int_distribution<int> dist(10, 99);
// --- Matrix allocation and initialization
thrust::device_vector<float> d_basis_functions_real(N_basis_functions * N_sampling_points);
for (size_t i = 0; i < d_basis_functions_real.size(); i++) d_basis_functions_real[i] = (float)dist(rng);
thrust::device_vector<double> d_basis_functions_double_real(N_basis_functions * N_sampling_points);
for (size_t i = 0; i < d_basis_functions_double_real.size(); i++) d_basis_functions_double_real[i] = (double)dist(rng);
/************************************/
/* COMPUTING THE LINEAR COMBINATION */
/************************************/
cublasHandle_t handle;
cublasSafeCall(cublasCreate(&handle));
thrust::device_vector<float> d_linear_combination_real(N_sampling_points);
thrust::device_vector<double> d_linear_combination_double_real(N_sampling_points);
thrust::device_vector<float> d_coeff_real(N_basis_functions, 1.f);
thrust::device_vector<double> d_coeff_double_real(N_basis_functions, 1.);
linearCombination(thrust::raw_pointer_cast(d_coeff_real.data()), thrust::raw_pointer_cast(d_basis_functions_real.data()), thrust::raw_pointer_cast(d_linear_combination_real.data()),
N_basis_functions, N_sampling_points, handle);
linearCombination(thrust::raw_pointer_cast(d_coeff_double_real.data()), thrust::raw_pointer_cast(d_basis_functions_double_real.data()), thrust::raw_pointer_cast(d_linear_combination_double_real.data()),
N_basis_functions, N_sampling_points, handle);
/*************************/
/* DISPLAYING THE RESULT */
/*************************/
std::cout << "Real case \n\n";
for(int j = 0; j < N_sampling_points; j++) {
std::cout << "Column " << j << " - [ ";
for(int i = 0; i < N_basis_functions; i++)
std::cout << d_basis_functions_real[i * N_sampling_points + j] << " ";
std::cout << "] = " << d_linear_combination_real[j] << "\n";
}
std::cout << "\n\nDouble real case \n\n";
for(int j = 0; j < N_sampling_points; j++) {
std::cout << "Column " << j << " - [ ";
for(int i = 0; i < N_basis_functions; i++)
std::cout << d_basis_functions_double_real[i * N_sampling_points + j] << " ";
std::cout << "] = " << d_linear_combination_double_real[j] << "\n";
}
return 0;
}