
I'm using CUDA with cuBLAS to perform matrix operations.

I need to sum the rows (or columns) of a matrix. Currently I'm doing it by multiplying the matrix with a ones vector, but this doesn't seem very efficient.

Is there a better way? I couldn't find anything in cuBLAS.

Ran
  • http://stackoverflow.com/questions/3312228/cuda-add-rows-of-a-matrix might help. However, if you only need it "sometimes", i.e. it doesn't take up a significant percentage of your run time, I'd say that your method is perfectly acceptable, even though it incurs the overhead of all those extra multiplications... – us2012 Jan 10 '13 at 15:13
  • But anyways, this is quite an easy kernel to implement yourself. Have a look at the example from this CalState presentation on CUDA: http://www.calstatela.edu/faculty/rpamula/cs370/GPUProgramming.pptx – us2012 Jan 10 '13 at 15:16
  • "sometimes" wasn't a good word. I do it as a part of training a neural network so it runs iteratively many times. The example code in the ppts doesn't work.. (the parameter is a pointer and it tries to access it like a 2D array). – Ran Jan 10 '13 at 15:43
  • It's not that hard to modify though, right? It depends on the layout of your matrix in memory anyway: whether you have row-major or column-major storage and whether you use padding. – us2012 Jan 10 '13 at 15:58
  • Compared to the matrix-matrix multiplication operation, the row sum takes only a tiny portion of the time of training neural networks. You can ignore it, I think. – kangshiyin Jan 10 '13 at 18:22

2 Answers


Actually, multiplying the matrix by a ones vector using cublas_gemv() is a very efficient way, unless you are considering writing your own kernel by hand.

You can easily profile the memory bandwidth achieved by cublas_gemv(). It is very close to that of simply reading the whole matrix data once, which can be seen as the theoretical peak performance of matrix row/column summation.

The extra "×1.0" multiplications won't lead to much performance reduction, because:

  1. cublas_gemv() is basically a memory-bandwidth-bound operation; extra arithmetic instructions won't be the bottleneck;
  2. the FMA instruction fuses the multiply and the add, further reducing the required instruction throughput;
  3. the ones vector is usually much smaller than the matrix and is easily cached by the GPU, so it adds little extra memory traffic.

cublas_gemv() also helps you deal with the matrix layout problem. It works with both row- and column-major storage and with arbitrary padding.
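For reference, here is a minimal sketch of this approach (my own illustration, not code from the original answer): it assumes a column-major M x N matrix with leading dimension M and sums its rows and columns with cublasSgemv() and a ones vector.

#include <algorithm>

#include <cublas_v2.h>

#include <thrust/device_vector.h>

// Sketch only: row and column sums of a column-major M x N matrix via gemv.
void rowAndColumnSums(cublasHandle_t handle,
                      const thrust::device_vector<float> &d_A,  // M x N, column-major, lda = M
                      thrust::device_vector<float> &d_rowSums,  // length M
                      thrust::device_vector<float> &d_colSums,  // length N
                      int M, int N) {

    // --- Ones vector long enough to serve both calls
    thrust::device_vector<float> d_ones(std::max(M, N), 1.f);
    const float alpha = 1.f, beta = 0.f;

    // --- Row sums: y = A * ones_N (one entry per row)
    cublasSgemv(handle, CUBLAS_OP_N, M, N, &alpha,
                thrust::raw_pointer_cast(d_A.data()), M,
                thrust::raw_pointer_cast(d_ones.data()), 1, &beta,
                thrust::raw_pointer_cast(d_rowSums.data()), 1);

    // --- Column sums: y = A^T * ones_M (one entry per column)
    cublasSgemv(handle, CUBLAS_OP_T, M, N, &alpha,
                thrust::raw_pointer_cast(d_A.data()), M,
                thrust::raw_pointer_cast(d_ones.data()), 1, &beta,
                thrust::raw_pointer_cast(d_colSums.data()), 1);
}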

I asked a similar question about this myself. My experiment shows that cublas_gemv() is better than a segmented reduction using thrust::reduce_by_key, which is another approach to matrix row summation.
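For completeness, here is a rough sketch of that reduce_by_key alternative (again my own illustration, assuming a row-major Nrows x Ncols matrix): each element's linear index is mapped to the index of its row, which serves as the segment key for the reduction.

#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/discard_iterator.h>

// Maps a linear index into a row-major matrix to its row index (the segment key).
struct linear_index_to_row_index {
    int Ncols;
    __host__ __device__ linear_index_to_row_index(int Ncols) : Ncols(Ncols) {}
    __host__ __device__ int operator()(int i) const { return i / Ncols; }
};

// Sketch only: row sums of a row-major Nrows x Ncols matrix via segmented reduction.
void rowSumsByKey(const thrust::device_vector<float> &d_matrix,  // Nrows * Ncols elements
                  thrust::device_vector<float> &d_row_sums,      // length Nrows
                  int Nrows, int Ncols) {
    thrust::reduce_by_key(
        thrust::make_transform_iterator(thrust::counting_iterator<int>(0),
                                        linear_index_to_row_index(Ncols)),
        thrust::make_transform_iterator(thrust::counting_iterator<int>(Nrows * Ncols),
                                        linear_index_to_row_index(Ncols)),
        d_matrix.begin(),
        thrust::make_discard_iterator(),   // the reduced keys are not needed
        d_row_sums.begin());
}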

kangshiyin
  • Makes sense. I was using gemm for that, for some reason. :) thanks. – Ran Jan 10 '13 at 19:31
  • @Ran, my test shows `cublas_gemv` is 2x faster than `cublas_gemm` for this operation. the size of the test matrix is 3000 x 3000. – kangshiyin Jan 13 '13 at 08:53

Posts related to this one, containing useful answers on the same topic, are available at

Reduce matrix rows with CUDA

and

Reduce matrix columns with CUDA.

Here I just want to point out how the approach of reducing the columns of a matrix by multiplying a row vector by that matrix can be generalized to perform the linear combination of an ensemble of vectors. In other words, suppose one wants to calculate the following vector basis expansion

f(x_m) = \sum_n c_n \psi_n(x_m)

where the f(x_m) are samples of the function f(x), the \psi_n's are basis functions, and the c_n's are expansion coefficients. The \psi_n's can then be organized in an N x M matrix, the coefficients c_n's in a row vector, and the vector x matrix multiplication computed with cublas<t>gemv.

Below, I'm reporting a fully worked example:

#include <cublas_v2.h>

#include <thrust/device_vector.h>
#include <thrust/random.h>

#include <stdio.h>
#include <iostream>

#include "Utilities.cuh"

/********************************************/
/* LINEAR COMBINATION FUNCTION - FLOAT CASE */
/********************************************/
void linearCombination(const float * __restrict__ d_coeff, const float * __restrict__ d_basis_functions_real, float * __restrict__ d_linear_combination,
                       const int N_basis_functions, const int N_sampling_points, const cublasHandle_t handle) {

    float alpha = 1.f;
    float beta  = 0.f;
    cublasSafeCall(cublasSgemv(handle, CUBLAS_OP_N, N_sampling_points, N_basis_functions, &alpha, d_basis_functions_real, N_sampling_points, 
                               d_coeff, 1, &beta, d_linear_combination, 1));

}

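/*********************************************/
/* LINEAR COMBINATION FUNCTION - DOUBLE CASE */
/*********************************************/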
void linearCombination(const double * __restrict__ d_coeff, const double * __restrict__ d_basis_functions_real, double * __restrict__ d_linear_combination,
                       const int N_basis_functions, const int N_sampling_points, const cublasHandle_t handle) {

    double alpha = 1.;
    double beta  = 0.;
    cublasSafeCall(cublasDgemv(handle, CUBLAS_OP_N, N_sampling_points, N_basis_functions, &alpha, d_basis_functions_real, N_sampling_points, 
                               d_coeff, 1, &beta, d_linear_combination, 1));

}

/********/
/* MAIN */
/********/
int main()
{
    const int N_basis_functions = 5;     // --- Number of rows                  -> Number of basis functions
    const int N_sampling_points = 8;     // --- Number of columns               -> Number of sampling points of the basis functions

    // --- Random uniform integer distribution between 10 and 99
    thrust::default_random_engine rng;
    thrust::uniform_int_distribution<int> dist(10, 99);

    // --- Matrix allocation and initialization
    thrust::device_vector<float> d_basis_functions_real(N_basis_functions * N_sampling_points);
    for (size_t i = 0; i < d_basis_functions_real.size(); i++) d_basis_functions_real[i] = (float)dist(rng);

    thrust::device_vector<double> d_basis_functions_double_real(N_basis_functions * N_sampling_points);
    for (size_t i = 0; i < d_basis_functions_double_real.size(); i++) d_basis_functions_double_real[i] = (double)dist(rng);

    /************************************/
    /* COMPUTING THE LINEAR COMBINATION */
    /************************************/
    cublasHandle_t handle;
    cublasSafeCall(cublasCreate(&handle));

    thrust::device_vector<float>  d_linear_combination_real(N_sampling_points);
    thrust::device_vector<double> d_linear_combination_double_real(N_sampling_points);
    thrust::device_vector<float>  d_coeff_real(N_basis_functions, 1.f);
    thrust::device_vector<double> d_coeff_double_real(N_basis_functions, 1.);

    linearCombination(thrust::raw_pointer_cast(d_coeff_real.data()), thrust::raw_pointer_cast(d_basis_functions_real.data()), thrust::raw_pointer_cast(d_linear_combination_real.data()),
                      N_basis_functions, N_sampling_points, handle);
    linearCombination(thrust::raw_pointer_cast(d_coeff_double_real.data()), thrust::raw_pointer_cast(d_basis_functions_double_real.data()), thrust::raw_pointer_cast(d_linear_combination_double_real.data()),
                      N_basis_functions, N_sampling_points, handle);

    /*************************/
    /* DISPLAYING THE RESULT */
    /*************************/
    std::cout << "Real case \n\n";
    for(int j = 0; j < N_sampling_points; j++) {
        std::cout << "Column " << j << " - [ ";
        for(int i = 0; i < N_basis_functions; i++)
            std::cout << d_basis_functions_real[i * N_sampling_points + j] << " ";
        std::cout << "] = " << d_linear_combination_real[j] << "\n";
    }

    std::cout << "\n\nDouble real case \n\n";
    for(int j = 0; j < N_sampling_points; j++) {
        std::cout << "Column " << j << " - [ ";
        for(int i = 0; i < N_basis_functions; i++)
            std::cout << d_basis_functions_double_real[i * N_sampling_points + j] << " ";
        std::cout << "] = " << d_linear_combination_double_real[j] << "\n";
    }

    return 0;
}
Vitality