Matrix-vector multiplication in CUDA: benchmarking & performance

Question

I'm updating my question with some new benchmarking results (I also reformulated the question to be more specific and I updated the code)...

I implemented a kernel for matrix-vector multiplication in CUDA C following the CUDA C Programming Guide using shared memory. Let me first present some benchmarking results which I did on a Jetson TK1 (GPU: Tegra K1, compute capability 3.2) and a comparison with cuBLAS:

Here I guess cuBLAS does some magic since it seems that its execution is not affected by the number of columns of A, which, in turn, implies that there is some sort of parallelisation along the columns of A.

Now, here is the source code of my kernel and a host function to call it (file: mv.cuh):

#include <cuda_runtime.h>

#define     BLOCK_SIZE      16

/* Set to __restric__ */
#define     RESTRICT

/**
 * Performs matrix-vector multiplication on the device.
 *
 * @param   dA              Address of matrix `A` on the device
 * @param   dx              Address of vector `x` on the device
 * @param   dev_ptr_y       Address of result y = A*x
 * @param   nRows           Number of rows of `A`
 * @param   nx              Size of `x` (number of columns of `A`)
 *
 * @tparam  T               Data type
 *
 */
template<typename T>
__global__ void matvec_kernel(
        const T * RESTRICT dA,
        const T * RESTRICT dx,
        T * RESTRICT dy,
        const unsigned int nRows,
        const unsigned int nx);


/**
 * Host-side wrapper for #matvec_kernel.
 *
 * @param   dA              Address of matrix `A` on the device
 * @param   dx              Address of vector `x` on the device
 * @param   dev_ptr_y       Address of result y = A*x
 * @param   nRows           Number of rows of `A`
 * @param   nx              Size of `x` (number of columns of `A`)
 * @param   elapsed_time    Time for the kernel to complete the execution in `ms`.
 *                          If NULL is passed to this argument, the elapsed time
 *                          will not be computed.
 *
 * @tparam  T               Data type for `A` and `x`
 */
template<typename T>
__host__ void matvec(
        const T * RESTRICT dA,
        const T * RESTRICT dx,
        T * RESTRICT dy,
        const unsigned int nRows,
        const unsigned int nx);



/* -------------------------------------------------------------------------------------------- */
/* -------------------------------------------------------------------------------------------- */
/* -------------------------------------------------------------------------------------------- */
/* -------------------------------------------------------------------------------------------- */

template<typename T>
__global__ void matvec_kernel(const T * RESTRICT  dA, const T * RESTRICT  dx,
        T * RESTRICT dy,
        const unsigned int nRows, const unsigned int nx)
{

        unsigned int bid = blockIdx.x;
        unsigned int row = threadIdx.x;
        const unsigned int block_size = blockDim.x;
        const unsigned int num_hor_blocks = ((nx + block_size - 1)/ block_size);
        unsigned int n_star;
        unsigned int idx_x;
        unsigned int idx_Asub;
        unsigned int idx_y;
        const T * Asub;
        const T * xsub;

        /* Only `x` is copied to shared memory */
        __shared__ T x_shared[BLOCK_SIZE];

        idx_y = bid * block_size;

        T * y_sub = dy + idx_y;

        T y_val = 0.0;

        #pragma unroll
        for (unsigned int m = 0; m < num_hor_blocks; ++m)
        {
            idx_Asub = block_size * (bid + m * nRows);
            idx_x = m * block_size;

            Asub = dA + idx_Asub;
            xsub = dx + idx_x;

            if (idx_x + row <  nx) {
                x_shared[row] = xsub[row];
            }

            __syncthreads();


        /* If the tiling is exact */
        if ( (nRows % block_size == 0 && nx % block_size == 0 ) ||
                (m != block_size - 1 || bid != gridDim.x - 1)) {
            y_val += Asub[row] * x_shared[0];
            y_val += Asub[row + nRows] * x_shared[1];
            y_val += Asub[row + 2 * nRows] * x_shared[2];
            y_val += Asub[row + 3 * nRows] * x_shared[3];
            y_val += Asub[row + 4 * nRows] * x_shared[4];
            y_val += Asub[row + 5 * nRows] * x_shared[5];
            y_val += Asub[row + 6 * nRows] * x_shared[6];
            y_val += Asub[row + 7 * nRows] * x_shared[7];
            y_val += Asub[row + 8 * nRows] * x_shared[8];
            y_val += Asub[row + 9 * nRows] * x_shared[9];
            y_val += Asub[row + 10 * nRows] * x_shared[10];
            y_val += Asub[row + 11 * nRows] * x_shared[11];
            y_val += Asub[row + 12 * nRows] * x_shared[12];
            y_val += Asub[row + 13 * nRows] * x_shared[13];
            y_val += Asub[row + 14 * nRows] * x_shared[14];
            y_val += Asub[row + 15 * nRows] * x_shared[15];
        } else {
            n_star = min(BLOCK_SIZE, nx - idx_x);
            #pragma unroll
            for (unsigned int e = 0; e < n_star; ++e) {
                y_val += Asub[row + e * nRows] * x_shared[e];
            }
        }
        __syncthreads();
        }

        if (row + idx_y < nRows)
            y_sub[row] = y_val;

}



template<typename T>
__host__ void matvec(
        const T * RESTRICT  dA,
        const T * RESTRICT  dx,
        T * RESTRICT dy,
        const unsigned int nRows,
        const unsigned int nx)
{
    dim3 dim_grid( (nRows + BLOCK_SIZE -1)/ BLOCK_SIZE );
    dim3 dim_block(BLOCK_SIZE);
    matvec_kernel<T> <<<dim_grid, dim_block>>>(dA, dx, dy, nRows, nx);
}

I'm using this to time my execution (file: cuda_timer.cuh):

#include <cuda_runtime.h>
#include "error_handles.cuh"

static cudaEvent_t start;
static cudaEvent_t stop;
static short timer_running = 0;
static short tic_called = 0;

/**
 * Sets up the timer.
 *
 * Must be called before any invocation to
 * tic() or toc(), preferrably at the beginning of your
 * application.
 */
void start_tictoc();

/**
 * Starts the timer.
 *
 * Use `toc()` to get the elapsed time; `tic()` must
 * be called before a `toc()`.
 */
void tic();

/**
 * Returns the elapsed time between its invocation
 * and a previous invocation of `toc()`. Returns `-1`
 * and prints a warning message if `toc()` was not
 * previously called. Returns `-2` and prints and error
 * message if `start_tictoc()` has not been called.
 *
 * @return Elapsed time between `tic()` and `toc()` in milliseconds
 * with a resolution of `0.5` microseconds.
 */
float toc();

/**
 * This function should be called when the
 * time will not be being used any more. It destroys
 * the events used to time CUDA kernels. If the timer
 * is not running, this function does nothing and
 * prints a warning message.
 */
void stop_tictoc();


void start_tictoc() {
    _CUDA(cudaEventCreate(&start));
    _CUDA(cudaEventCreate(&stop));
    timer_running = 1;
}

void tic() {
    if (timer_running) {
        _CUDA(cudaEventRecord(start, 0));
        tic_called = 1;
    } else {
        printf("WARNING: tic() called without a timer running!\n");
    }
}

float toc() {
    float elapsed_time;
    if (tic_called == 0) {
        printf("WARNING: toc() called without a previous tic()!\n");
        return -1;
    }
    if (timer_running == 1) {
        // _CUDA(cudaDeviceSynchronize()); // Removed! (See discussion below)
        _CUDA(cudaEventRecord(stop, 0));
        _CUDA(cudaEventSynchronize(stop));
        _CUDA(cudaEventElapsedTime(&elapsed_time, start, stop));
        tic_called = 0;
        return elapsed_time;
    } else {
        printf("WARNING: toc() called without a timer running!\n");
        return -2;
    }

}

void stop_tictoc()
{
    if (timer_running == 1){
        _CUDA(cudaEventDestroy(start));
        _CUDA(cudaEventDestroy(stop));
        timer_running = 0;
    } else{
        printf("WARNING: stop_tictoc() called without a timer running!\n");
    }
}

and my main file (main.cu) is the following:

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <assert.h>
#include "cublas_v2.h"
#include <math.h>
#include <curand.h>
#include <stdbool.h>

#include "mv.cuh"
#include "cuda_timer.cuh"
#include "error_handles.cuh"

typedef float real_t;

#define _CUDA(x) do { if((x)!=cudaSuccess) { \
    printf("Error at %s:%d\n",__FILE__,__LINE__);\
    exit(EXIT_FAILURE);}} while(0)

#define _CUBLAS(x) do { if((x) != CUBLAS_STATUS_SUCCESS) { \
    printf("Error at %s:%d\n",__FILE__,__LINE__);\
    exit(EXIT_FAILURE);}} while(0)

#define _CURAND(x) do { if((x) != CURAND_STATUS_SUCCESS) { \
    printf("Error at %s:%d\n",__FILE__,__LINE__);\
    exit(EXIT_FAILURE);}} while(0)

#define TEST_COLUMNS        1
#define TEST_ROWS           0

/**
 * If `TEST_WRT_` is set to `TEST_COLUMNS`, then a benchmark
 * will be performed with respect to columns (with a fixed
 * number of rows). If it is set to `TEST_ROWS`, then a benchmark will
 * run with respect to rows (fixed number of columns).
 */
#define TEST_WRT_ TEST_ROWS

#define CONSTANT_COLS 300
#define CONSTANT_ROWS 256

/**
 * In order to estimate the execution time, every
 * kernel is run `RUNS` times and the average is taken.
 */
#define RUNS 50

void compare_results(real_t *dev_y_cublas, real_t * dev_y,unsigned int nrows)
{
    real_t * hst_y_cublas;
    real_t * hst_y;
    const size_t s = nrows * sizeof(real_t);

    hst_y_cublas = (real_t*) malloc(s);
    hst_y = (real_t*) malloc(s);
    _CUDA(cudaMemcpy(hst_y, dev_y, s, cudaMemcpyDeviceToHost));
    _CUDA(cudaMemcpy(hst_y_cublas, dev_y_cublas, s, cudaMemcpyDeviceToHost));
    for (unsigned int i = 0; i < nrows; ++i) {
        if (fabsf(hst_y_cublas[i] - hst_y[i]) > 0.001) {
            printf("ERROR ------ %f\n", fabsf(hst_y_cublas[i] - hst_y[i]));
            exit(EXIT_FAILURE);
        }
    }
    if (hst_y_cublas) free(hst_y_cublas);
    if (hst_y) free(hst_y);
}


void do_benchmark() {
    curandGenerator_t gen;
    real_t *dev_rand_data = NULL; // Random data will be allocated here!
    real_t *dev_y = NULL;
    real_t *dev_y_cublas = NULL;
    real_t t;
    real_t t_cublas;
    const size_t n_rows_max = 1500;
    const size_t n_cols_max = 300;
    const size_t ntot = n_cols_max * (1 + n_rows_max);
    const size_t size_tot = sizeof(real_t) * ntot;

    float alpha = 1.0, beta = 0.0; // beta was initially set to 1.0 by mistake
    cublasHandle_t handle;
    _CUBLAS(cublasCreate(&handle));

    start_tictoc();

    _CUDA(cudaMalloc((void** )&dev_rand_data, size_tot));
    _CUDA(cudaMalloc((void** )&dev_y, n_rows_max * sizeof(real_t)));
    _CUDA(cudaMalloc((void** )&dev_y_cublas, n_rows_max * sizeof(real_t)));

    _CURAND(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT));
    _CURAND(curandSetPseudoRandomGeneratorSeed(gen, 1234ULL));
    tic();
    _CURAND(curandGenerateUniform(gen, dev_rand_data, ntot));
    t = toc();
    printf("RNG in %f ms\n", t);

    _CURAND(curandDestroyGenerator(gen));

    size_t ncols = CONSTANT_COLS;
    size_t nrows = CONSTANT_ROWS;
    size_t runs = RUNS;

    cudaMemset(dev_y_cublas, 0, n_rows_max * sizeof(real_t));
    matvec<real_t>(dev_rand_data + ncols, dev_rand_data, dev_y, nrows, ncols);
    _CUBLAS(cublasSgemv(handle, CUBLAS_OP_N, nrows, ncols, &alpha, dev_rand_data + ncols,
                                nrows, dev_rand_data, 1, &beta, dev_y_cublas, 1));
    /* Compare results */
    compare_results(dev_y_cublas,dev_y, nrows);


    FILE * pFile;
    char filename[50];
#if (TEST_WRT_ == TEST_COLUMNS)
    sprintf(filename, "times_rows%lu_cols.txt", nrows);
#else
    sprintf(filename, "times_cols%lu_rows.txt", ncols);
#endif

    printf("Logging to : '%s'\n", filename);
    pFile = fopen(filename, "w");
    if (pFile == NULL) {
        perror("Error opening file.");
        exit(79);
    }


#if (TEST_WRT_ == TEST_COLUMNS)
    fprintf(pFile, "0, %lu, 0, 0\n", nrows);
    for (ncols = 32; ncols < n_cols_max; ncols += 32) {
#else
    fprintf(pFile, "1, %lu, 0, 0\n", ncols);
    for (nrows = 32; nrows < n_rows_max; nrows += 32) {
#endif
        tic();
        for (short i = 0; i < runs; i++) {
            matvec<real_t>(dev_rand_data + ncols, dev_rand_data, dev_y, nrows,
                    ncols);
        }
        t = toc() / runs;
        tic();
        for (short i = 0; i < runs; i++) {
            _CUBLAS(cublasSgemv(handle, CUBLAS_OP_N, nrows, ncols, &alpha, dev_rand_data + ncols,
                            nrows, dev_rand_data, 1, &beta, dev_y_cublas, 1));
        }
        t_cublas = toc() / runs;
#if (TEST_WRT_ == TEST_COLUMNS)
        fprintf(pFile, "%lu, %f, %f\n", ncols, t, t_cublas);
#else
        fprintf(pFile, "%lu, %f, %f\n", nrows, t, t_cublas);
#endif
    }
    _CUBLAS(cublasDestroy(handle));

    fclose(pFile);

    if (dev_rand_data != NULL)
        _CUDA(cudaFree(dev_rand_data));

    stop_tictoc();
}

int main(void)
{
    do_benchmark();

    return EXIT_SUCCESS;
}

Finally, this is a MATLAB script I'm using to plot the execution times:

fetch_this = 'times_cols512_rows.txt';

username = 'ubuntu';
target_hostname = 'jetson';
% Do not modify below this line
eval_this=['! scp ' username '@' target_hostname ':~/mv/Debug/' fetch_this ' .'];
eval(eval_this)

set(0, 'DefaultAxesFontSize', 14);
r = csvread(fetch_this);
r_header = r(1,:);

plot(r(2:end,1), r(2:end,2)*1000, '-'); 
hold on
plot(r(2:end,1), r(2:end,3)*1000, '-r'); 
grid on;

fig_title = 'Matvec on Tegra K1 - %d %s';
if (r_header(1)==1),
    xlabel('Number of rows');
    title(sprintf(fig_title, r_header(2),'columns'));
else
    xlabel('Number of columns');
    title(sprintf(fig_title, r_header(2),'rows'));
end
ylabel('Computation time [us]');


legend('Kernel', 'cuBLAS');
axis tight

I am concerned about the performance and the scalability of my kernel, so first I would like to know how to improve the scalability with respect to the number of rows of matrix A. Second, I know that it is not very good practice to have branch divergence (and my code has), but I'm feeling I want some hints to improve it.

UPDATE : Thanks to all your comments and suggestions, I reached the conclusion that cudaDeviceSynchronized() caused, in the first place, some peculiarities with my timing so my initial measurements were inaccurate. Row-major ordering leads to worse results. The size of the blocks is an important tuning parameter and changing from 16 to 32 or 64 improves the execution time. Further benchmarking is necessary to choose the block size. To this end, one may use the following API for the kernel:

template<typename T, const uint_t blk>
__global__ void matvec_kernel(const T * RESTRICT  dA, const T * RESTRICT  dx,
        T * RESTRICT dy, const uint_t nRows, const uint_t nx);

and call it like this from the host:

template<typename T>
__host__ void matvec(const T * RESTRICT dA, const T * RESTRICT dx,
        T * RESTRICT dy, const uint_t nRows, const uint_t nx) {
    uint_t blk_size_opt = 64;

    /* Add code to decide the value of `blk_size_opt` */

    if (blk_size_opt == 32) {
        matvec_engine<T, 32>(dA, dx, dy, nRows, nx);
    } else if (blk_size_opt == 64) {
        matvec_engine<T, 64>(dA, dx, dy, nRows, nx);
    } else if (blk_size_opt == 128) {
        matvec_engine<T, 128>(dA, dx, dy, nRows, nx);
    } else if (blk_size_opt == 256) {
        matvec_engine<T, 256>(dA, dx, dy, nRows, nx);
    }

}

Let me provide some benchmarking results. First a comparison with cublasSgemv:

Custom Kernel vs cuBLAS

and the effect of block size on the execution time:

enter image description here

Remember that your Jetson has only one Streaming Multiprocessor. In a such "elementary" approach, above a certain small value, further blocks can be executed only sequentially. Perhaps therefore your linear increase. — Vitality, Oct 17 '14 at 04:30
@JackOLantern OK, this is true, but still I'm running cuBLAS on the same platform. I'd like to know whether I can somehow get rid of divergent threads (*i.e.*, to get rid of these if blocks that account for inexact tiling...) Is there some standard practice? — Pantelis Sopasakis, Oct 17 '14 at 04:54
Could you please post a full version of your code so that perhaps someone could play with it? — Vitality, Oct 17 '14 at 07:41
@JackOLantern I updated my question with new benchmarking results and I also posted the new kernel and uploaded all the source code on github (see link in the question). I'm playing with the profiler trying to understand what can be improved in the code. — Pantelis Sopasakis, Oct 17 '14 at 20:44
The complete code belongs in the question, not an external link. The question is considerably less useful to future readers when that external link breaks. Your statement "cuBLAS does some magic since it seems that its execution is not affected by the number of columns of A, " doesn't make sense to me, since in all cases I see the execution time of CUBLAS increasing as the number of columns increases. — Robert Crovella, Oct 18 '14 at 04:38
Is the second graph labelled correctly? It seems to read that cuBlas is faster with 512 rows than 256? — Jez, Oct 18 '14 at 14:34
In your timing `toc()` function, you have a `cudaDeviceSynchronize()` call that is causing, on my GT540M, unstable timing. Also, I have the feeling that you are compiling in `Debug` mode since in your Matlab plotting code you are searching for `fetch_this` in a `Debug` directory. If you are requesting performance, you should compile in `Release` mode. — Vitality, Oct 18 '14 at 17:54
@RobertCrovella OK, I see. I just didn't want to overload the question. — Pantelis Sopasakis, Oct 18 '14 at 18:04
@JackOLantern Thanks for the hints! I'm re-running some benchmarks and I will update the question if I see some difference. — Pantelis Sopasakis, Oct 18 '14 at 18:05
Should the `beta` parameter of `cublasSgemv()` be equal to `0`, instead of `1`? Furthermore, have you checked that `cublasSgemv()` and `matvec` return the same results? Have you accounted that `cuBLAS` requires column-major ordering of the matrix? The result of the `cublasSgemv()` function in your code returns all `NaN`s to me. — Vitality, Oct 18 '14 at 18:58
@JackOLantern You're absolutely right! Actually I should have put a `cudaMemset(dev_y_cublas, 0, n_rows_max * sizeof(real_t));` (I updated my question now). I compare with `compare_results(dev_y_cublas,dev_y, nrows);` — Pantelis Sopasakis, Oct 18 '14 at 19:29
Your code works correctly now for square matrices, namely, when `CONSTANT_ROWS = CONSTANT_COLS`, but not yet when `CONSTANT_ROWS != CONSTANT_COLS`. Please, _correctness before optimization_. Also, there is no need to using `cudaMemset` for the `cuBLAS` case. Finally, as pointed out by @NicolasBertJohnson, you seem that you are assuming column-major ordering in your hand-written kernel, but this will lead to highly uncoalesced global memory reads. A more consistent comparison would be to consider row-major ordering and `CUBLAS_OP_N` for `cuBLAS`. — Vitality, Oct 19 '14 at 06:27
I think I fixed the above problem. `if (idx_x + row < nx) { x_shared[row] = xsub[row]; }` requires the `else` branch `else x_shared[row] = 0.f;`. — Vitality, Oct 19 '14 at 20:33
@JackOLantern Are you using the latest version of the code (in the question)? I've tested it for row and column sizes 1 to 512 and it works fine for all their combinations. I can post the code I used to test it if you like. — Pantelis Sopasakis, Oct 19 '14 at 20:37
Let us [continue this discussion in chat](http://chat.stackoverflow.com/rooms/63314/discussion-between-pantelis-sopasakis-and-jackolantern). — Pantelis Sopasakis, Oct 19 '14 at 20:37
I had a more careful look at your `__global__` function and ended up with the implementation at the pastebin page http://pastebin.com/MBnT8NNU. At variance with what claimed in the answer by @NicolasBertJohnson, since different threads take care of different rows, the column-major approach you have used will lead to coalesced memory accesses. In the pastebin code you will find both the cases of row-major and column-major assumptions. Of course, in the row-major case, please change `CUBLAS_OP_N` to `CUBLAS_OP_T`. You can test the two arrangements and notify on the results. — Vitality, Oct 20 '14 at 07:13
I would not pay too much attention to unrolling the case of exact tiling, since the aggressive optimizations performed by the CUDA compiler are typically enough to emit a disassembled code with unrolled loops. Furthermore, you may wish to tune the `BLOCK_SIZE`. `16` appears too small, and on my GT540M, a value of `32` is capable to achieve a speedup factor of `4`. I will post a full answer as long as I test a couple of other optimization possibilities. — Vitality, Oct 20 '14 at 07:17
@JackOLantern Thanks a lot for the code. I tried a variant of your kernel and it performs also equally good for a block size of `128`. Yours, on my hardware, works best for a block size of `32`. It seems that the most important tuning parameter was the block size after all! Again, thanks a lot! — Pantelis Sopasakis, Oct 20 '14 at 14:36
I have provided an answer to your post. I'm showing how, on a GT540M card, `BLOCK_SIZE` is an optimization parameter and how the column-major ordering is better than row-major ordering. I have also provided a kernel, that you may wish to test on the Jetson, using Instruction Level Parallelism and which may help improving performance. Also, since you are using a device with compute capability `3.2`, you may wish to try using shuffle operations. I have also provided a kernel using those kind of operations which have shown to improve performance on a Kepler K20c against shared memory. — Vitality, Oct 21 '14 at 21:00
Please, notice that in the code above, you still have the `cudaDeviceSynchronize()` call and you still have `beta = 1.0`. Please, update :-) — Vitality, Oct 21 '14 at 21:01
I wonder if changing static shared array to dynamic one, and calling kernel with 3 parameter, would downgrade performance? — yourstruly, Jul 27 '18 at 15:08

score 6 · Accepted Answer · answered Oct 21 '14 at 20:56

First, let me write down the full working Matrix-Vector multiplication kernel employing shared memory:

template<typename T>
__global__ void matvec_kernel(const T * __restrict__ dA, const T * __restrict__ dx, T * __restrict__ dy, const unsigned int nRows, const unsigned int nCols)
{
    const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;

    __shared__ T x_shared[BLOCK_SIZE];

    T y_val = 0.0;

    #pragma unroll
    for (unsigned int m = 0; m < ((nCols + BLOCK_SIZE - 1)/ BLOCK_SIZE); ++m)
    {
        if ((m * BLOCK_SIZE + threadIdx.x) <  nCols) x_shared[threadIdx.x] = dx[threadIdx.x + m * BLOCK_SIZE];
        else                                         x_shared[threadIdx.x] = 0.f;
        __syncthreads();

        #pragma unroll
        for (unsigned int e = 0; e < BLOCK_SIZE; ++e) {
            // --- Column-major ordering - faster
            y_val += dA[tid + (e + BLOCK_SIZE * m) * nRows] * x_shared[e];
            // --- Row-major ordering - slower
            //y_val += dA[tid * nCols + (e + BLOCK_SIZE * m)] * x_shared[e];
        }

        __syncthreads();
    }

    if (tid < nRows) dy[tid] = y_val;

}

Unless differently specified, all the tests will be done on a GT540M card.

A first parameter to be optimized is the BLOCK_SIZE. Changing the BLOCK_SIZE changes the algorithm performance, as witnessed by the following graph:

enter image description here

The following graphs compares row-major ordering vs. column-major ordering. The latter is faster:

enter image description here

Another optimization you may wish to try is using more Instruction Level Parallelism (ILP) by this modified kernel employing ILP = 2

template<typename T>
__global__ void matvec_kernel_ILP2(const T * __restrict__ dA, const T * __restrict__ dx, T * __restrict__ dy, const unsigned int nRows, const unsigned int nCols)
{
    const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;

    __shared__ T x_shared[BLOCK_SIZE];

    T y_val1 = 0.0;
    T y_val2 = 0.0;

    #pragma unroll
    for (unsigned int m = 0; m < ((nCols + BLOCK_SIZE - 1)/ BLOCK_SIZE); ++m)
    {
        if ((m * BLOCK_SIZE + threadIdx.x) <  nCols) x_shared[threadIdx.x] = dx[threadIdx.x + m * BLOCK_SIZE];
        else                                         x_shared[threadIdx.x] = 0.f;
        __syncthreads();

        #pragma unroll
        for (unsigned int e = 0; e < BLOCK_SIZE; ++e) {
            y_val1 += dA[tid + (e + BLOCK_SIZE * m) * nRows] * x_shared[e];
            y_val2 += dA[tid + gridDim.x * BLOCK_SIZE + (e + BLOCK_SIZE * m) * nRows] * x_shared[e];
        }

        __syncthreads();
    }

    if (tid < nRows) dy[tid] = y_val1;
    if ((tid + gridDim.x * BLOCK_SIZE) < nRows) dy[tid + gridDim.x * BLOCK_SIZE] = y_val2;

}

This kernel should be called with half of the threads, as

dim3 dim_grid((nRows/2 + BLOCK_SIZE -1)/ BLOCK_SIZE);
dim3 dim_block(BLOCK_SIZE);
matvec_kernel_ILP2<T> <<<dim_grid, dim_block>>>(dA, dx, dy, nRows, nx);

Finally, since you are using a device with compute capability 3.2, you can try using shuffle operations. I'm providing here the kernel using shuffle operations instead of shared memory. In this case, you should set BLOCK_SIZE = 32:

template<typename T>
__global__ void matvec_kernel_shfl(const T * __restrict__ dA, const T * __restrict__ dx, T * __restrict__ dy, const unsigned int nRows, const unsigned int nCols)
{
    const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;

    T x_shfl_src, x_shfl_dest;

    T y_val = 0.0;

    #pragma unroll
    for (unsigned int m = 0; m < ((nCols + BLOCK_SIZE - 1)/ BLOCK_SIZE); ++m)
    {
        if ((m * BLOCK_SIZE + threadIdx.x) <  nCols) x_shfl_src = dx[threadIdx.x + m * BLOCK_SIZE];
        else                                         x_shfl_src = 0.f;
        __syncthreads();

//        #pragma unroll
        for (int e = 0; e < 32; ++e) {
            // --- Column-major ordering - faster
            x_shfl_dest = __shfl(x_shfl_src, e);
            y_val += dA[tid + (e + BLOCK_SIZE * m) * nRows] * x_shfl_dest;
            // --- Row-major ordering - slower
            //y_val += dA[tid * nCols + (e + BLOCK_SIZE * m)] * x_shared[e];
        }

        __syncthreads();
    }

    if (tid < nRows) dy[tid] = y_val;

}

Shuffle operations improve the performance over shared memory for BLOCK_SIZE = 32 on a Kepler K20c as shown by the graph below:

enter image description here

I tried the solution with `__shfl` and, at least on my HW, is on a par with the implementation using shared memory (both with a block size of 32). I was by the way introduced to Warp Shuffle Functions! — Pantelis Sopasakis, Oct 22 '14 at 01:33

score 3 · Answer 2 · answered Oct 19 '14 at 03:09

Looking at your code I think that the way you traverse the elements of A may be the problem:

for (unsigned int e = 0; e < n_star; ++e) {
  y_val += Asub[row + e * nRows] * x_shared[e];
}

So, when nRows becomes large, you actually read from the global memory (that is where A is stored) with a large stride. In particular this happens in every block: threads inside the same block will read from the global memory in a non-consecutive fashion. This can be improved if you consider storing from the beginning the values of A row-by-row (i.e., using row-major order). This is just a guess and I would have written a comment, but it requires a higher score on Stackoverflow...

After a more careful examination of the poster's code, it seems that each thread is taking care of dealing with a separate matrix row. So, the column-major ordering assumed by the poster seems already to lead to coalesced memory accesses and thus to more effective access patterns. — Vitality, Oct 20 '14 at 07:21

Matrix-vector multiplication in CUDA: benchmarking & performance

2 Answers2

Linked