
I have a stable, simple Gauss-Jordan algorithm for computing the matrix inverse on the CPU. I ported this algorithm to the GPU; it produces correct results, but it is significantly slower, roughly 10 times. I realize I am not well versed in C++ and CUDA, and I would be glad to hear some advice. Thanks in advance.

Console:

matrix size = 88; CPU time = 6531 mcs; GPU time = 52806 mcs;

Here is the code for the CPU:

void GaussJordanInverse(float* original, float* temp, float* singular, float* bigmatrix, int size) {
    // Copy the input into temp and build the identity matrix
    // (the buffer is called "singular", but it holds the identity).
    for (int i = 0; i < size; i++) {
        for (int j = 0; j < size; j++) {
            temp[i * size + j] = original[i * size + j];
            singular[i * size + j] = 0;
        }
        singular[i * size + i] = 1;
    }
    // Build the augmented matrix [A | I] of size size x (2 * size).
    for (int i = 0; i < size; i++) {
        for (int j = 0; j < size; j++) {
            bigmatrix[i * (size * 2) + j] = temp[i * size + j];
            bigmatrix[i * (size * 2) + j + size] = singular[i * size + j];
        }
    }
    // Forward pass: normalize row k by its pivot, then clear column k below it.
    for (int k = 0; k < size; k++) {
        for (int i = 0; i < 2 * size; i++)
            bigmatrix[k * (size * 2) + i] /= temp[k * size + k];
        for (int i = k + 1; i < size; i++) {
            float K = bigmatrix[i * (size * 2) + k] / bigmatrix[k * (size * 2) + k];
            for (int j = 0; j < 2 * size; j++)
                bigmatrix[i * (size * 2) + j] -= bigmatrix[k * (size * 2) + j] * K;
        }
        // Refresh temp so the next iteration reads the updated pivot values.
        for (int i = 0; i < size; i++)
            for (int j = 0; j < size; j++)
                temp[i * size + j] = bigmatrix[i * (size * 2) + j];
    }
    // Backward pass: clear column k above row k.
    for (int k = size - 1; k > -1; k--) {
        for (int i = 2 * size - 1; i > -1; i--)
            bigmatrix[k * (size * 2) + i] /= temp[k * size + k];
        for (int i = k - 1; i > -1; i--) {
            float K = bigmatrix[i * (size * 2) + k] / bigmatrix[k * (size * 2) + k];
            for (int j = 2 * size - 1; j > -1; j--)
                bigmatrix[i * (size * 2) + j] -= bigmatrix[k * (size * 2) + j] * K;
        }
    }
    // Cut out the right half of the augmented matrix: it now holds the inverse.
    for (int i = 0; i < size; i++) {
        for (int j = 0; j < size; j++) {
            singular[i * size + j] = bigmatrix[i * (size * 2) + j + size];
            original[i * size + j] = singular[i * size + j];
        }
    }
}
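
For context, a minimal driver for this function could look like the snippet below (hypothetical setup: the caller owns all four buffers, and bigmatrix must hold size * (2 * size) floats for the augmented matrix):

#include <vector>

int main() {
    int size = 88;
    std::vector<float> original(size * size), temp(size * size),
                       singular(size * size), bigmatrix(size * (size * 2));
    // ... fill `original` with an invertible matrix ...
    GaussJordanInverse(original.data(), temp.data(), singular.data(), bigmatrix.data(), size);
    // `original` (and `singular`) now hold the inverse.
    return 0;
}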

Here is the code for the GPU:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda.h>
#include <iostream>
#ifndef __CUDACC__  
#define __CUDACC__
#endif
#include <device_functions.h>
#include <stdio.h>
#include <chrono>
#include "matrixconversion.h"
using namespace std;
using namespace std::chrono;

__global__ void create(float* original, float* temp, float* singular, float* bigmatrix, int size) {
    int x = blockDim.x * blockIdx.x + threadIdx.x;
    int y = blockDim.y * blockIdx.y + threadIdx.y;
    if (x < size && y < size)
    {
        // Copy the input; "singular" was zeroed with cudaMemset on the host,
        // so only its diagonal needs to be set here.
        temp[x * size + y] = original[x * size + y];
        if (x == y) singular[x * size + y] = 1;
        // Build the augmented matrix [A | I].
        bigmatrix[x * (2 * size) + y] = temp[x * size + y];
        bigmatrix[x * (2 * size) + y + size] = singular[x * size + y];
    }
}
__global__ void direct_kernel(float* temp, float* bigmatrix, int size, int k) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    // Normalize row k by its pivot (one thread per column).
    if (x < 2 * size) {
        bigmatrix[k * (size * 2) + x] /= temp[k * size + k];
    }
    // NOTE: __syncthreads() only synchronizes within one block, so this kernel
    // is only correct while all threads fit into a single block.
    __syncthreads();
    // Eliminate column k from the rows below row k (one thread per row;
    // only rows 0..size-1 exist, so the bound is size, not 2 * size).
    if (x >= k + 1 && x < size) {
        float K = bigmatrix[x * (size * 2) + k] / bigmatrix[k * (size * 2) + k];
        for (int j = 0; j < 2 * size; j++) {
            bigmatrix[x * (size * 2) + j] -= bigmatrix[k * (size * 2) + j] * K;
        }
    }
    __syncthreads();
    // Refresh temp so the next launch reads the updated pivot values.
    if (x < size) {
        for (int j = 0; j < size; j++) {
            temp[x * size + j] = bigmatrix[x * (size * 2) + j];
        }
    }
}
__global__ void indirect_kernel(float* temp, float* bigmatrix, int size, int k) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    // Normalize row k by its pivot (one thread per column).
    if (x < 2 * size) {
        bigmatrix[k * (size * 2) + x] /= temp[k * size + k];
    }
    // NOTE: block-local synchronization only, as in direct_kernel.
    __syncthreads();
    // Eliminate column k from the rows above row k (one thread per row).
    if (x < k) {
        float K = bigmatrix[x * (size * 2) + k] / bigmatrix[k * (size * 2) + k];
        for (int j = 2 * size - 1; j > -1; j--) {
            bigmatrix[x * (size * 2) + j] -= bigmatrix[k * (size * 2) + j] * K;
        }
    }
    __syncthreads();
}

__global__ void cut(float* original, float* singular, float* bigmatrix, int size) {
    int x = blockDim.x * blockIdx.x + threadIdx.x;
    int y = blockDim.y * blockIdx.y + threadIdx.y;
    // Copy the right half of the augmented matrix (the inverse) back into place.
    if (x < size && y < size) {
        singular[x * size + y] = bigmatrix[x * (size * 2) + y + size];
        original[x * size + y] = singular[x * size + y];
    }
}

void GPUInverse(float* copy, float* original, int size, long int* time) {
    Copy1DFloat(copy, original, size);
    //set kernels
    int tr = 16;
    int bl = (size + tr - 1) / tr; // round up: plain integer division would leave rows/columns uncovered
    dim3 grid_size(16, 16);
    dim3 block_size(bl, bl);
    //create temp
    float* d_copy, * d_temp, * d_singular, * d_bigmatrix;
    //memory
    cudaMalloc((void**)&d_copy, size * size * sizeof(float));
    cudaMalloc((void**)&d_temp, size * size * sizeof(float));
    cudaMalloc((void**)&d_singular, size * size * sizeof(float));
    cudaMalloc((void**)&d_bigmatrix, size * (size * 2) * sizeof(float));
    cudaMemset(d_copy, 0, size * size * sizeof(float));
    cudaMemset(d_temp, 0, size * size * sizeof(float));
    cudaMemset(d_singular, 0, size * size * sizeof(float));
    cudaMemset(d_bigmatrix, 0, size * (2 * size) * sizeof(float));
    //to device
    cudaMemcpy(d_copy, copy, size * size * sizeof(float), cudaMemcpyHostToDevice);

    auto start = high_resolution_clock::now();
    //fill temp
    create <<<grid_size, block_size >>> (d_copy, d_temp, d_singular, d_bigmatrix, size);
    cudaDeviceSynchronize();
    //direct
    for (int k = 0; k < size; k++) {
        direct_kernel <<<(2 * size + 255) / 256, 256 >>> (d_temp, d_bigmatrix, size, k);
        cudaDeviceSynchronize();
    }
    //indirect
    for (int k = size - 1; k > -1; k--) {
        indirect_kernel <<<(2 * size + 255) / 256, 256 >>> (d_temp, d_bigmatrix, size, k);
        cudaDeviceSynchronize();
    }
    //cut
    cut <<<grid_size, block_size >>> (d_copy, d_singular, d_bigmatrix, size);
    cudaDeviceSynchronize();

    auto stop = high_resolution_clock::now();
    auto duration = duration_cast<microseconds>(stop - start);
    *time = duration.count();
    //from device
    cudaMemcpy(copy, d_copy, size * size * sizeof(float), cudaMemcpyDeviceToHost);
    //clean

    cudaFree(d_copy);
    cudaFree(d_temp);
    cudaFree(d_singular);
    cudaFree(d_bigmatrix);
}
  • For [coalescing](https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#coalesced-access-to-global-memory) you should use `threadIdx.x` for the inner dimension of the matrices when possible. I.e. exchanging `x` and `y` in your kernels could potentially give some easy speedup. – paleonix May 02 '23 at 18:31
  • Offtopic, but FYI [Avoid `high_resolution_clock`, use `steady_clock` for benchmarking](https://stackoverflow.com/a/37440647/10107454), [What is the canonical way to check for errors using the CUDA runtime API?](https://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api) – paleonix May 02 '23 at 18:33
  • `__syncthreads()` only synchronizes among each block of threads. Your usage looks like you are assuming that it synchronizes the whole grid. – paleonix May 02 '23 at 18:36
  • The old-school way of getting a grid-sync is just a kernel boundary (i.e. split your kernels into more kernels). Alternatively you can use the newer [cooperative groups API](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#grid-synchronization). – paleonix May 02 '23 at 18:42 (a sketch of this kernel split follows the comment list)
  • Avoid `cudaDeviceSynchronize()` between kernel calls. As long as all kernels are launched on the same (default) stream, there is no need to synchronize between them. – paleonix May 02 '23 at 18:45
  • While your launch configuration for `direct_kernel` and `indirect_kernel` looks fine, `block_size` and `grid_size` seem weird. Why not have a static `dim3 block_size(16, 16);` and a dynamic `dim3 grid_size((size + block_size.x - 1) / block_size.x, (size + block_size.y - 1) / block_size.y);`, or even better for coalescing due to warp size: `dim3 block_size(32, 8);` – paleonix May 02 '23 at 18:53
  • The GPU solution will probably never be faster than the CPU solution for `size == 88`, there is just not enough parallelism. If at all, you will need to compare bigger problem sizes. But as you have sequential loops on the host that also grow with `size` (i.e. total of kernel launch overheads will also grow with `size`), I'm generally not too optimistic. – paleonix May 02 '23 at 19:07
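
Putting the comments about grid synchronization, coalescing and cudaDeviceSynchronize() together, a minimal sketch of the kernel-boundary approach for the forward pass might look as follows. This is an illustration, not tested code: normalize_row, compute_factors, eliminate_below and the scratch buffer d_factors (size floats, allocated with cudaMalloc) are hypothetical names, there is no pivoting (just like in the original), and the backward pass would be split the same way.

// Normalize row k of the augmented matrix by its pivot (one thread per column).
__global__ void normalize_row(float* bigmatrix, int size, int k) {
    int j = blockIdx.x * blockDim.x + threadIdx.x; // column index -> coalesced
    float pivot = bigmatrix[k * (size * 2) + k];
    if (j < 2 * size) bigmatrix[k * (size * 2) + j] /= pivot;
}

// Store the elimination factor of each row below the pivot in a scratch buffer,
// so the next kernel can overwrite column k without racing on its own input.
__global__ void compute_factors(const float* bigmatrix, float* factors, int size, int k) {
    int i = blockIdx.x * blockDim.x + threadIdx.x; // row index
    if (i > k && i < size) factors[i] = bigmatrix[i * (size * 2) + k];
}

// Subtract factors[i] times the (already normalized) row k from every row below it.
__global__ void eliminate_below(float* bigmatrix, const float* factors, int size, int k) {
    int i = blockIdx.y * blockDim.y + threadIdx.y; // row index
    int j = blockIdx.x * blockDim.x + threadIdx.x; // column index -> coalesced
    if (i > k && i < size && j < 2 * size)
        bigmatrix[i * (size * 2) + j] -= bigmatrix[k * (size * 2) + j] * factors[i];
}

// Host side: each kernel launch acts as a grid-wide barrier, which is what the
// __syncthreads() calls in the original kernels cannot provide. No
// cudaDeviceSynchronize() is needed between launches on the default stream.
void forward_pass(float* d_bigmatrix, float* d_factors, int size) {
    dim3 block(32, 8); // threadIdx.x runs along columns for coalescing
    dim3 grid((2 * size + block.x - 1) / block.x, (size + block.y - 1) / block.y);
    for (int k = 0; k < size; k++) {
        normalize_row<<<(2 * size + 255) / 256, 256>>>(d_bigmatrix, size, k);
        compute_factors<<<(size + 255) / 256, 256>>>(d_bigmatrix, d_factors, size, k);
        eliminate_below<<<grid, block>>>(d_bigmatrix, d_factors, size, k);
    }
}

Note that reading the pivot directly from bigmatrix also makes the temp refresh in the original kernels unnecessary.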

1 Answer


Thank you paleonix for your advice. Indeed, the larger the matrix, the more clearly the GPU pulls ahead of the CPU. In addition, removing the cudaDeviceSynchronize() calls between kernel launches sped things up even more. Now the console output looks like this:

matrix size = 88; CPU time = 7096 mcs; GPU time = 418 mcs;
matrix size = 1000; CPU time = 9834535 mcs; GPU time = 4347 mcs;
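
For reference, the timed section of GPUInverse now looks roughly like this (a sketch rather than the exact code; it also switches to steady_clock, as suggested in the comments):

auto start = std::chrono::steady_clock::now();
create <<<grid_size, block_size >>> (d_copy, d_temp, d_singular, d_bigmatrix, size);
for (int k = 0; k < size; k++)
    direct_kernel <<<(2 * size + 255) / 256, 256 >>> (d_temp, d_bigmatrix, size, k);
for (int k = size - 1; k > -1; k--)
    indirect_kernel <<<(2 * size + 255) / 256, 256 >>> (d_temp, d_bigmatrix, size, k);
cut <<<grid_size, block_size >>> (d_copy, d_singular, d_bigmatrix, size);
cudaDeviceSynchronize(); // single sync: launches on the default stream run in order,
                         // and all kernels must have finished before stopping the clock
auto stop = std::chrono::steady_clock::now();
*time = std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count();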