Adapt existing code and Kernel code to perform a high number of 3x3 matrix inversion

Question

Following a previous question ( Performing high number of 4x4 matrix inversion - PyCuda ), considering the inversion of 4x4 matrix, I would like to do the same but with 3x3 matrix. As @Robert Crovella said, this change implies a complete rewrite.

Given the code shown below, I tried to test some things like putting zeros instead of values but this method doesn't seem to work.

Here is the code working for a high number of 4x4 matrix inversion:

$ cat t10.py
import numpy as np
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import pycuda.autoinit
# kernel
kernel = SourceModule("""

__device__ unsigned getoff(unsigned &off){
  unsigned ret = off & 0x0F;
  off = off >> 4;
  return ret;
}

const int block_size = 256;
const unsigned tmsk = 0xFFFFFFFF;
// in-place is acceptable i.e. out == in)
// T = float or double only
typedef float T;
__global__ void inv4x4(const T * __restrict__ in, T * __restrict__ out, const size_t n, const unsigned * __restrict__ pat){

  __shared__ T si[block_size];
  size_t idx = threadIdx.x+blockDim.x*blockIdx.x;
  if (idx < n*16){
    si[threadIdx.x] = in[idx];
    unsigned lane = threadIdx.x & 15;
    unsigned sibase = threadIdx.x & 0x03F0;
    __syncwarp();
    unsigned off = pat[lane];
    T a,b;
    a  = si[sibase + getoff(off)];
    a *= si[sibase + getoff(off)];
    a *= si[sibase + getoff(off)];
    if (!getoff(off)) a = -a;
    b  = si[sibase + getoff(off)];
    b *= si[sibase + getoff(off)];
    b *= si[sibase + getoff(off)];
    if (getoff(off)) a += b;
    else a -=b;
    off = pat[lane+16];
    b  = si[sibase + getoff(off)];
    b *= si[sibase + getoff(off)];
    b *= si[sibase + getoff(off)];
    if (getoff(off)) a += b;
    else a -=b;
    b  = si[sibase + getoff(off)];
    b *= si[sibase + getoff(off)];
    b *= si[sibase + getoff(off)];
    if (getoff(off)) a += b;
    else a -=b;
    off = pat[lane+32];
    b  = si[sibase + getoff(off)];
    b *= si[sibase + getoff(off)];
    b *= si[sibase + getoff(off)];
    if (getoff(off)) a += b;
    else a -=b;
    b  = si[sibase + getoff(off)];
    b *= si[sibase + getoff(off)];
    b *= si[sibase + getoff(off)];
    if (getoff(off)) a += b;
    else a -=b;
    T det = si[sibase + (lane>>2)]*a;
    det += __shfl_down_sync(tmsk, det, 4, 16); // first add
    det += __shfl_down_sync(tmsk, det, 8, 16); // second add
    det =  __shfl_sync(tmsk, det, 0, 16); // broadcast
    out[idx] = a / det;
  }
}

""")
# python function for inverting 4x4 matrices
# n should be an even number
def gpuinv4x4(inp, n):
    # internal constants not to be modified
    hpat = ( 0x0EB51FA5, 0x1EB10FA1, 0x0E711F61, 0x1A710B61, 0x1EB40FA4, 0x0EB01FA0, 0x1E700F60, 0x0A701B60, 0x0DB41F94, 0x1DB00F90, 0x0D701F50, 0x19700B50, 0x1DA40E94, 0x0DA01E90, 0x1D600E50, 0x09601A50, 0x1E790F69, 0x0E391F29, 0x1E350F25, 0x0A351B25, 0x0E781F68, 0x1E380F28, 0x0E341F24, 0x1A340B24, 0x1D780F58, 0x0D381F18, 0x1D340F14, 0x09341B14, 0x0D681E58, 0x1D280E18, 0x0D241E14, 0x19240A14, 0x0A7D1B6D, 0x1A3D0B2D, 0x063D172D, 0x16390729, 0x1A7C0B6C, 0x0A3C1B2C, 0x163C072C, 0x06381728, 0x097C1B5C, 0x193C0B1C, 0x053C171C, 0x15380718, 0x196C0A5C, 0x092C1A1C, 0x152C061C, 0x05281618)
    # Convert parameters into numpy array
    inpd = np.array(inp, dtype=np.float32)
    hpatd = np.array(hpat, dtype=np.uint32)
    output = np.empty((n*16), dtype= np.float32)
    # Get kernel function
    matinv4x4 = kernel.get_function("inv4x4")
    # Define block, grid and compute
    blockDim = (256,1,1) # do not change
    gridDim = ((n/16)+1,1,1)
    # Kernel function
    matinv4x4 (
        cuda.In(inpd), cuda.Out(output), np.uint64(n), cuda.In(hpatd),
        block=blockDim, grid=gridDim)
    return output
#example/test case
inp = (1.0, 1.0, 1.0, 0.0, 0.0, 3.0, 1.0, 2.0, 2.0, 3.0, 1.0, 0.0, 1.0, 0.0, 2.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0)
n = 2
result = gpuinv4x4(inp, n)
print(result.reshape(2,4,4))
$ python t10.py

[[-3.   -0.5   1.5   1.  ]
  [ 1.    0.25 -0.25 -0.5 ]
  [ 3.    0.25 -1.25 -0.5 ]
  [-3.   -0.    1.    1.  ]]

[[ 1.    0.    0.    0.  ]
  [ 0.    1.    0.    0.  ]
  [ 0.    0.    1.    0.  ]
  [ 0.    0.    0.    1.  ]]

I expect the same behavior, except that I am not longer working with 4x4 matrix but with 3x3 matrix.

How can I adapt this code above to work with 3x3 matrix inversion?

Update 1

Here the modifications that I made.

I have modified the dimension and used direct formula from the link given by @Robert Crovella (https://ardoris.wordpress.com/2008/07/18/general-formula-for-the-inverse-of-a-3x3-matrix/). Below the code modified:

import numpy as np
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import pycuda.autoinit


 # kernel of 3x3 inversion
kernel_3x3 = SourceModule("""

    // in-place is acceptable i.e. out == in)
    // T = float or double only

typedef float T;
__global__ void inv3x3(const T * __restrict__ in, T * __restrict__ out, const size_t n, const unsigned * __restrict__ pat){

  size_t ix = threadIdx.x;
  size_t idx = ix + blockDim.x*blockIdx.x;
  if (ix < n*9){
    T det = in[0+idx]*(in[4+idx]*in[8+idx]-in[7+idx]*in[5+idx]) - in[1+idx]*(in[3+idx]*in[8+idx]-in[6+idx]*in[5+idx]) + in[2+idx]*(in[3+idx]*in[7+idx]-in[6+idx]*in[4+idx]);
    out[0+idx] = (in[4+idx]*in[8+idx]-in[7+idx]*in[5+idx])/det;
    out[1+idx] = (in[2+idx]*in[7+idx]-in[1+idx]*in[8+idx])/det;
    out[2+idx] = (in[1+idx]*in[5+idx]-in[2+idx]*in[4+idx])/det;
    out[3+idx] = (in[6+idx]*in[5+idx]-in[3+idx]*in[8+idx])/det;
    out[4+idx] = (in[0+idx]*in[8+idx]-in[2+idx]*in[6+idx])/det;
    out[5+idx] = (in[2+idx]*in[3+idx]-in[0+idx]*in[5+idx])/det;
    out[6+idx] = (in[3+idx]*in[7+idx]-in[4+idx]*in[6+idx])/det;
    out[7+idx] = (in[1+idx]*in[6+idx]-in[0+idx]*in[7+idx])/det;
    out[8+idx] = (in[0+idx]*in[4+idx]-in[1+idx]*in[3+idx])/det;
    __syncwarp(); 
  }
}

""")

def gpuinv3x3 (inp, n):
# internal constants not to be modified
  hpat = ( 0x0EB51FA5, 0x1EB10FA1, 0x0E711F61, 0x1A710B61, 0x1EB40FA4, 0x0EB01FA0, 0x1E700F60, 0x0A701B60, 0x0DB41F94, 0x1DB00F90, 0x0D701F50, 0x19700B50, 0x1DA40E94, 0x0DA01E90, 0x1D600E50, 0x09601A50, 0x1E790F69, 0x0E391F29, 0x1E350F25, 0x0A351B25, 0x0E781F68, 0x1E380F28, 0x0E341F24, 0x1A340B24, 0x1D780F58, 0x0D381F18, 0x1D340F14, 0x09341B14, 0x0D681E58, 0x1D280E18, 0x0D241E14, 0x19240A14, 0x0A7D1B6D, 0x1A3D0B2D, 0x063D172D, 0x16390729, 0x1A7C0B6C, 0x0A3C1B2C, 0x163C072C, 0x06381728, 0x097C1B5C, 0x193C0B1C, 0x053C171C, 0x15380718, 0x196C0A5C, 0x092C1A1C, 0x152C061C, 0x05281618)
# Convert parameters into numpy array
  inpd = np.array(inp, dtype=np.float32)
  hpatd = np.array(hpat, dtype=np.uint32)
  output = np.empty((n*9), dtype= np.float32)
  # Get kernel function
  matinv3x3 = kernel_3x3.get_function("inv3x3")
  # Define block, grid and compute
  blockDim = (81,1,1) # do not change
  gridDim = ((n/9)+1,1,1)
  # Kernel function
  matinv3x3 (
      cuda.In(inpd), cuda.Out(output), np.uint64(n), cuda.In(hpatd),
      block=blockDim, grid=gridDim)
  return output
#example/test case
inp = (1.0, 1.0, 1.0, 0.0, 0.0, 3.0, 1.0, 2.0, 2.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0)
n = 2
result = gpuinv3x3(inp, n)
print(result.reshape(2,3,3))

The first matrix is correctly inverted but not the second one (the identity matrix which has identity matrix as inverse) :

[[[ 2.         -0.         -1.        ]
  [-1.         -0.33333334  1.        ]
  [-0.          0.33333334 -0.        ]]

 [[ 1.         -0.5        -0.        ]
  [       -inf  1.         -1.        ]
  [        nan         nan  1.        ]]]

So, this issue doesn't seem to come from the kernel code but rather the batch size or something similar with dimensions of global 1D array (in my code, you can see the 2 3x3 matrices formatted as 1D array of 18 elements (inp = (1.0, 1.0, 1.0, 0.0, 0.0, 3.0, 1.0, 2.0, 2.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0))).

What's wrong in this code? Especially, the issue of bad inversion on the second matrix. Just a last point, an odd size of group doesn't imply issues for GPU processing?

Have you considered the possibility that adding zeros makes the matrix singular and makes inversion impossible? — talonmies, Mar 26 '19 at 13:13
@talonmies : you are right, I can't add one or several rows/columns of zero since determinant would be null and so matrix not inversible ... how to deal with this zero padding ? — , Mar 26 '19 at 13:27
You don't. Zero padding is mathematically impossible. Do the rewrite. — talonmies, Mar 26 '19 at 17:54
This code requires a rewrite. There is no simple set of changes to make it suitable for 3x3 matrix inversion. If you need to invert matrices of many different sizes, then it might be smarter to bite the bullet and just create a general binding for whatever functions you wish to use from `cublas`. If you wanted to do just a 3x3 matrix inversion similar to the fixed-function code strategy here, you could use [this](https://ardoris.wordpress.com/2008/07/18/general-formula-for-the-inverse-of-a-3x3-matrix/) as a starting point. Not a complete description, just a starting point. — Robert Crovella, Mar 28 '19 at 21:00
@RobertCrovella . I have followed the formula from the link that you gave me. I get a valid inversion for the first matrix but not the second one. If you could take a look at my **UPDATE1**, this would be fine. I suspect a bad dimension value to take into acccount the 1D global array representing the different matrices to invert. Regards — , Mar 29 '19 at 16:19

Robert Crovella · Accepted Answer · 2019-03-30T14:11:51.647

This answer will closely follow my answer on the 4x4 invert question, both in terms of answer layout and calculation method/kernel design. The formulas are described here.

First, as before, we will show a CUDA C++ version with comparison to cublas:

$ cat t432.cu
#include <iostream>
#include <cublas_v2.h>
#include <cstdlib>
// 3x3 matrix inversion
// https://stackoverflow.com/questions/1148309/inverting-a-4x4-matrix
// https://ardoris.wordpress.com/2008/07/18/general-formula-for-the-inverse-of-a-3x3-matrix/
// 9 threads per matrix to invert
// 32 matrices per 288 thread block

const unsigned block_size = 288;
typedef double mt;

#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL

long long dtime_usec(unsigned long long start){

  timeval tv;
  gettimeofday(&tv, 0);
  return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}

__device__ unsigned pat[9];
const unsigned hpat[9] = {0x07584, 0x08172, 0x04251, 0x08365, 0x06280, 0x05032, 0x06473, 0x07061, 0x03140};


__device__ unsigned getoff(unsigned &off){
  unsigned ret = off & 0x0F;
  off >>= 4;
  return ret;
}

// in-place is acceptable i.e. out == in)
// T = float or double only
template <typename T>
__global__ void inv3x3(const T * __restrict__ in, T * __restrict__ out, const size_t n){

  __shared__ T si[block_size];
  size_t idx = threadIdx.x+blockDim.x*blockIdx.x;
  T det = 1;
  if (idx < n*9)
    det = in[idx];
  unsigned sibase = (threadIdx.x / 9)*9;
  unsigned lane = threadIdx.x - sibase; // cheaper modulo
  si[threadIdx.x] = det;
  __syncthreads();
  unsigned off = pat[lane];
  T a  = si[sibase + getoff(off)];
  a   *= si[sibase + getoff(off)];
  T b  = si[sibase + getoff(off)];
  b   *= si[sibase + getoff(off)];
  a -= b;
  __syncthreads();
  if (lane == 0) si[sibase+3] = a;
  if (lane == 3) si[sibase+4] = a;
  if (lane == 6) si[sibase+5] = a;
  __syncthreads();
  det =  si[sibase]*si[sibase+3]+si[sibase+1]*si[sibase+4]+si[sibase+2]*si[sibase+5];
  if (idx < n*9)
    out[idx] = a / det;
}

size_t nr = 2048;
int main(int argc, char *argv[]){
  if (argc > 1) nr = atoi(argv[1]);


  const mt m2[] = {1.0, 1.0, 1.0, 0.0, 0.0, 3.0, 1.0, 2.0, 2.0};
  const mt i2[] = {2.0, 0.0, -1.0, -1.0, -0.33333334, 1.0, 0.0, 0.33333334,  0.0};
  const mt m1[] = {1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0};
  const mt i1[] = {1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0};

  mt *h_d, *d_d;
  h_d = (mt *)malloc(nr*9*sizeof(mt));
  cudaMalloc(&d_d, nr*9*sizeof(mt));
  cudaMemcpyToSymbol(pat, hpat, 9*sizeof(unsigned));
  for (int i = 0; i < nr/2; i++){
    memcpy(h_d+i*2*9, m1, sizeof(m1));
    memcpy(h_d+i*2*9+9, m2, sizeof(m2));}
  cudaMemcpy(d_d, h_d, nr*9*sizeof(mt), cudaMemcpyHostToDevice);
  long long t = dtime_usec(0);
  inv3x3<<<((nr*9)/block_size)+1, block_size>>>(d_d, d_d, nr);
  cudaDeviceSynchronize();
  t = dtime_usec(t);
  cudaMemcpy(h_d, d_d, nr*9*sizeof(mt), cudaMemcpyDeviceToHost);
  for (int i = 0; i < 2; i++){
    for (int j = 0; j < 9; j++) std::cout << h_d[i*9 + j] << ",";
    std::cout << std::endl;
    for (int j = 0; j < 9; j++) std::cout << ((i==0)?i1[j]:i2[j]) << ",";
    std::cout << std::endl;}
  std::cout << "kernel time: " << t << " microseconds" << std::endl;
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) std::cout << cudaGetErrorString(err) << std::endl;
  //cublas
  for (int i = 0; i < nr/2; i++){
    memcpy(h_d+i*2*9, m1, sizeof(m1));
    memcpy(h_d+i*2*9+9, m2, sizeof(m2));}
  cudaMemcpy(d_d, h_d, nr*9*sizeof(mt), cudaMemcpyHostToDevice);
  cublasHandle_t h;
  cublasStatus_t cs = cublasCreate(&h);
  if (cs != CUBLAS_STATUS_SUCCESS) std::cout << "cublas create error" << std::endl;
  mt **A, **Ai, *Aid, **Ap, **Aip;
  A  = (mt **)malloc(nr*sizeof(mt *));
  Ai = (mt **)malloc(nr*sizeof(mt *));
  cudaMalloc(&Aid, nr*9*sizeof(mt));
  cudaMalloc(&Ap,  nr*sizeof(mt *));
  cudaMalloc(&Aip, nr*sizeof(mt *));
  for (int i = 0; i < nr; i++) A[i]  =  d_d + 9*i;
  for (int i = 0; i < nr; i++) Ai[i] =  Aid + 9*i;
  cudaMemcpy(Ap, A, nr*sizeof(mt *), cudaMemcpyHostToDevice);
  cudaMemcpy(Aip, Ai, nr*sizeof(mt *), cudaMemcpyHostToDevice);
  int *info;
  cudaMalloc(&info, nr*sizeof(int));
  t = dtime_usec(0);
  cs = cublasDmatinvBatched(h, 3,  Ap, 3, Aip, 3, info, nr);
  if (cs != CUBLAS_STATUS_SUCCESS) std::cout << "cublas matinv error" << std::endl;
  cudaDeviceSynchronize();
  t = dtime_usec(t);
  cudaMemcpy(h_d, Aid, nr*9*sizeof(mt), cudaMemcpyDeviceToHost);
  for (int i = 0; i < 2; i++){
    for (int j = 0; j < 9; j++) std::cout << h_d[i*9 + j] << ",";
    std::cout << std::endl;
    for (int j = 0; j < 9; j++) std::cout << ((i==0)?i1[j]:i2[j]) << ",";
    std::cout << std::endl;}
  std::cout << "cublas time: " << t << " microseconds" << std::endl;
  err = cudaGetLastError();
  if (err != cudaSuccess) std::cout << cudaGetErrorString(err) << std::endl;
  return 0;
}
$ nvcc -o t432 t432.cu -lcublas
$ ./t432
1,0,0,0,1,0,0,0,1,
1,0,0,0,1,0,0,0,1,
2,-0,-1,-1,-0.333333,1,-0,0.333333,-0,
2,0,-1,-1,-0.333333,1,0,0.333333,0,
kernel time: 59 microseconds
1,0,0,0,1,0,0,0,1,
1,0,0,0,1,0,0,0,1,
2,0,-1,-1,-0.333333,1,0,0.333333,0,
2,0,-1,-1,-0.333333,1,0,0.333333,0,
cublas time: 68 microseconds
$

So this is perhaps slightly faster than cublas but not much, for this 2048 matrix test case, CUDA 10.0, Tesla P100, linux.

Similar to the previous answer, here is a simplified (only 2 matrices) pycuda test case:

$ cat t14.py
import numpy as np
# import matplotlib.pyplot as plt
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import pycuda.autoinit
# kernel
kernel = SourceModule("""

__device__ unsigned getoff(unsigned &off){
  unsigned ret = off & 0x0F;
  off >>= 4;
  return ret;
}

// in-place is acceptable i.e. out == in)
// T = float or double only
const int block_size = 288;
typedef double T; // *** can set to float or double
__global__ void inv3x3(const T * __restrict__ in, T * __restrict__ out, const size_t n, const unsigned * __restrict__ pat){

  __shared__ T si[block_size];
  size_t idx = threadIdx.x+blockDim.x*blockIdx.x;
  T det = 1;
  if (idx < n*9)
    det = in[idx];
  unsigned sibase = (threadIdx.x / 9)*9;
  unsigned lane = threadIdx.x - sibase; // cheaper modulo
  si[threadIdx.x] = det;
  __syncthreads();
  unsigned off = pat[lane];
  T a  = si[sibase + getoff(off)];
  a   *= si[sibase + getoff(off)];
  T b  = si[sibase + getoff(off)];
  b   *= si[sibase + getoff(off)];
  a -= b;
  __syncthreads();
  if (lane == 0) si[sibase+3] = a;
  if (lane == 3) si[sibase+4] = a;
  if (lane == 6) si[sibase+5] = a;
  __syncthreads();
  det =  si[sibase]*si[sibase+3]+si[sibase+1]*si[sibase+4]+si[sibase+2]*si[sibase+5];
  if (idx < n*9)
    out[idx] = a / det;
}

""")
# host code
def gpuinv3x3(inp, n):
    # internal constants not to be modified
    hpat = (0x07584, 0x08172, 0x04251, 0x08365, 0x06280, 0x05032, 0x06473, 0x07061, 0x03140)
    # Convert parameters into numpy array
    # *** change next line between float32 and float64 to match float or double
    inpd = np.array(inp, dtype=np.float64)
    hpatd = np.array(hpat, dtype=np.uint32)
    # *** change next line between float32 and float64 to match float or double
    output = np.empty((n*9), dtype= np.float64)
    # Get kernel function
    matinv3x3 = kernel.get_function("inv3x3")
    # Define block, grid and compute
    blockDim = (288,1,1) # do not change
    gridDim = ((n/32)+1,1,1)
    # Kernel function
    matinv3x3 (
        cuda.In(inpd), cuda.Out(output), np.uint64(n), cuda.In(hpatd),
        block=blockDim, grid=gridDim)
    return output
inp = (1.0, 1.0, 1.0, 0.0, 0.0, 3.0, 1.0, 2.0, 2.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0)
n = 2
result = gpuinv3x3(inp, n)
print(result.reshape(2,3,3))
$ python t14.py
[[[ 2.         -0.         -1.        ]
  [-1.         -0.33333333  1.        ]
  [-0.          0.33333333 -0.        ]]

 [[ 1.          0.          0.        ]
  [ 0.          1.          0.        ]
  [ 0.          0.          1.        ]]]
$

The above happens to be using double i.e. float64 in pycuda. Changing it to float i.e. float32 in pycuda involves changing the same 3 lines as described in this answer.

Thank you, I will ask you later some explanations about your kernel code since I have notions about OpenCL but not really with Cuda, even if both are similar. Regards — , Mar 30 '19 at 22:09
Hello Robert, I have another issue with your method : this is explained on https://stackoverflow.com/questions/55466093/trying-to-get-with-pycuda-inversion-matrix-the-same-results-than-with-np-linalg . Any idea is welcome, thanks. — , Apr 02 '19 at 02:29
This code is now extensively described [here](https://stackoverflow.com/questions/59056195/understanding-in-details-the-algorithm-for-inversion-of-a-high-number-of-3x3-mat). — Robert Crovella, Nov 26 '19 at 22:17

Adapt existing code and Kernel code to perform a high number of 3x3 matrix inversion

Update 1

1 Answers1

Linked