I have a first version of a function that inverses a matrix of size m
and using
magma_dgetrf_gpu
and magma_dgetri_gpu
like this :
// Inversion
magma_dgetrf_gpu( m, m, d_a, m, piv, &info);
magma_dgetri_gpu( m, d_a, m, piv, dwork, ldwork, &info);
Now, I would like to also inverse but using the Cholesky decomposition. The function looks like the first version one,except the functions used which are :
// Inversion
magma_dpotrf_gpu( MagmaLower, m, d_a, m, &info);
magma_dpotri_gpu( MagmaLower, m, d_a, m, &info);
Here is the entire function that inverses :
// ROUTINE MAGMA TO INVERSE WITH CHOLESKY
void matrix_inverse_magma(vector<vector<double>> const &F_matrix, vector<vector<double>> &F_output) {
// Index for loop and arrays
int i, j, ip, idx;
// Start magma part
magma_int_t m = F_matrix.size();
if (m) {
magma_init (); // initialize Magma
magma_queue_t queue=NULL;
magma_int_t dev=0;
magma_queue_create(dev ,&queue );
double gpu_time , *dwork; // dwork - workspace
magma_int_t ldwork; // size of dwork
magma_int_t *piv, info; // piv - array of indices of inter -
// changed rows; a - mxm matrix
magma_int_t mm=m*m; // size of a, r, c
double *a; // a- mxm matrix on the host
double *d_a; // d_a - mxm matrix a on the device
magma_int_t err;
ldwork = m * magma_get_dgetri_nb( m ); // optimal block size
// allocate matrices
err = magma_dmalloc_cpu( &a , mm ); // host memory for a
// Convert matrix to *a double pointer
for (i = 0; i<m; i++){
for (j = 0; j<m; j++){
idx = i*m + j;
a[idx] = F_matrix[i][j];
}
}
err = magma_dmalloc( &d_a , mm ); // device memory for a
err = magma_dmalloc( &dwork , ldwork );// dev. mem. for ldwork
piv=( magma_int_t *) malloc(m*sizeof(magma_int_t ));// host mem.
magma_dsetmatrix( m, m, a, m, d_a, m, queue); // copy a -> d_a
// Inversion
magma_dpotrf_gpu( MagmaLower, m, d_a, m, &info);
magma_dpotri_gpu( MagmaLower, m, d_a, m, &info);
magma_dgetmatrix( m, m, d_a , m, a, m, queue); // copy d_a ->a
// Save Final matrix
for (i = 0; i<m; i++){
for (j = 0; j<m; j++){
idx = i*m + j;
F_output[i][j] = a[idx];
}
}
free(a); // free host memory
free(piv); // free host memory
magma_free(d_a); // free device memory
magma_queue_destroy(queue); // destroy queue
magma_finalize ();
// End magma part
}
}
Unfortunately, after checking the output data, I have a wrong inversion with my implementation.
I have doubts about the using at this line :
ldwork = m * magma_get_dgetri_nb( m ); // optimal block size
Could anyone see at first sight where the error comes from in my using of dpotrf
and dpotri
functions (actually magma_dpotrf_gpu
and magma_dpotri_gpu
) ?
EDIT 1:
following the advice of Damir Tenishev, I put an example of a function that inverses a matrix using LAPACKE :
// LAPACK version
void matrix_inverse_lapack(vector<vector<double>> const &F_matrix, vector<vector<double>> &F_output) {
// Index for loop and arrays
int i, j, ip, idx;
// Size of F_matrix
int N = F_matrix.size();
int *IPIV = new int[N];
// Statement of main array to inverse
double *arr = new double[N*N];
// Output Diagonal block
double *diag = new double[N];
for (i = 0; i<N; i++){
for (j = 0; j<N; j++){
idx = i*N + j;
arr[idx] = F_matrix[i][j];
}
}
// LAPACKE routines
int info1 = LAPACKE_dgetrf(LAPACK_ROW_MAJOR, N, N, arr, N, IPIV);
int info2 = LAPACKE_dgetri(LAPACK_ROW_MAJOR, N, arr, N, IPIV);
for (i = 0; i<N; i++){
for (j = 0; j<N; j++){
idx = i*N + j;
F_output[i][j] = arr[idx];
}
}
delete[] IPIV;
delete[] arr;
}
As you can see, this is a classical version of matrix inversion, which uses LAPACKE_dgetrf
and LAPACKE_dgetri
EDIT 2: The MAGMA
version is :
// MAGMA version
void matrix_inverse_magma(vector<vector<double>> const &F_matrix, vector<vector<double>> &F_output) {
// Index for loop and arrays
int i, j, ip, idx;
// Start magma part
magma_int_t m = F_matrix.size();
if (m) {
magma_init (); // initialize Magma
magma_queue_t queue=NULL;
magma_int_t dev=0;
magma_queue_create(dev ,&queue );
double gpu_time , *dwork; // dwork - workspace
magma_int_t ldwork; // size of dwork
magma_int_t *piv, info; // piv - array of indices of inter -
// changed rows; a - mxm matrix
magma_int_t mm=m*m; // size of a, r, c
double *a; // a- mxm matrix on the host
double *d_a; // d_a - mxm matrix a on the device
magma_int_t ione = 1;
magma_int_t ISEED [4] = { 0,0,0,1 }; // seed
magma_int_t err;
const double alpha = 1.0; // alpha =1
const double beta = 0.0; // beta=0
ldwork = m * magma_get_dgetri_nb( m ); // optimal block size
// allocate matrices
err = magma_dmalloc_cpu( &a , mm ); // host memory for a
for (i = 0; i<m; i++){
for (j = 0; j<m; j++){
idx = i*m + j;
a[idx] = F_matrix[i][j]
}
}
err = magma_dmalloc( &d_a , mm ); // device memory for a
err = magma_dmalloc( &dwork , ldwork );// dev. mem. for ldwork
piv=( magma_int_t *) malloc(m*sizeof(magma_int_t ));// host mem.
magma_dsetmatrix( m, m, a, m, d_a, m, queue); // copy a -> d_a
// find the inverse matrix: d_a*X=I using the LU factorization
// with partial pivoting and row interchanges computed by
// magma_dgetrf_gpu; row i is interchanged with row piv(i);
// d_a -mxm matrix; d_a is overwritten by the inverse
magma_dgetrf_gpu( m, m, d_a, m, piv, &info);
magma_dgetri_gpu(m, d_a, m, piv, dwork, ldwork, &info);
magma_dgetmatrix( m, m, d_a , m, a, m, queue); // copy d_a ->a
for (i = 0; i<m; i++){
for (j = 0; j<m; j++){
idx = i*m + j;
F_output[i][j] = a[idx];
}
}
free(a); // free host memory
free(piv); // free host memory
magma_free(d_a); // free device memory
magma_queue_destroy(queue); // destroy queue
magma_finalize ();
// End magma part
}
}
As you can see, I have used magma_dgetrf_gpu
and magma_dgetri_gpu
functions.
Now, I would like to do the same, either with LAPACKE
or MAGMA+LAPACK
, using dpotrf
and dpotri
functions. I recall that the matrixes that I inverse are symmetric.
EDIT 3: my attempts come from this documentation link
Especially, see section 4.4.21 magma dpotri - invert a positive definite matrix in double precision, CPU interface
on page 325.