wrapping the ScaLAPACK functions in cython

Question

I used lapack and blas library to implement matrix-inversion and multiply matrices in cython. Here is my code:

import numpy as np
cimport numpy as np
cimport cython
import ctypes    
cdef extern from "f2pyptr.h": 
     void *f2py_pointer(object) except NULL

import scipy.linalg.lapack 
import scipy.linalg.blas
ctypedef np.float64_t REAL_t

ctypedef void (*dgetri_ptr) (long *N, double *A, long *LDA, long *IPIV, double *WORK, long *LWORK, long *INFO) nogil
cdef dgetri_ptr dgetri = <dgetri_ptr>f2py_pointer(scipy.linalg.lapack.dgetri._cpointer) 

ctypedef void (*dgetrf_ptr) (long *M, long *N, double *A, long *LDA, long *IPIV, long *INFO) nogil
cdef dgetrf_ptr dgetrf = <dgetrf_ptr>f2py_pointer(scipy.linalg.lapack.dgetrf._cpointer) 

cdef void inverse_matrix(REAL_t* A, long n, long info):
     #Computes the inverse of an LU-factored general matrix using sgetri routine.
     #This routine computes the inverse inv(A) of a general matrix A. Before calling this routine, call sgetrf to factorize A
     cdef np.ndarray[long, ndim=1] ipiv   = np.empty(n, dtype=np.int64)
     cdef np.ndarray[double, ndim=1] work = np.empty(n, dtype=np.float64)
     dgetrf(&n, &n, A, &n, &ipiv[0], &info)
     dgetri(&n, A, &n, &ipiv[0], &work[0], &n, &info)


@cython.wraparound(False)
@cython.boundscheck(False)
cpdef REAL_t[::1,:] inverse(REAL_t[::1,:] A):
      if (A.shape[0] !=A.shape[1]):
         raise TypeError("The input array must be a square matrix")
      cdef long N = A.shape[0]
      cdef long info
      A=np.require(A, requirements=['F'])
      inverse_matrix(&A[0,0], N, info)

      if info < 0:
          raise ValueError('illegal value in %d-th argument of internal getrf' % -info)
      elif info > 0:
          raise ValueError("Diagonal number %d is exactly zero. Singular matrix." % -info)
      return A

ctypedef void (*dgetri_ptr) (long *N, double *A, long *LDA, long *IPIV, double *WORK, long *LWORK, long *INFO) nogil
cdef dgetri_ptr dgetri = <dgetri_ptr>f2py_pointer(scipy.linalg.lapack.dgetri._cpointer) 
ctypedef void (*dgemm_ptr) (char *TRANSA, char *TRANSB, long *M, long *N, long *K, double *ALPHA, double *A, long *LDA, double *B, long *LDB, double *BETA, double *C, long *LDC) nogil  
cdef dgemm_ptr dgemm = <dgemm_ptr>f2py_pointer(scipy.linalg.blas.dgemm._cpointer) 
@cython.wraparound(False)
@cython.boundscheck(False)
cpdef REAL_t[::1,:] dot_product(REAL_t[::1,:] a, REAL_t[::1,:] b, char* TRANSA ='N', char* TRANSB='N', REAL_t ALPHA = 1.0, REAL_t BETA  = 0.0):

      cdef REAL_t[::1,:] c_view = np.zeros((a.shape[0], b.shape[1]), dtype=np.float64, order='F') 
      cdef long M, N, K, LDA, LDB, LDC
      cdef char* cN = 'N'
      cdef char* cT = 'T'
      if (a.shape[1] !=b.shape[0]):
         raise TypeError("The numbers of columns in matrix a must be equal to number of rows in matrix b")
      if (TRANSA   == cN):
         M = a.shape[0] #the number  of rows  of the  matrix op( A )  and of the  matrix  C
      elif (TRANSA == cT):
         M = a.shape[1]
      if (TRANSB   == cN):
         N = b.shape[1]
      elif (TRANSB == cT):
         N = b.shape[0]
      K   = a.shape[1]
      LDA = a.shape[0]
      LDB = b.shape[0]
      LDC = a.shape[0]
      dgemm(TRANSA, TRANSB, &M, &N, &K, &ALPHA, &a[0,0], &LDA, &b[0,0], &LDB, &BETA, &c_view[0,0], &LDC)
      return c_view

I would like to improve the speed of my code by splitting up the computations and running them on multiple processors. How can I extend this code and use the scalapack library by wrapping following functions in my cython code:

pdgetri which computes the inverse of a distributed matrix using the LU factorization.
pdgetrf which computes an LU factorization of a general M-by-N distributed matrix, using partial pivoting with row interchanges.
pdtran_ that transposes a matrix.
pdgemm_ which performs one of the matrix-matrix multiplication.

Update I: In order to be able to use pdgetri or pdgetrf should I use the following approach to wrap them from their fortran scripts and for pdgemm_ the c script?

scalapack_wrap.f90

subroutine C_PDGETRI( N, A, IA, JA, DESCA, IPIV, WORK, LWORK, IWORK, LIWORK, INFO ) bind(c, NAME='C_PDGETRI')
           use iso_c_binding, only: c_double, c_int
           implicit none

           real(c_double), dimension(N, N), intent(in):: A
           real(c_double), dimension(LWORK), intent(in):: WORK
           integer(c_int), dimension(N), intent(in):: DESCA
           integer(c_int), dimension(N), intent(in)::IPIV
           integer(c_int), dimension(LWORK), intent(in):: IWORK
           integer(c_int), intent(in) :: N, IA, JA, LIWORK, LWORK, INFO
           call PDGETRI( N, A, IA, JA, DESCA, IPIV, WORK, LWORK, IWORK, LIWORK, INFO )
end subroutine C_PDGETRI

subroutine C_PDGETRF(M, N, A, IA, JA, DESCA, IPIV, INFO) bind(c, NAME='C_PDGETRF')
           use iso_c_binding, only: c_double, c_int
           implicit none

           real(c_double), dimension(M, N), intent(in):: A
           integer(c_int), dimension(N), intent(in):: DESCA
           integer(c_int), dimension(N), intent(in)::IPIV
           integer(c_int), intent(in) :: M, N, IA, JA, INFO
           call PDGETRF( M, N, A, IA, JA, DESCA, IPIV, INFO )
end subroutine C_PDGETRF

scalapack_func.h

extern void C_PDGETRI( long* N, double* A, int* IA, int* JA, int* DESCA, int* IPIV, double* WORK, long* LWORK, int* IWORK, long* LIWORK, int* INFO );
extern void C_PDGETRF( long* M, long* N, double* A, int* IA, int* JA, int* DESCA, int* IPIV, int* INFO);

scalapack_funcs.pyx

import numpy as np
cimport numpy as np
cimport cython
cdef extern from "scalapack_func.h":
     void C_PDGETRI( long* N, double* A, int* IA, int* JA, int* DESCA, int* IPIV, double* WORK, long* LWORK, int* IWORK, long* LIWORK, int* INFO);
     void C_PDGETRF( long* M, long* N, double* A, int* IA, int* JA, int* DESCA, int* IPIV, int* INFO);

cdef double[:,::1] pdgetri_(double[:,::1] A):
     cdef long N      = A.shape[0]
     cdef long LWORK  = A.shape[1]
     cdef long LIWORK = A.shape[1]
     cdef int INFO    = 0
     cdef int IA      = 0 #the row index in the global array A indicating the first row of sub( A )
     cdef int JA      = 0 #The column index in the global array A indicating the first column of sub( A ).
     cdef double[::1] WORK = np.empty(LWORK, dtype=np.float64)
     cdef int[::1] IWORK   = np.empty(LIWORK, dtype=np.int32)
     cdef int[::1] IPIV    = np.empty(N, dtype=np.int32)
     cdef int[::1] DESCA   = np.empty(N, dtype=np.int32)
     C_PDGETRI(&N, &A[0,0],  &IA, &JA, &DESCA[0], &IPIV[0], &WORK[0], &LWORK, &IWORK[0], &LIWORK, &INFO)
     if INFO < 0:
         raise ValueError('illegal value in %d-th argument of internal pdgetri' % -INFO)
     elif INFO > 0:
         raise ValueError("Diagonal number %d is exactly zero. Singular matrix." % -INFO)
     return A 

cdef double[:,::1] pdgetrf_(double[:,::1] A):
     cdef long M      = A.shape[0] #The number of rows to be operated on,
     cdef long N      = A.shape[1] #The number of columns to be operated on
     cdef int INFO    = 0
     cdef int IA      = 0 #the row index in the global array A indicating the first row of sub( A )
     cdef int JA      = 0 #The column index in the global array A indicating the first column of sub( A ).
     cdef int[::1] IPIV    = np.empty(M, dtype=np.int32)
     cdef int[::1] DESCA   = np.empty(M, dtype=np.int32)
     C_PDGETRF(&M, &N, &A[0,0], &IA, &JA, &DESCA[0], &IPIV[0], &INFO)
     if INFO < 0:
         raise ValueError('illegal value in %d-th argument of internal pdgetrf' % -INFO)
     elif INFO > 0:
         raise ValueError("Diagonal number %d is exactly zero. Singular matrix." % -INFO)
     return A   
@cython.wraparound(False)
@cython.boundscheck(False)
cpdef double[:,::1] inverse(double[:,::1] A):     
      if (A.shape[0] !=A.shape[1]):
          raise TypeError("The input array must be a square matrix")
      A=np.require(A, requirements=['F'])
      pdgetrf_(A)
      pdgetri_(A)
      return A

Update II:

setup.py

from distutils.core import setup
from distutils.extension import Extension
from Cython.Distutils import build_ext
# This line only needed if building with NumPy in Cython file.
from numpy import get_include
import subprocess
import os
from os import system
import warnings
import mpi4py

fortran_mod_comp = 'mpif90 /usr/local/scalapack-2.0.2/SRC/pdgetri.f -c -o pdgetri_.o -O3 -fPIC'
print fortran_mod_comp
system(fortran_mod_comp)

fortran_mod_comp = 'mpif90 /usr/local/scalapack-2.0.2/SRC/pdgetrf.f -c -o pdgetrf_.o -O3 -fPIC'
print fortran_mod_comp
system(fortran_mod_comp)

shared_obj_comp = 'mpif90 scalapack_wrap.f90 -c -o scalapack_wrap.o -O3 -fPIC'
print shared_obj_comp
system(shared_obj_comp)
def runcommand(cmd):
    process = subprocess.Popen(cmd.split(), shell=False, stdout=subprocess.PIPE,
                               stderr=subprocess.STDOUT, universal_newlines=True)
    c = process.communicate()
    if process.returncode != 0:
        raise Exception("Something went wrong whilst running the command: %s" % cmd)
    return c[0]

def whichscalapack():
    # Figure out which Scalapack to use
    if 'MKLROOT' in os.environ:
        return 'intelmkl'
    else:
        return 'netlib'

def whichmpi():
    # Figure out which MPI environment this is
    import re
    try:
        mpiv = runcommand('mpirun -V')
        if re.search('Intel', mpiv):
            return 'intelmpi'
        elif re.search('Open MPI', mpiv):
            return 'openmpi'
    except:
        return 'mpich'
    warnings.warn('Unknown MPI environment.')
    return None

scalapackversion = whichscalapack()
mpiversion = whichmpi()
if mpiversion == 'openmpi':
    # Fetch the arguments for linking and compiling.
    mpilinkargs = runcommand('mpicc -showme:link').split()
    mpicompileargs = runcommand('mpicc -showme:compile').split()

if scalapackversion == 'intelmkl':
    # Set library includes (taking into account which MPI library we are using)."
    scl_lib = ['mkl_scalapack_lp64', 'mkl_rt', 'mkl_blacs_'+mpiversion+'_lp64', 'iomp5', 'pthread']
    scl_libdir = [os.environ['MKLROOT']+'/lib/intel64' if 'MKLROOT' in os.environ else '']
elif scalapackversion == 'netlib':
    scl_lib = ['scalapack', 'gfortran']
    scl_libdir = [ os.path.dirname(runcommand('gfortran -print-file-name=libgfortran.a')) ]
else:
    raise Exception("Scalapack distribution unsupported. Please modify setup.py manually.")


ext_modules = [Extension(# module name:
                         'scalapack_funcs',
                         # source file:
                         sources=['scalapack_funcs.pyx'],
                         # Needed if building with NumPy. This includes the NumPy headers when compiling.
                         include_dirs = [get_include(), mpi4py.get_include()],
                         #include libraries 
                         library_dirs=scl_libdir, libraries=scl_lib,
                         #libraries=["scalapack"],
                         #library_dirs=[library_dirs],
                         # other compile args for gcc
                         extra_compile_args=[ "-fopenmp", "-lmkl_lapack95_lp64", "-lmkl_core", "-lmkl_intel_lp64",  "-lmkl_rt", "-lmkl_intel_thread" ] + mpicompileargs,
                         # other files to link to
                         extra_link_args=['pdgetrf_.o', 'pdgetri_.o', 'scalapack_wrap.o','-fopenmp', "-lmkl_lapack95_lp64", "-lmkl_core", "-lmkl_intel_lp64", "-lmkl_rt", "-lmkl_intel_thread"] + mpilinkargs)]

setup(
      cmdclass = {'build_ext': build_ext},
      ext_modules = ext_modules
      )

The code gets compiled but

>>> import scalapack_funcs
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
ImportError: /opt/intel/mkl/lib/intel64/libmkl_scalapack_lp64.so: undefined symbol: dapply_2hv

As danny pointed out maybe the problem is just correctly compiling the code with the scalapack library. But I am still getting above error message. Any suggestion how to get rid of it?

Have you seen https://docs.scipy.org/doc/scipy/reference/linalg.cython_blas.html#module-scipy.linalg.cython_blas? You can probably use that instead of the `f2pyptr`. (This doesn't help with your main question though) — DavidW, Feb 01 '18 at 20:53
@DavidW How much would it improve the speed of code? I guess not that much, right? — Dalek, Feb 01 '18 at 21:03
Almost nothing I expect - it's possibly just easier to write — DavidW, Feb 01 '18 at 21:04
@DavidW I updated my question, what approach would you suggest to use `scalapack` in cython? — Dalek, Feb 02 '18 at 22:37
I think Saullo Castro's got the function wrapping right. However: I think that using `scalapack` is probably a bad idea. It using `mpi` for multithreading - I don't know too much about `mpi` but I think you have to start multiple versions of your program, decide internally which one is the main one, pass your matrix around to the other thread, then call your matrix functions. It looks like it needs big changes to how your whole function is structured. — DavidW, Feb 03 '18 at 09:29
About your most recent edit: you might need to run Python with `mpi_run` or something similar. However, as I said before, using MPI involves _big_ changes to your whole program. — DavidW, Feb 07 '18 at 17:44
@DavidW I think the ordering of `mkl` components or including other libraries might solve this current problem but I am not very familiar with `mkl`. If you have any other advice or thoughts I will appreciate. — Dalek, Feb 07 '18 at 17:49

Saullo G. P. Castro · Answer 1 · 2018-02-02T23:41:31.953

It seems you got it almost right. Add the specific header file from which you are importing the function, and use the correct name of the FORTRAN function C_PDGETRI. When compiling, be sure to add proper references to "your_fortran_header.h" and the corresponding, previously compiled, FORTRAN static library.

cdef extern from "your_fortran_header.h":
     void C_PDGETRI( long* N, double* A, int* IA, int* JA, int* DESCA, int* IPIV, double* WORK, long* LWORK, int* IWORK, int* LIWORK, int* INFO )

cdef double[:,::1] pdgetri_(double[:,::1] A not None):
     cdef long N = A.shape[0]
     cdef long LWORK = A.shape[1]
     cdef long LIWORK = A.shape[1]
     cdef int INFO = 0
     cdef long IA = 0 #the row index in the global array A indicating the first row of sub( A )
     cdef long JA = 0 #The column index in the global array A indicating the first column of sub( A ).
     cdef double[::1] WORK = np.empty(LWORK, dtype=np.float64)
     cdef int[::1] IWORK = np.empty(LIWORK, dtype=np.int32)
     cdef int[::1] IPIV = np.empty(N, dtype=np.int32)
     cdef int[::1] DESCA = np.empty(N, dtype=np.int32)
     C_PDGETRI(&N, &A[0,0], &IA, &JA, &DESCA[0], &IPIV[0], &WORK[0], &LWORK, &IWORK[0], &LIWORK, &INFO)
     return A

I have been trying to get the code working using your answer, finally I got rid of all error messages just the final one `ImportError: ./scalapack_funcs.so: undefined symbol: ilcm_` and I have no idea how to deal with this one, any suggestion? Thanks. — Dalek, Feb 05 '18 at 18:13

score 1 · Answer 2 · answered Feb 07 '18 at 12:21

1

The extension needs to be linked to the libraries it is using. undefined symbol means a library included in headers is not linked to the shared object and that library's symbol cannot be found.

Presumably ilcm is from scalapack, not sure what the library name is. Add a libraries setting to the Extension with all the libraries it requires.

Extension('scalapack_funcs',
          sources=['scalapack_funcs.pyx'],
          libraries=['scalapack'],
          extra_compile_args=['-fPIC', '-O3'],
          extra_link_args=['pdgetri_.o', 'pdgetrf_.o', 'scalapack_wrap.o'])]

answered Feb 07 '18 at 12:21

danny

5,140
1
19
31

Thanks for the answer but I tried already your suggestion and it did not work. You are right, [ilcm](http://www.netlib.org/scalapack/explore-html/d0/d9b/ilcm_8f_source.html) is a `fortran function` in the `ScaLAPACK` library. – Dalek Feb 07 '18 at 12:44
I guess maybe you are right. I just need to compile the code with `scalapack` library correctly. I changed my `setup.py` code and I got new errors. What do you suggest? – Dalek Feb 07 '18 at 17:04
It is a similar error, same cause (missing linked library). I'd suggest using an actual supported OS for what you are trying to do. Scalapack compiles and runs fine on Linux. Docker or a VM is always an option if you do not have a Linux OS handy. – danny Feb 07 '18 at 17:08

wrapping the ScaLAPACK functions in cython

2 Answers2