First of all, I know there are multiple threads that touch this issue, however I could not get a straight answer and encountered some flops miscalculations.
I have prepared a MATLAB and Python benchmark of element-wise multiplication. This is the simplest and most forward way one can easily calculate the flop count.
It uses NxN array (matrix) but does not do a matrix multiplication, rather an element wise multiplication. This is important because when using the matrix multiplication the number of operation is not N^3 !!!
The lower level algorithm that performs matrix multiplication does it in less than N^3 operations.
The execution of element-wise multiplication of randomly generated numbers however, will have to be performed in N^2 operations
I have an intel i7-4770 (which I think have 4 physical cores and 8 virtual cores) @ 3.5GHz. So if assuming 4flops per cycle that should be 14 GFLOPS per core!
MATLAB/Numpy/Scipy don't get anywhere near.
Why?
MATLAB:
%element wise multiplication benchmark
N = 10^4;
nOps = N^2;
m1 = randn(N);
m2 = randn(size(m1));
m = randn(size(m1));
m1 = single(m1);
m2 = single(m2);
% clear m
tic
m1 = m1 .* m2;
t = toc;
gflops = nOps/t*1e-9;
t_gflops = [t gflops]
% clear m
tic
m1 = m1.*m2;
t = toc;
gflops = nOps/t*1e-9;
t_gflops = [t gflops]
% clear m
tic
m1 = m1.*m2;
t = toc;
gflops = nOps/t*1e-9;
t_gflops = [t gflops]
version('-blas')
version('-lapack')
Which results in:
t_gflops =
0.0978 1.0226
t_gflops =
0.0743 1.3458
t_gflops =
0.0731 1.3682
ans =
Intel(R) Math Kernel Library Version 11.1.1 Product Build 20131010 for Intel(R) 64 architecture applications
ans =
Intel(R) Math Kernel Library Version 11.1.1 Product Build 20131010 for Intel(R) 64 architecture applications
Linear Algebra PACKage Version 3.4.1
And now Python:
import numpy as np
# import gnumpy as gnp
import scipy as sp
import scipy.linalg as la
import time
if __name__ == '__main__':
N = 10**4
nOps = N**2
a = np.random.randn(N,N).astype(np.float32)
b = np.random.randn(N,N).astype(np.float32)
t = time.time()
c = a*b
dt = time.time()-t
gflops = nOps/dt*1e-9
print("dt = ", dt, ", gflops = ", gflops)
t = time.time()
c = np.multiply(a, b)
dt = time.time()-t
gflops = nOps/dt*1e-9
print("dt = ", dt, ", gflops = ", gflops)
t = time.time()
c = sp.multiply(a, b)
dt = time.time()-t
gflops = nOps/dt*1e-9
print("dt = ", dt, ", gflops = ", gflops)
t = time.time()
c = sp.multiply(a, b)
dt = time.time()-t
gflops = nOps/dt*1e-9
print("dt = ", dt, ", gflops = ", gflops)
a = np.random.randn(N,1).astype(np.float32)
b = np.random.randn(1,N).astype(np.float32)
t = time.time()
c1 = np.dot(a, b)
dt = time.time()-t
gflops = nOps/dt*1e-9
print("dt = ", dt, ", gflops = ", gflops)
t = time.time()
c = np.dot(a, b)
dt = time.time()-t
gflops = nOps/dt*1e-9
print("dt = ", dt, ", gflops = ", gflops)
t = time.time()
c = la.blas.dgemm(1.0,a,b)
dt = time.time()-t
gflops = nOps/dt*1e-9
print("dt = ", dt, ", gflops = ", gflops)
t = time.time()
c = la._fblas.dgemm(1.0,a,b)
dt = time.time()-t
gflops = nOps/dt*1e-9
print("dt = ", dt, ", gflops = ", gflops)
print("numpy config")
np.show_config()
print("scipy config")
sp.show_config()
# numpy
Which result in:
dt = 0.16301608085632324 , gflops = 0.6134364136022663
dt = 0.16701674461364746 , gflops = 0.5987423610209003
dt = 0.1770176887512207 , gflops = 0.5649152957845881
dt = 0.188018798828125 , gflops = 0.5318617107612401
dt = 0.151015043258667 , gflops = 0.6621856858903415
dt = 0.17201733589172363 , gflops = 0.5813367558659613
dt = 0.3080308437347412 , gflops = 0.3246428142959423
dt = 0.39503931999206543 , gflops = 0.253139358385916
numpy config
mkl_info:
define_macros = [('SCIPY_MKL_H', None), ('HAVE_CBLAS', None)]
libraries = ['mkl_core_dll', 'mkl_intel_lp64_dll', 'mkl_intel_thread_dll']
library_dirs = ['C:\\Minonda\\envs\\_build\\Library\\lib']
include_dirs = ['C:\\Minonda\\envs\\_build\\Library\\include']
lapack_mkl_info:
define_macros = [('SCIPY_MKL_H', None), ('HAVE_CBLAS', None)]
libraries = ['mkl_lapack95_lp64', 'mkl_core_dll', 'mkl_intel_lp64_dll', 'mkl_intel_thread_dll']
library_dirs = ['C:\\Minonda\\envs\\_build\\Library\\lib']
include_dirs = ['C:\\Minonda\\envs\\_build\\Library\\include']
lapack_opt_info:
define_macros = [('SCIPY_MKL_H', None), ('HAVE_CBLAS', None)]
libraries = ['mkl_lapack95_lp64', 'mkl_core_dll', 'mkl_intel_lp64_dll', 'mkl_intel_thread_dll']
library_dirs = ['C:\\Minonda\\envs\\_build\\Library\\lib']
include_dirs = ['C:\\Minonda\\envs\\_build\\Library\\include']
blas_opt_info:
define_macros = [('SCIPY_MKL_H', None), ('HAVE_CBLAS', None)]
libraries = ['mkl_core_dll', 'mkl_intel_lp64_dll', 'mkl_intel_thread_dll']
library_dirs = ['C:\\Minonda\\envs\\_build\\Library\\lib']
include_dirs = ['C:\\Minonda\\envs\\_build\\Library\\include']
openblas_lapack_info:
NOT AVAILABLE blas_mkl_info:
define_macros = [('SCIPY_MKL_H', None), ('HAVE_CBLAS', None)]
libraries = ['mkl_core_dll', 'mkl_intel_lp64_dll', 'mkl_intel_thread_dll']
library_dirs = ['C:\\Minonda\\envs\\_build\\Library\\lib']
include_dirs = ['C:\\Minonda\\envs\\_build\\Library\\include']
scipy config
mkl_info:
define_macros = [('SCIPY_MKL_H', None), ('HAVE_CBLAS', None)]
libraries = ['mkl_core_dll', 'mkl_intel_lp64_dll', 'mkl_intel_thread_dll']
library_dirs = ['C:\\Minonda\\envs\\_build\\Library\\lib']
include_dirs = ['C:\\Minonda\\envs\\_build\\Library', 'C:\\Minonda\\envs\\_build\\Library\\include', 'C:\\Minonda\\envs\\_build\\Library\\lib']
lapack_mkl_info:
define_macros = [('SCIPY_MKL_H', None), ('HAVE_CBLAS', None)]
libraries = ['mkl_lapack95_lp64', 'mkl_core_dll', 'mkl_intel_lp64_dll', 'mkl_intel_thread_dll']
library_dirs = ['C:\\Minonda\\envs\\_build\\Library\\lib']
include_dirs = ['C:\\Minonda\\envs\\_build\\Library', 'C:\\Minonda\\envs\\_build\\Library\\include', 'C:\\Minonda\\envs\\_build\\Library\\lib']
lapack_opt_info:
define_macros = [('SCIPY_MKL_H', None), ('HAVE_CBLAS', None)]
libraries = ['mkl_lapack95_lp64', 'mkl_core_dll', 'mkl_intel_lp64_dll', 'mkl_intel_thread_dll']
library_dirs = ['C:\\Minonda\\envs\\_build\\Library\\lib']
include_dirs = ['C:\\Minonda\\envs\\_build\\Library', 'C:\\Minonda\\envs\\_build\\Library\\include', 'C:\\Minonda\\envs\\_build\\Library\\lib']
blas_opt_info:
define_macros = [('SCIPY_MKL_H', None), ('HAVE_CBLAS', None)]
libraries = ['mkl_core_dll', 'mkl_intel_lp64_dll', 'mkl_intel_thread_dll']
library_dirs = ['C:\\Minonda\\envs\\_build\\Library\\lib']
include_dirs = ['C:\\Minonda\\envs\\_build\\Library', 'C:\\Minonda\\envs\\_build\\Library\\include', 'C:\\Minonda\\envs\\_build\\Library\\lib']
openblas_lapack_info:
NOT AVAILABLE blas_mkl_info:
define_macros = [('SCIPY_MKL_H', None), ('HAVE_CBLAS', None)]
libraries = ['mkl_core_dll', 'mkl_intel_lp64_dll', 'mkl_intel_thread_dll']
library_dirs = ['C:\\Minonda\\envs\\_build\\Library\\lib']
include_dirs = ['C:\\Minonda\\envs\\_build\\Library', 'C:\\Minonda\\envs\\_build\\Library\\include', 'C:\\Minonda\\envs\\_build\\Library\\lib']
Process finished with exit code 0