Intro
I have a fairly simple Cython module - which I simplified even further for the specific tests I have carried on.
This module has only one class, which has only one interesting method (named run
): this method accepts as an input a Fortran-ordered 2D NumPy array and two 1D NumPy arrays, and does some very, very simple things on those (see below for code).
For the sake of benchmarking, I have compiled the exact same module with MSVC, GCC 8, GCC 11, GCC 12 and GCC 13 on Windows 10 64 bit, Python 3.9.10 64 bit, Cython 0.29.32. All the GCC compilers I have obtained from the excellent Brecht Sanders GitHub page (https://github.com/brechtsanders).
Main Question
The overarching question of this very long post is: I am just curious to know if anyone has any explanation regarding why GCC12 and GCC13 are so much slower than GCC11 (which is the fastest of all). Looks like performances are going down at each release of GCC, rather than getting better...
Benchmarks
In the benchmarking, I simply vary the array dimensions of the 2D and 1D arrays (m
and n
) and the number on nonzero entries in the 2D and 1D arrays. I repeat the run
method 20 times per compiler version, per set of m
and n
and nonzero entries.
Optimization settings I am using:
MVSC
MSVC_EXTRA_COMPILE_ARGS = ['/O2', '/GS-', '/fp:fast', '/Ob2', '/nologo', '/arch:AVX512', '/Ot', '/GL']
GCC
GCC_EXTRA_COMPILE_ARGS = ['-Ofast', '-funroll-loops', '-flto', '-ftree-vectorize', '-march=native', '-fno-asynchronous-unwind-tables']
GCC_EXTRA_LINK_ARGS = ['-flto'] + GCC_EXTRA_COMPILE_ARGS
What I am observing is the following:
- MSVC is by far the slowest at executing the benchmark (why would that be on Windows?)
- The progression GCC8 -> GCC11 is promising, as GCC11 is faster than GCC8
- GCC12 and GCC13 are both significantly slower than GCC11, with GCC13 being the worst (twice as slow as GCC11 and much worse than GCC12)
Table of Results:
Runtimes are in milliseconds (ms)
Graph (NOTE: Logarithmic Y axis!!):
Runtimes are in milliseconds (ms)
Code
Cython file:
###############################################################################
import numpy as np
cimport numpy as np
import cython
from cython.view cimport array as cvarray
from libc.float cimport FLT_MAX
DTYPE_float = np.float32
ctypedef np.float32_t DTYPE_float_t
cdef float SMALL = 1e-10
cdef int MAXSIZE = 1000000
cdef extern from "math.h" nogil:
cdef float fabsf(float x)
###############################################################################
@cython.final
cdef class CythonLoops:
cdef int m, n
cdef int [:] col_starts
cdef int [:] row_indices
cdef double [:] x
def __cinit__(self):
self.m = 0
self.n = 0
self.col_starts = cvarray(shape=(MAXSIZE,), itemsize=sizeof(int), format='i')
self.row_indices = cvarray(shape=(MAXSIZE,), itemsize=sizeof(int), format='i')
self.x = cvarray(shape=(MAXSIZE,), itemsize=sizeof(double), format='d')
@cython.boundscheck(False) # turn off bounds-checking for entire function
@cython.wraparound(False) # turn off negative index wrapping for entire function
@cython.nonecheck(False)
@cython.initializedcheck(False)
@cython.cdivision(True)
cpdef run(self, DTYPE_float_t[::1, :] matrix,
DTYPE_float_t[:] ub_values,
DTYPE_float_t[:] priority):
cdef Py_ssize_t i, j, m, n
cdef int nza, collen
cdef double too_large, ok, obj
cdef float ub, element
cdef int [:] col_starts = self.col_starts
cdef int [:] row_indices = self.row_indices
cdef double [:] x = self.x
m = matrix.shape[0]
n = matrix.shape[1]
self.m = m
self.n = n
nza = 0
collen = 0
for i in range(n):
for j in range(m+1):
if j == 0:
element = priority[i]
else:
element = matrix[j-1, i]
if fabsf(element) < SMALL:
continue
if j == 0:
obj = <double>element
# Do action 1 with external library
else:
collen = nza + 1
col_starts[collen] = i+1
row_indices[collen] = j
x[collen] = <double>element
nza += 1
ub = ub_values[i]
if ub > FLT_MAX:
too_large = 0.0
# Do action 2 with external library
elif ub > SMALL:
ok = <double>ub
# Do action 3 with external library
# Use x, row_indices and col_starts in the external library
Setup file:
I use the following to compile it:
python setup.py build_ext --inplace --compiler=mingw32 gcc13
Where the last argument is the compiler I want to test
#!/usr/bin/env python
from setuptools import setup
from setuptools import Extension
from Cython.Build import cythonize
from Cython.Distutils import build_ext
import numpy as np
import os
import shutil
import sys
import getpass
MODULE = 'loop_cython_%s'
GCC_EXTRA_COMPILE_ARGS = ['-Ofast', '-funroll-loops', '-flto', '-ftree-vectorize', '-march=native', '-fno-asynchronous-unwind-tables']
GCC_EXTRA_LINK_ARGS = ['-flto'] + GCC_EXTRA_COMPILE_ARGS
MSVC_EXTRA_COMPILE_ARGS = ['/O2', '/GS-', '/fp:fast', '/Ob2', '/nologo', '/arch:AVX512', '/Ot', '/GL']
MSVC_EXTRA_LINK_ARGS = MSVC_EXTRA_COMPILE_ARGS
def remove_builds(kind):
for folder in ['build', 'bin']:
if os.path.isdir(folder):
if folder == 'bin':
continue
shutil.rmtree(folder, ignore_errors=True)
if os.path.isfile(MODULE + '_%s.c'%kind):
os.remove(MODULE + '_%s.c'%kind)
def setenv(extra_args, doset=True, path=None, kind='gcc8'):
flags = ''
if doset:
flags = ' '.join(extra_args)
for key in ['CFLAGS', 'FFLAGS', 'CPPFLAGS']:
os.environ[key] = flags
user = getpass.getuser()
if doset:
path = os.environ['PATH']
if kind == 'gcc8':
os.environ['PATH'] = r'C:\Users\%s\Tools\MinGW64_8.0\bin;C:\WINDOWS\system32;C:\WINDOWS;C:\Users\%s\WinPython39\WPy64-39100\python-3.9.10.amd64;'%(user, user)
elif kind == 'gcc11':
os.environ['PATH'] = r'C:\Users\%s\Tools\MinGW64\bin;C:\WINDOWS\system32;C:\WINDOWS;C:\Users\%s\WinPython39\WPy64-39100\python-3.9.10.amd64;'%(user, user)
elif kind == 'gcc12':
os.environ['PATH'] = r'C:\Users\%s\Tools\MinGW64_12.2.0\bin;C:\WINDOWS\system32;C:\WINDOWS;C:\Users\%s\WinPython39\WPy64-39100\python-3.9.10.amd64;'%(user, user)
elif kind == 'gcc13':
os.environ['PATH'] = r'C:\Users\%s\Tools\MinGW64_13.2.0\bin;C:\WINDOWS\system32;C:\WINDOWS;C:\Users\%s\WinPython39\WPy64-39100\python-3.9.10.amd64;'%(user, user)
elif kind == 'msvc':
os.environ['PATH'] = r'C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\MSVC\14.35.32215\bin\Hostx64\x64;C:\WINDOWS\system32;C:\WINDOWS;C:\Users\J0514162\WinPython39\WPy64-39100\python-3.9.10.amd64;C:\Program Files (x86)\Windows Kits\10\bin\10.0.22000.0\x64'
os.environ['LIB'] = r'C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\MSVC\14.35.32215\lib\x64;C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22000.0\um\x64;C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22000.0\ucrt\x64'
os.environ["DISTUTILS_USE_SDK"] = '1'
os.environ["MSSdk"] = '1'
else:
os.environ['PATH'] = path
return path
class CustomBuildExt(build_ext):
def build_extensions(self):
# Override the compiler executables. Importantly, this
# removes the "default" compiler flags that would
# otherwise get passed on to to the compiler, i.e.,
# distutils.sysconfig.get_var("CFLAGS").
self.compiler.set_executable("compiler_so", "gcc -mdll -O -Wall -DMS_WIN64")
self.compiler.set_executable("compiler_cxx", "g++ -O -Wall -DMS_WIN64")
self.compiler.set_executable("linker_so", "gcc -shared -static")
self.compiler.dll_libraries = []
build_ext.build_extensions(self)
if __name__ == '__main__':
os.system('cls')
kind = None
for arg in sys.argv:
if arg.strip() in ['gcc8', 'gcc11', 'gcc12', 'gcc13', 'msvc']:
kind = arg
sys.argv.remove(arg)
break
base_file = os.path.join(os.getcwd(), MODULE[0:-3])
source = base_file + '.pyx'
target = base_file + '_%s.pyx'%kind
shutil.copyfile(source, target)
if kind == 'msvc':
extra_compile_args = MSVC_EXTRA_COMPILE_ARGS[:]
extra_link_args = MSVC_EXTRA_LINK_ARGS[:] + ['/MANIFEST']
else:
extra_compile_args = GCC_EXTRA_COMPILE_ARGS[:]
extra_link_args = GCC_EXTRA_LINK_ARGS[:]
path = setenv(extra_compile_args, kind=kind)
remove_builds(kind)
define_macros = [('WIN32', 1)]
nname = MODULE%kind
include_dirs = [np.get_include()]
if kind == 'msvc':
include_dirs += [r'C:\Program Files (x86)\Windows Kits\10\Include\10.0.22000.0\ucrt',
r'C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\MSVC\14.35.32215\include',
r'C:\Program Files (x86)\Windows Kits\10\Include\10.0.22000.0\shared']
extensions = [
Extension(nname, [nname + '.pyx'],
extra_compile_args=extra_compile_args,
extra_link_args=extra_link_args,
include_dirs=include_dirs,
define_macros=define_macros)]
# build the core extension(s)
setup_kwargs = {'ext_modules': cythonize(extensions,
compiler_directives={'embedsignature' : False,
'boundscheck' : False,
'wraparound' : False,
'initializedcheck': False,
'cdivision' : True,
'language_level' : '3str',
'nonecheck' : False},
force=True,
cache=False,
quiet=False)}
if kind != 'msvc':
setup_kwargs['cmdclass'] = {'build_ext': CustomBuildExt}
setup(**setup_kwargs)
setenv([], False, path)
remove_builds(kind)
Test code:
import os
import numpy
import time
import loop_cython_msvc as msvc
import loop_cython_gcc8 as gcc8
import loop_cython_gcc11 as gcc11
import loop_cython_gcc12 as gcc12
import loop_cython_gcc13 as gcc13
# M N NNZ(matrix) NNZ(priority) NNZ(ub)
DIMENSIONS = [(1661 , 2608 , 3560 , 375 , 2488 ),
(2828 , 3512 , 4333 , 413 , 2973 ),
(780 , 985 , 646 , 23 , 984 ),
(799 , 1558 , 1883 , 301 , 1116 ),
(399 , 540 , 388 , 44 , 517 ),
(10545, 10486, 14799 , 1053 , 10041),
(3369 , 3684 , 3684 , 256 , 3242 ),
(2052 , 5513 , 4772 , 1269 , 3319 ),
(224 , 628 , 1345 , 396 , 594 ),
(553 , 1475 , 1315 , 231 , 705 )]
def RunTest():
print('M N NNZ MSVC GCC 8 GCC 11 GCC 12 GCC 13')
for m, n, nnz_mat, nnz_priority, nnz_ub in DIMENSIONS:
print('%-6d %-6d %-8d'%(m, n, nnz_mat), end='')
for solver, label in zip([msvc, gcc8, gcc11, gcc12, gcc13], ['MSVC', 'GCC 8', 'GCC 11', 'GCC 12', 'GCC 13']):
numpy.random.seed(123456)
size = m*n
idxes = numpy.arange(size)
matrix = numpy.zeros((size, ), dtype=numpy.float32)
idx_mat = numpy.random.choice(idxes, nnz_mat)
matrix[idx_mat] = numpy.random.uniform(0, 1000, size=(nnz_mat, ))
matrix = numpy.asfortranarray(matrix.reshape((m, n)))
idxes = numpy.arange(m)
priority = numpy.zeros((m, ), dtype=numpy.float32)
idx_pri = numpy.random.choice(idxes, nnz_priority)
priority[idx_pri] = numpy.random.uniform(0, 1000, size=(nnz_priority, ))
idxes = numpy.arange(n)
ub_values = numpy.inf*numpy.ones((n, ), dtype=numpy.float32)
idx_ub = numpy.random.choice(idxes, nnz_ub)
ub_values[idx_ub] = numpy.random.uniform(0, 1000, size=(nnz_ub, ))
solver = solver.CythonLoops()
time_tot = []
for i in range(20):
start = time.perf_counter()
solver.run(matrix, ub_values, priority)
elapsed = time.perf_counter() - start
time_tot.append(elapsed*1e3)
print('%-8.4g'%numpy.mean(time_tot), end=' ')
print()
if __name__ == '__main__':
os.system('cls')
RunTest()
EDIT
After @PeterCordes comments, I have changed the optimization flags to this:
MSVC_EXTRA_COMPILE_ARGS = ['/O2', '/GS-', '/fp:fast', '/Ob2', '/nologo', '/arch:AVX512', '/Ot', '/GL', '/QIntel-jcc-erratum']
GCC_EXTRA_COMPILE_ARGS = ['-Ofast', '-funroll-loops', '-flto', '-ftree-vectorize', '-march=native', '-fno-asynchronous-unwind-tables', '-Wa,-mbranches-within-32B-boundaries']
MSVC appears to be marginally faster than before (between 5% and 10%), but GCC12 and GCC13 are slower than before (between 3% and 20%). Below a graph with the results on the largest 2D matrix:
Note: "Current" means with the latest optimization flags suggested by @PeterCordes, "Previous" is the original set of flags.