I want to iterate through a large data structure in a Python program and perform a task for each element. For simplicity, let's say the elements are integers and the task is just an increment; at the end, the last incremented element is returned as a (dummy) result. In search of the best structure/method to do this, I compared timings in pure Python and Cython for these structures (I could not find a direct comparison of them elsewhere):
- Python list
- NumPy array / typed memory view
- Cython extension type with underlying C++ vector
The iterations I timed are:
- Python for-each iteration over the list (it_list)
- Cython iteration over the list with explicit element access (cit_list)
- Python for-each iteration over the NumPy array (it_nparray)
- NumPy vectorised operation (vec_nparray)
- Cython iteration over the memory view with explicit element access (cit_memview)
- Python for-each iteration over the vector exposed via the data property (it_pyvector; returning the vector converts it to a Python list)
- Python for-each iteration over the extension type via __iter__ (it_pyvector_iterator)
- Cython iteration over the vector with explicit element access (cit_pyvector)
- Cython iteration over the vector via vector.iterator (cit_pyvector_iterator)
I am concluding from this (timings are below):
- plain Python iteration over the NumPy array is extremely slow (about 10 times slower than Python list iteration), presumably because every element access creates a boxed NumPy scalar (see the snippet after this list) -> not a good idea
- Python iteration over the wrapped C++ vector is slow, too (roughly 1.5 to 2 times slower than Python list iteration in my runs) -> not a good idea
- Cython iteration over the wrapped C++ vector is the fastest option, approximately on par with the C-contiguous memory view
- iteration over the vector with explicit element access is slightly faster than using an iterator -> why bother with an iterator? (a for-in variant is sketched after the Cython code below)
- the memory view approach has a comparatively large call overhead for small n compared with the extension type approach
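To illustrate the first point: every element access on a NumPy array creates a boxed scalar object, which as far as I understand is the main cost of plain Python iteration over it. A quick check:

import numpy as np

a = np.arange(3, dtype=np.int64)
# Each access boxes the raw value into a new NumPy scalar object,
# whereas a list just hands back a reference to an existing int.
print(type(a[0]))  # <class 'numpy.int64'>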
My questions are: Are my numbers reliable, i.e. did I do something wrong or miss anything here? Is this in line with your experience from real-world examples? Is there anything else I could do to improve the iteration? The code I used and the timings are below; I am running this in a Jupyter notebook, by the way. Suggestions and comments are highly appreciated!
Relative timings (minimum value 1.000) for different data structure sizes n:
================================================================================
Timings for n = 1:
--------------------------------------------------------------------------------
cit_pyvector_iterator: 1.000
cit_pyvector: 1.005
cit_list: 1.023
it_list: 3.064
it_pyvector: 4.230
it_pyvector_iterator: 4.937
cit_memview: 8.196
vec_nparray: 20.187
it_nparray: 25.310
================================================================================
================================================================================
Timings for n = 1000:
--------------------------------------------------------------------------------
cit_pyvector_iterator: 1.000
cit_pyvector: 1.001
cit_memview: 2.453
vec_nparray: 5.845
cit_list: 9.944
it_list: 137.694
it_pyvector: 199.702
it_pyvector_iterator: 218.699
it_nparray: 1516.080
================================================================================
================================================================================
Timings for n = 1000000:
--------------------------------------------------------------------------------
cit_pyvector: 1.000
cit_memview: 1.056
cit_pyvector_iterator: 1.197
vec_nparray: 2.516
cit_list: 7.089
it_list: 87.099
it_pyvector_iterator: 143.232
it_pyvector: 162.374
it_nparray: 897.602
================================================================================
================================================================================
Timings for n = 10000000:
--------------------------------------------------------------------------------
cit_pyvector: 1.000
cit_memview: 1.004
cit_pyvector_iterator: 1.060
vec_nparray: 2.721
cit_list: 7.714
it_list: 88.792
it_pyvector_iterator: 130.116
it_pyvector: 149.497
it_nparray: 872.798
================================================================================
Cython code:
%%cython --annotate
# distutils: language = c++
# cython: boundscheck = False
# cython: wraparound = False

from libcpp.vector cimport vector
from cython.operator cimport dereference as deref, preincrement as princ


# Extension type wrapping a C++ vector
cdef class pyvector:
    cdef vector[long] _data

    cpdef void push_back(self, long x):
        self._data.push_back(x)

    def __iter__(self):
        cdef size_t i, n = self._data.size()
        for i in range(n):
            yield self._data[i]

    @property
    def data(self):
        # Returning the C++ vector from a def function converts it
        # to a Python list.
        return self._data


# Cython iteration over a Python list
cpdef long cit_list(list l):
    cdef:
        long j, ii
        size_t i, n = len(l)
    for i in range(n):
        ii = l[i]
        j = ii + 1
    return j


# Cython iteration over a NumPy array via a typed memory view
cpdef long cit_memview(long[::1] v) nogil:
    cdef:
        size_t i, n = v.shape[0]
        long j
    for i in range(n):
        j = v[i] + 1
    return j


# Cython iteration over the wrapped vector with explicit element access
cpdef long cit_pyvector(pyvector v) nogil:
    cdef:
        size_t i, n = v._data.size()
        long j
    for i in range(n):
        j = v._data[i] + 1
    return j


# Cython iteration over the wrapped vector via vector.iterator
cpdef long cit_pyvector_iterator(pyvector v) nogil:
    cdef:
        vector[long].iterator it = v._data.begin()
        long j
    while it != v._data.end():
        j = deref(it) + 1
        princ(it)
    return j
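One variant I did not time above: Cython can compile a plain for ... in loop over a C++ vector into an iterator loop, so the manual deref/princ bookkeeping is not needed. A sketch that would go in the same %%cython cell (the name cit_pyvector_forin is mine):

# Untimed sketch: Cython translates this for-loop over the C++ vector
# into an iterator-based loop at compile time.
cpdef long cit_pyvector_forin(pyvector v) nogil:
    cdef long x, j = 0
    for x in v._data:
        j = x + 1
    return j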
Python code:
# Python iteration over a Python list
def it_list(l):
    for i in l:
        j = i + 1
    return j
# Python iteration over a NumPy array
def it_nparray(a):
    for i in a:
        j = i + 1
    return j
# Vectorised NumPy operation
def vec_nparray(a):
    # Keep the result so the last *incremented* element is returned;
    # a bare `a + 1` would be discarded and a[-1] would be unincremented.
    b = a + 1
    return b[-1]
# Python iteration over the extension type via __iter__
def it_pyvector_iterator(v):
    for i in v:
        j = i + 1
    return j


# Python iteration via the data property (vector converted to a list)
def it_pyvector(v):
    for i in v.data:
        j = i + 1
    return j
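For comparison, an in-place vectorised variant that avoids allocating a temporary array (an untimed sketch; note that it mutates the input, so repeated %timeit runs would operate on already-incremented data unless the array is copied first):

import numpy as np

def vec_nparray_inplace(a):
    # In-place add: no temporary array, but `a` is modified.
    np.add(a, 1, out=a)
    return a[-1]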
And for the benchmark:
import numpy as np
from operator import itemgetter


def bm(sizes):
    """Call functions with data structures of varying length"""
    Timings = {}
    for n in sizes:
        Timings[n] = {}
        # Python list
        list_ = list(range(n))
        # NumPy array
        a = np.arange(n, dtype=np.int64)
        # C++ vector extension type
        pyv = pyvector()
        for i in range(n):
            pyv.push_back(i)

        calls = [
            (it_list, list_),
            (cit_list, list_),
            (it_nparray, a),
            (vec_nparray, a),
            (cit_memview, a),
            (it_pyvector, pyv),
            (it_pyvector_iterator, pyv),
            (cit_pyvector, pyv),
            (cit_pyvector_iterator, pyv),
        ]
        for fxn, arg in calls:
            Timings[n][fxn.__name__] = %timeit -o fxn(arg)
    return Timings
def ratios(timings, base=None):
    """Show relative performance of runs based on `timings` dict"""
    if base is not None:
        base = timings[base].average
    else:
        base = min(x.average for x in timings.values())
    return sorted([
        (k, v.average / base)
        for k, v in timings.items()
    ], key=itemgetter(1))
Timings = {}
sizes = [1, 1000, 1000000, 10000000]
Timings.update(bm(sizes))

for s in sizes:
    print("=" * 80)
    print(f"Timings for n = {s}:")
    print("-" * 80)
    for x in ratios(Timings[s]):
        print(f"{x[0]:>25}: {x[1]:7.3f}")
    print("=" * 80, "\n")
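In case ranking by the fastest run is preferred over the average (the usual timeit convention), the TimeitResult objects returned by %timeit -o also expose a .best attribute; a small variant of ratios() (sketch):

def best_ratios(timings):
    """Like ratios(), but rank by the fastest run instead of the average."""
    base = min(x.best for x in timings.values())
    return sorted([
        (k, v.best / base)
        for k, v in timings.items()
    ], key=itemgetter(1))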