I was tired of waiting while loading a simple distance matrix from a csv
file using numpy.genfromtxt
. Following another SO question, I performed a perfplot test, while including some additional methods. The results (source code at the end):
The result for the largest input size shows that the best method is read_csv
, which is this:
def load_read_csv(path: str):
with open(path, 'r') as csv_file:
reader = csv.reader(csv_file)
matrix = None
first_row = True
for row_index, row in enumerate(reader):
if first_row:
size = len(row)
matrix = np.zeros((size, size), dtype=int)
first_row = False
matrix[row_index] = row
return matrix
Now I doubt that reading the file line by line, converting it to the list of strings, then calling int()
on each item in the list and adding it to NumPy matrix is the best possible way.
Can this function be optimized further, or is there some fast library for CSV loading (like Univocity parser in Java), or maybe just a dedicated NumPy function?
The source code of the test:
import perfplot
import csv
import numpy as np
import pandas as pd
def load_read_csv(path: str):
with open(path, 'r') as csv_file:
reader = csv.reader(csv_file)
matrix = None
first_row = True
for row_index, row in enumerate(reader):
if first_row:
size = len(row)
matrix = np.zeros((size, size), dtype=int)
first_row = False
# matrix[row_index] = [int(item) for item in row]
matrix[row_index] = row
return matrix
def load_loadtxt(path: str):
matrix = np.loadtxt(path, dtype=int, comments=None, delimiter=",", encoding="utf-8")
return matrix
def load_genfromtxt(path: str):
matrix = np.genfromtxt(path, dtype=int, comments=None, delimiter=",", deletechars=None, replace_space=None, encoding="utf-8")
return matrix
def load_pandas(path: str):
df = pd.read_csv(path, header=None, dtype=np.int32)
return df.values
def load_pandas_engine_pyarrow(path: str):
df = pd.read_csv(path, header=None, dtype=np.int32, engine='pyarrow')
return df.values
def load_pandas_engine_python(path: str):
df = pd.read_csv(path, header=None, dtype=np.int32, engine='python')
return df.values
def setup(n):
matrix = np.random.randint(0, 10000, size=(n, n), dtype=int)
filename = f"square_matrix_of_size_{n}.csv"
np.savetxt(filename, matrix, fmt="%d", delimiter=",")
return filename
b = perfplot.bench(
setup=setup, # or setup=np.random.rand
kernels=[
load_read_csv,
load_loadtxt,
load_genfromtxt,
load_pandas,
load_pandas_engine_pyarrow,
load_pandas_engine_python
],
n_range=[2 ** k for k in range(15)]
)
b.save("out.png")
b.show()