If speed matters most, you'll want to cut as much dynamism as you can out of your row-in-reader loop... at least I think so... I haven't timed or profiled this, just going off your own analysis that DictReader was too slow.
The fastest way I could think of to deal with an arbitrary order of columns: explicitly name the columns you expect, then look up their indices in the header:
def fancy_iter(csv_fname: str) -> Generator[Fancy, None, None]:
    reader = csv.reader(open(csv_fname))
    header = next(reader)
    # Hard-coded list of columns your team knows and maintains
    idx_a = header.index("col_A")
    idx_b = header.index("col_B")
    idx_c = header.index("col_C")
    for row in reader:
        yield Fancy(
            a=row[idx_a],
            b=int(row[idx_b]),
            c=RomanNumeral(row[idx_c]),
        )
Otherwise, like you have now, you'll be doing some kind of field-to-column mapping lookup inside the row-in-reader loop, and I can see that really dragging down performance over millions of rows. But, again, that's just my intuition/speculation.
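If you want to put a rough number on that before committing, here's a small, self-contained timing sketch (the synthetic data, row count, and column names are made up for illustration) comparing DictReader to the precomputed-index approach:

import csv
import io
import timeit

# Synthetic CSV: 100k identical rows under a made-up header
data = "col_A,col_B,col_C\n" + "one,1,i\n" * 100_000

def with_dictreader():
    for row in csv.DictReader(io.StringIO(data)):
        _ = (row["col_A"], int(row["col_B"]), row["col_C"])

def with_indices():
    reader = csv.reader(io.StringIO(data))
    header = next(reader)
    idx_a = header.index("col_A")
    idx_b = header.index("col_B")
    idx_c = header.index("col_C")
    for row in reader:
        _ = (row[idx_a], int(row[idx_b]), row[idx_c])

print("DictReader:", timeit.timeit(with_dictreader, number=5))
print("indices:   ", timeit.timeit(with_indices, number=5))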
Here's my complete test program (and I've changed your values/semantics around again, to try to better illustrate the scope of the problem):
import csv
import io
from dataclasses import dataclass
from enum import Enum
from typing import Generator


class RomanNumeral(Enum):
    I = "i"
    II = "ii"
    III = "iii"
    IV = "iv"
    V = "v"


@dataclass
class Fancy:
    a: str
    b: int
    c: RomanNumeral

    def __repr__(self) -> str:
        a = f"a='{self.a}',"
        b = f"b={self.b},"
        c = f"c={self.c}"
        return f"Fancy( {a:<10} {b:<4} {c:<18} )"


def fancy_iter(csv_file: io.TextIOBase) -> Generator[Fancy, None, None]:
    reader = csv.reader(csv_file, skipinitialspace=True)
    header = next(reader)
    # Hard-coded list of columns your team knows and maintains
    idx_a = header.index("col_A")
    idx_b = header.index("col_B")
    idx_c = header.index("col_C")
    for row in reader:
        if "" in row:
            continue  # the row is "incomplete"; don't try, just move on
        yield Fancy(
            a=row[idx_a],
            b=int(row[idx_b]),
            c=RomanNumeral(row[idx_c]),
        )
def main():
    # Two sets of "files" with different column orders; the second also has
    # invalid rows
    csv_strs = [
        """
        col_A,col_B,col_C
        one,1,i
        four,4,iv
        three,3,iii
        """,
        """
        col_C,col_A,col_B
        i,one,1
        ii,two,2
        iii,three,3
        iv,,4
        v,five,
        ,,
        ,
        """,
    ]
    for i, csv_str in enumerate(csv_strs, start=1):
        csv_file = io.StringIO(csv_str.strip())
        fancies = list(fancy_iter(csv_file))
        print(f"{i}:")
        for fancy in fancies:
            print(f"  {fancy}")


if __name__ == "__main__":
    main()
Output:

1:
  Fancy( a='one',   b=1, c=RomanNumeral.I   )
  Fancy( a='four',  b=4, c=RomanNumeral.IV  )
  Fancy( a='three', b=3, c=RomanNumeral.III )
2:
  Fancy( a='one',   b=1, c=RomanNumeral.I   )
  Fancy( a='two',   b=2, c=RomanNumeral.II  )
  Fancy( a='three', b=3, c=RomanNumeral.III )
I also swapped the try/except for an explicit if/continue, based on my understanding of this answer, Cost of exception handlers: the if-check is faster overall, and definitely faster in the expected case that some rows will be invalid.
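If you want to sanity-check that on your own machine, here's a tiny stand-alone comparison (the row values are made up; the relative numbers will depend on how often rows are actually incomplete):

import timeit

good = ["one", "1", "i"]
bad = ["", "", ""]  # an "incomplete" row

def with_if(row):
    # skip incomplete rows up front
    if "" in row:
        return None
    return int(row[1])

def with_try(row):
    # let the conversion fail and catch it
    try:
        return int(row[1])
    except ValueError:
        return None

for label, row in (("good", good), ("bad", bad)):
    t_if = timeit.timeit(lambda: with_if(row), number=1_000_000)
    t_try = timeit.timeit(lambda: with_try(row), number=1_000_000)
    print(f"{label} row: if={t_if:.3f}s  try={t_try:.3f}s")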
Your team will have to keep the fieldnames in the dataclass and the idx_* vars in the function in sync, but that's the tradeoff (as I see it) for more speed at runtime. Anyway, you appear to rely on type hints (and maybe a linter?), which will help catch mismatches. Some mismatches will at least result in a runtime error:
- if the iter func falls out of sync with the CSV itself (a column was renamed, or the CSV is missing a column), the header.index() mapping will fail
- if the iter func falls out of sync with the dataclass, the Fancy() init will fail
And you could just keep a unit test with the complete set of columns, and keep it current as the columns change.
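Something like this, say (a hypothetical test; I'm assuming Fancy, RomanNumeral, and fancy_iter live in main2.py, as the linter below does):

import io
import unittest

from main2 import Fancy, RomanNumeral, fancy_iter

class TestFancyIter(unittest.TestCase):
    def test_complete_set_of_columns(self):
        # A one-row CSV that exercises every expected column
        csv_file = io.StringIO("col_A,col_B,col_C\none,1,i\n")
        fancies = list(fancy_iter(csv_file))
        self.assertEqual(fancies, [Fancy(a="one", b=1, c=RomanNumeral.I)])

if __name__ == "__main__":
    unittest.main()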
For fun
I came up with a scheme of putting docstrings with the CSV column names below the dataclass fieldnames, matching the column names in the iter func's index-mapping lines, like:
a: str
"""col_A"""
b: int
"""col_B"""
c: RomanNumeral
"""col_C"""
and came up with a "linter" (my first time trying to use the ast module) that ensures those docstrings match the string literals in the fancy iter func:
# Ensure that the fancy_iter() function "knows" the correct and
# complete mapping of CSV column names to the Fancy dataclass
# fieldnames.
import ast
import sys

MAIN_PY = "main2.py"


def get_dataclass_cols(dataclass_node: ast.ClassDef) -> set[str]:
    """
    Look in the Fancy dataclass for pairs of lines of
    fieldname-line and docstring-line (column name in CSV), like:

        class Fancy:
            a: str
            '''col_A'''
            b: int
            '''col_B'''

    and return a set of CSV column names, e.g., {'col_A', 'col_B'}
    """
    _node = dataclass_node
    cols: set[str] = set()
    # Looking for pairs of AST objects like:
    #   AnnAssign( ... )                    <-- an AnnAssign node
    #   ... followed by...
    #   Expr(                               <-- an Expr node
    #       value=Constant(value='col_A'))  <-- w/a Constant w/a string value (the column name)
    for i in range(len(_node.body) - 1):  # -1: node1 always needs a following node2
        # Verify "lines" 1 & 2 are AnnAssign and Expr
        node1 = _node.body[i]
        if not isinstance(node1, ast.AnnAssign):
            continue
        node2 = _node.body[i + 1]
        if not isinstance(node2, ast.Expr):
            continue
        expr = node2
        # Verify Expr has string Constant
        if not isinstance(expr.value, ast.Constant):
            continue
        const = expr.value
        if not isinstance(const.value, str):
            continue
        cols.add(const.value)
    return cols
def get_iterfunc_cols(func_node: ast.FunctionDef) -> set[str]:
    """
    Look in the CSV iter func for lines assigning column names to indexes,
    beginning with "idx_", like:

        idx_a = header.index("col_A")
        idx_b = header.index("col_B")

    and return a set of CSV column names, e.g., {'col_A', 'col_B'}
    """
    cols: set[str] = set()
    # Looking for AST objects like:
    #   Assign(                                  <-- an Assign node
    #       targets=[                            <-- w/a target
    #           Name(id='idx_b', ctx=Store())],  <-- w/a Name that starts with 'idx_'
    #       value=Call(                          <-- and a Call node...
    #           ...
    #           args=[                           <-- w/an arg
    #               Constant(value='col_B') ],   <-- w/a Constant w/a string value (the column name)
    #   )
    for node in func_node.body:
        # Verify Assign with correct Name
        if not isinstance(node, ast.Assign):
            continue
        if len(node.targets) == 0:
            continue
        target = node.targets[0]
        if not isinstance(target, ast.Name):
            continue
        name = target
        if not name.id.startswith("idx_"):
            continue
        if not isinstance(node.value, ast.Call):
            continue
        # Verify Call with correct string Constant
        call = node.value
        if len(call.args) == 0:
            continue
        arg = call.args[0]
        if not isinstance(arg, ast.Constant):
            continue
        const = arg
        if not isinstance(const.value, str):
            continue
        cols.add(const.value)
    return cols
def error(msg: str):
    print("Error, " + msg, file=sys.stderr)
    sys.exit(1)


def main():
    iterfunc_cols: set[str] = set()
    dataclass_cols: set[str] = set()
    main_body = ast.parse(open(MAIN_PY).read()).body
    for node in main_body:
        if isinstance(node, ast.FunctionDef) and node.name == "fancy_iter":
            iterfunc_cols = get_iterfunc_cols(node)
        if isinstance(node, ast.ClassDef) and node.name == "Fancy":
            dataclass_cols = get_dataclass_cols(node)
    if len(dataclass_cols) == 0:
        error("did not find any columns in the dataclass")
    if len(iterfunc_cols) == 0:
        error("did not find any columns in the iter func")
    if iterfunc_cols != dataclass_cols:
        err_msg = "\n".join(
            [
                "columns do not match:",
                "  dataclass_cols: %s" % sorted(dataclass_cols),
                "  iterfunc_cols: %s" % sorted(iterfunc_cols),
            ]
        )
        error(err_msg)


if __name__ == "__main__":
    main()
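Side note: if you want to see the node shapes those isinstance() chains are matching, ast.dump() prints structures like the ones transcribed in the comments above (its indent parameter needs Python 3.9+), e.g.:

import ast

# Dump the Assign/Call/Constant structure that get_iterfunc_cols() looks for
print(ast.dump(ast.parse('idx_a = header.index("col_A")').body[0], indent=2))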
So long as your dataclass and iter func are in sync:
a: str
"""col_A"""          idx_a = header.index("col_A")
b: int
"""col_B"""          idx_b = header.index("col_B")
c: RomanNumeral
"""col_C"""          idx_c = header.index("col_C")
the linter is happy. But as soon as the linter doesn't find any columns in either the dataclass or the iter func, or it sees that the two are out of sync:
a: str
"""col_A"""          idx_a = header.index("col_a")
b: int
"""col_B"""          idx_b = header.index("col_B")
c: RomanNumeral
"""col_C"""          idx_c = header.index("col_C")

Error, columns do not match:
  dataclass_cols: ['col_A', 'col_B', 'col_C']
  iterfunc_cols: ['col_B', 'col_C', 'col_a']
or
a: str
"""col_A"""          idx_a = header.index("col_A")
b: int
"""col_B"""          idx_b = header.index("col_B")
c: RomanNumeral
"""col_C"""          idx_c = header.index("col_C")
d: float
"""col_D"""

Error, columns do not match:
  dataclass_cols: ['col_A', 'col_B', 'col_C', 'col_D']
  iterfunc_cols: ['col_A', 'col_B', 'col_C']
Either of those mismatches would also raise an exception at runtime:
    idx_a = header.index("col_a")
            ^^^^^^^^^^^^^^^^^^^^^
ValueError: 'col_a' is not in list
or:
    yield Fancy(
          ^^^^^^
TypeError: Fancy.__init__() missing 1 required positional argument: 'd'