EDIT: Instead of do-it-yourself chunking, it's better to use the chunking feature of pandas, which is much, much faster than numpy's `loadtxt`.
import numpy as np
import pandas as pd
## create csv file for testing
np.random.seed(1)
nrows, ncols = 100000, 4
data = np.random.uniform(size=(nrows, ncols))
np.savetxt('bigdata.csv', data, delimiter=',')

## read it back
chunk_rows = 12345
# Replace np.empty by np.memmap array for large datasets.
odata = np.empty((nrows, ncols), dtype=np.float32)
oindex = 0
# `names` supplies the column labels explicitly, so the first line of the
# file is parsed as data rather than being consumed as a header row.
chunks = pd.read_csv('bigdata.csv', chunksize=chunk_rows,
                     names=['a', 'b', 'c', 'd'])
for chunk in chunks:
    # Each chunk is a DataFrame with at most chunk_rows rows.
    m, _ = chunk.shape
    # Copy the chunk into the next free rows of the preallocated output
    # (values are cast to the float32 dtype of odata on assignment).
    odata[oindex:oindex+m, :] = chunk.to_numpy()
    oindex += m

# check that it worked correctly.
# np.empty leaves unfilled rows uninitialized, so first make sure that
# every preallocated row was actually written.
assert oindex == nrows
assert np.allclose(data, odata, atol=1e-7)
The `pd.read_csv` function in chunked mode returns a special object that can be used in a loop such as `for chunk in chunks:`; at every iteration, it will read a chunk of the file and return its contents as a pandas `DataFrame`, which can be treated as a numpy array in this case. The parameter `names` is needed to prevent it from treating the first line of the csv file as column names.
Old answer below
The `numpy.loadtxt` function works with a filename or something that will return lines in a loop in a construct such as:
    for line in f:
        do_something()
It doesn't even need to pretend to be a file; a list of strings will do!
We can read chunks of the file that are small enough to fit in memory and provide batches of lines to `np.loadtxt`.
def get_file_lines(fname, seek, maxlen):
    """Read lines from a section of a file.

    Reads at most `maxlen` bytes starting at byte offset `seek` and returns
    the part of that chunk up to and including its last newline, so that a
    following call with the returned `seek_end` starts on a line boundary.

    Parameters:
    - fname: filename
    - seek: start position in the file
    - maxlen: maximum length (bytes) to read

    Return:
    - lines: list of lines (only entire lines). Because the returned text
      ends with a newline, the last list element is an empty string.
      An empty list means that nothing could be read at `seek`
      (end of file).
    - seek_end: seek position at end of this chunk.

    Raises:
    - ValueError: if no newline is found within the last 10000 bytes of
      the chunk (e.g. a final line without a terminating newline, or a
      line longer than the chunk).

    Reference: https://stackoverflow.com/a/63043614/6228891
    Copying: any of CC-BY-SA, CC-BY, GPL, BSD, LPGL
    Author: Han-Kwang Nienhuys
    """
    # binary for Windows \r\n line endings; byte offsets stay exact.
    # The context manager closes the file on every exit path.
    with open(fname, 'rb') as f:
        f.seek(seek)
        buf = f.read(maxlen)
    n = len(buf)
    if n == 0:
        return [], seek

    # Find the last newline near the end of the chunk. Searching with
    # rfind avoids the buf[-0] == buf[0] pitfall of a manual negative-index
    # loop, which would inspect the FIRST byte of the chunk first.
    newline_pos = buf.rfind(b'\n', max(0, n - 10000))
    if newline_pos < 0:
        raise ValueError('Could not find end of line')

    # Keep everything up to and including that newline.
    buflen = newline_pos + 1
    lines = buf[:buflen].decode('utf-8').split('\n')
    seek_end = seek + buflen
    return lines, seek_end
import numpy as np
## create csv file for testing
np.random.seed(1)
nrows, ncols = 10000, 4
data = np.random.uniform(size=(nrows, ncols))
np.savetxt('bigdata.csv', data, delimiter=',')

# read it back
fpos = 0
chunksize = 456 # Small value for testing; make this big (megabytes).
# we will store the data here. Replace by memmap array if necessary.
odata = np.empty((nrows, ncols), dtype=np.float32)
oindex = 0
while True:
    # Fetch the next batch of complete lines and the offset to resume from.
    lines, fpos = get_file_lines('bigdata.csv', fpos, chunksize)
    if not lines:
        # end of file
        break
    # ndmin=2 keeps the result two-dimensional even when a chunk holds a
    # single row; otherwise loadtxt returns a 1-D array and the shape
    # unpacking below would not yield the row count.
    rdata = np.loadtxt(lines, delimiter=',', ndmin=2)
    m, _ = rdata.shape
    # Copy the parsed rows into the next free rows of the output array.
    odata[oindex:oindex+m, :] = rdata
    oindex += m

assert np.allclose(data, odata, atol=1e-7)
Disclaimer: I tested this on Linux. I expect it to work on Windows as well, but the handling of '\r' characters could cause problems.