An iterator producing the integers in the file:
from itertools import chain, groupby
from functools import partial
def numbers(file):
    """Yield each whitespace-separated integer from a text-mode file.

    The file is consumed in reads of 1024 characters, and the chunks are
    flattened into one character stream, so a number that straddles two
    reads is still parsed as a single token. Tokens are delimited purely
    by whitespace, so a leading sign (e.g. '-5') is kept; a token that
    int() cannot parse raises ValueError.
    """
    read_block = partial(file.read, 1024)
    # iter(callable, sentinel) keeps calling read until it returns '' (EOF).
    characters = chain.from_iterable(iter(read_block, ''))
    # groupby alternates between runs of whitespace and runs of
    # non-whitespace; only the latter are joined into tokens.
    tokens = (
        ''.join(run)
        for is_blank, run in groupby(characters, key=str.isspace)
        if not is_blank
    )
    for token in tokens:
        yield int(token)
It splits on whitespace, so it also handles negative numbers.
Benchmark with a shuffled range(10 ** 6):
2.03 seconds numbers_Johnny_Mopp
0.99 seconds numbers_Kelly_Bundy
2.06 seconds numbers_Johnny_Mopp
0.92 seconds numbers_Kelly_Bundy
2.04 seconds numbers_Johnny_Mopp
0.95 seconds numbers_Kelly_Bundy
Full code with sample data creator, correctness check, and benchmark:
from timeit import timeit
from random import randint, choices, shuffle
from itertools import chain, groupby
from functools import partial
from collections import deque
# Create test data: the integers 0 .. 10**6 - 1 in random order, each one
# followed by one or two characters drawn at random from space, tab, newline.
numbers = list(range(10 ** 6))
shuffle(numbers)
with open('test.txt', 'w') as out:
    for value in numbers:
        gap = ''.join(choices(' \t\n', k=randint(1, 2)))
        out.write(f'{value}{gap}')
def numbers_Kelly_Bundy(file):
    """Yield the whitespace-separated integers of a text-mode file.

    Reads 1024 characters at a time until read() returns '', flattens the
    chunks into a single stream of characters, and groups that stream
    into alternating whitespace / non-whitespace runs. Each
    non-whitespace run is converted with int(), so signed tokens such as
    '-5' are accepted.
    """
    stream = chain.from_iterable(iter(partial(file.read, 1024), ''))
    for is_space, run in groupby(stream, str.isspace):
        if is_space:
            # Separator run: nothing to emit.
            continue
        yield int(''.join(run))
def numbers_Johnny_Mopp(file):
    """Return a generator of the non-negative integers found in a file.

    Every maximal run of decimal digits becomes one integer. Any other
    character (whitespace, signs, letters, ...) only separates runs, so
    the text '-5' yields 5.

    Args:
        file: A text-mode file-like object; read(n) must return a str and
            a falsy value (the empty string) at end of input.

    Returns:
        A generator yielding ints in the order they appear in the file.
    """
    def file_buf_gen(f, bytes_per_read=1024):
        # Yield the file's characters one at a time, reading in chunks.
        while buffer := f.read(bytes_per_read):
            yield from buffer

    # Yields all numbers in a file. Ignores anything that is not a
    # decimal digit.
    def read_numbers(file):
        num = None  # None means "not currently inside a run of digits"
        for b in file_buf_gen(file):
            # isdecimal() matches exactly the characters int() can
            # convert. isnumeric() also accepts characters such as '²'
            # or '½', on which int() raises ValueError.
            if b.isdecimal():
                num = int(b) if num is None else num * 10 + int(b)
            elif num is not None:
                yield num
                num = None
        if num is not None:
            # The input ended while a number was still being accumulated.
            yield num

    return read_numbers(file)
# The implementations under comparison, in the order they are reported.
funcs = numbers_Johnny_Mopp, numbers_Kelly_Bundy

# Correctness check: each implementation must reproduce the shuffled list
# that was written to test.txt.
for func in funcs:
    with open('test.txt') as file:
        parsed = list(func(file))
    print(parsed == numbers)

# Speed tests: three rounds, each timing one full pass per implementation.
for _ in range(3):
    for func in funcs:
        with open('test.txt') as file:
            # A zero-length deque drains the generator without storing
            # any of the yielded values.
            t = timeit(lambda: deque(func(file), maxlen=0), number=1)
        print('%4.2f seconds ' % t, func.__name__)
    print()