On Unix systems, a feature called forking lets a child process read (but not write back to) the parent process's data with zero overhead. Normally, you would have to copy the data over to each worker process, but forking a process in Unix allows you to circumvent this.
Using this, each job in the pool can access the whole input string and extract only the part that it will work on. It can then split and parse that section of the string on its own and return the sum of the integers in its section.
from multiprocessing import Pool, cpu_count
from time import time
def serial(data):
    """Return the sum of the whitespace-separated integers in *data*.

    An empty (or all-whitespace) string sums to 0. ``int`` raises
    ``ValueError`` for any token that is not a valid integer literal.
    """
    total = 0
    for token in data.split():
        total += int(token)
    return total
def parallel(data):
    """Sum the space-separated integers in *data* using one worker per CPU.

    Instead of sending the string to every worker, *data* is published
    under a private module-level name before the pool is created and each
    job is only told that name, its job number and the total job count.
    Workers that inherit the parent's memory (fork start method) or share
    it (thread-backed ``multiprocessing.dummy.Pool``) can then look the
    string up without it being copied per task.

    NOTE: with a start method that does not inherit the parent's globals
    (e.g. spawn), workers will not find the published name.

    Returns the sum of the per-job partial sums.
    """
    processes = cpu_count()
    shared_name = "_parallel_shared_input"
    # Publish the argument itself (rather than relying on the caller having
    # a global called "input_") and do so *before* the pool starts, so the
    # workers created below can see it.
    globals()[shared_name] = data
    try:
        with Pool(processes) as pool:
            args = zip(
                [shared_name] * processes,  # name of global to access
                range(processes),           # job number
                [processes] * processes,    # total number of jobs
            )
            # chunksize=1: exactly one task per job tuple.
            return sum(pool.map(job, args, chunksize=1))
    finally:
        # Do not keep the (possibly huge) string alive after the call.
        globals().pop(shared_name, None)
def job(args):
    """Worker entry point: sum the integers in this job's slice of the input.

    *args* is a ``(global_name, job_number, total_jobs)`` tuple. The input
    string is looked up in this module's globals under ``global_name``
    rather than being passed in directly.
    """
    name, index, count = args
    shared_text = globals()[name]
    return serial(get_chunk(shared_text, index, count))
def get_chunk(string, job_number, total_jobs):
    """Return the slice of *string* that job *job_number* should process.

    The string is cut into ``total_jobs`` pieces of roughly equal length,
    and each cut is moved forward so that no number is split: a number that
    straddles a boundary belongs to the earlier chunk. Concatenating the
    tokens of chunks ``0 .. total_jobs - 1`` yields every token of
    *string* exactly once. A chunk may be empty when there are more jobs
    than numbers.

    Only the space character ``" "`` is recognised as a separator when
    adjusting the boundaries.

    Raises ``ZeroDivisionError`` if *total_jobs* is 0.
    """
    length = len(string)
    approx_chunk_size = length // total_jobs
    # initial estimates
    start = approx_chunk_size * job_number
    end = start + approx_chunk_size

    if start and not string.startswith(" ", start - 1):
        # The previous character is not a separator, so `start` is inside a
        # number (or on the space after it) that the previous chunk owns:
        # skip to just past the next space, or to the end if there is none.
        next_space = string.find(" ", start)
        start = length if next_space == -1 else next_space + 1

    if job_number == total_jobs - 1:
        # Last job (job numbers are 0-based): take everything that is left,
        # including the remainder when the length is not evenly divisible.
        end = length
    elif end and not string.startswith(" ", end - 1):
        # string[end - 1] is part of a number: extend to the end of that
        # number, or to the end of the string if no space follows.
        # (`end` is 0 only when there are more jobs than characters; such
        # jobs get an empty chunk and the last job takes the whole string.)
        next_space = string.find(" ", end - 1)
        end = length if next_space == -1 else next_space
    return string[start:end]
def timeit(func, *args, **kwargs):
    """Call ``func(*args, **kwargs)``, print how long it took, return its result.

    The elapsed wall-clock time is measured with ``time.time`` and printed
    together with ``func.__name__``.
    """
    began = time()
    outcome = func(*args, **kwargs)
    elapsed = time() - began
    print("{} took {} seconds".format(func.__name__, elapsed))
    return outcome
if __name__ == "__main__":
    # To debug with a thread-backed pool instead of processes, uncomment:
    # from multiprocessing.dummy import Pool
    # One million copies of the same ten-digit number, space separated.
    input_ = "1000000000 " * int(1e6)
    parallel_total = timeit(parallel, input_)
    serial_total = timeit(serial, input_)
    # Both strategies must agree on the sum.
    assert parallel_total == serial_total