I'm trying to multiprocess one of my scripts. The script reads all data from file, and write to another file the relevant lines (for example, all students who's first name is Jacob).
The original script was:
search_val = "jacob"
with open("big_file.txt") as f:
with open("matches.txt", "w") as f_out:
for line in f:
if (search_val in line.lower()):
f_out.write(line)
This script and the multiprocess
both generate good results for small files, but original script also works on bib files.
The multiprocess script is:
from multiprocessing import Pool
import threading
import time
import Queue
import sys
search_val = {"key1" : [], "key2" : [], "key3":[]}
def process_line(line):
global search_val
key_val_list = []
for key in search_val.keys():
if (key.lower() in line.lower()):
search_val[key].append(line.strip())
key_val_list.append({key:line})
return key_val_list
#with open("big_file.txt") as f:
def get_lines():
with open("small_file.txt") as f:
yield f
if __name__ == "__main__":
pool = Pool(8)
file_lines = get_lines()
start = time. time()
end = time. time()
#print(end - start)
results = pool.map(process_line, next(file_lines), 8)
#pool.close()
#print(results)
print("Done reading")
end = time. time()
print(end - start)
with open("results.txt", "w") as f_out:
f_out.write(str(results))
print("Done saving results")
end = time. time()
print(end - start)
print_dict= {}
for line in results:
for result in line:
for key in result.keys():
if key in print_dict.keys():
print_dict[key].append(result[key].strip())
else:
print_dict[key] = [result[key].strip()]
print("Done ordering")
end = time. time()
print(end - start)
for key in print_dict.keys():
with open(key+".txt", "w") as f_out:
for val in print_dict[key]:
f_out.write(val + "\n")
you can use this as small file:
key2@key1.co.il
key3@key1.net
key4@key1.co.uk
key5@key1.co.il
This script works fine for small files, but doesn't generate any results (doesn't even prints "Done reading") for the big_file. Big_file size is 11 GB.
I have 2 questions:
- Did I use yield the way I suppose to?
- Do you have any Idea why it doesn't work?
I also tried to update search_val map (which is global parameter), but that didn't work either, so I tried the list option.
If you have any ideas you are more than welcome to share.