I'm trying to read a 400 MB text file and I'm reading it in chunks as I need to extract the words from it and I'm trying to use thread pool but it running longer than I ran as a single process.
Below are the two functions,
def process_text(self): # processing the text file
# if valid path then enter into try statement
try:
with open(self.path, 'r') as f:
pool = ThreadPool(20) # creating a thread pool of size 20
print("Processing text file ...")
while True:
data = list(islice(f, 100)) # slicing input file in size of 100 lines
if not data: break
pool.map(self.word_counting, data) #calling word_counting function which is extracting the words and storing the words in a dictionary
pool.close()
pool.join()
def word_counting(self, cur_list):
for line in cur_list:
for word in re.findall(r'\w{2,}', line):# will check for word of length greater than 1 i.e >1
self.word_dic[word] = self.word_dic.get(word, 0) + 1
Can Anyone Help with that?