There are five categories, and each category contains multiple TXT files. I need to count the number of occurrences of each word in each file. I wanted to improve the efficiency of the program by using coroutines (gevent). However, a comparison shows that the program actually slows down when coroutines are used. I don't know why. Am I using coroutines the wrong way?
Here is the code:
import os
import re
import time
from collections import Counter
import gevent
# Compiled once at import time so the patterns are not looked up again for
# every document (and every token). Raw strings avoid the invalid "\s" escape.
_TOKEN_SEPARATOR = re.compile(r"[\s-]")
_WORD_PATTERN = re.compile(r"[a-zA-Z']+")


def count_words(content):
    """Count how often each word occurs in ``content``.

    The text is split on whitespace and hyphens. Double quotes are stripped
    from both ends of every token, and a token is counted only if what
    remains consists entirely of ASCII letters and apostrophes; anything
    else (numbers, tokens with attached punctuation, empty strings) is
    ignored. Counting is case-sensitive.

    Args:
        content: The text to analyse.

    Returns:
        A ``collections.Counter`` mapping each accepted word to its count.
    """
    # Strip each token exactly once, then filter on the stripped form.
    stripped_tokens = (token.strip('"') for token in _TOKEN_SEPARATOR.split(content))
    return Counter(token for token in stripped_tokens if _WORD_PATTERN.fullmatch(token))
def process_single_document(folder_name, document_path):
    """Count the words of one document and append the counts to that file.

    The file is opened for reading and writing, its whole content is read
    and handed to ``count_words``, and one ``"<word> <count>"`` line per
    distinct word (sorted by word) is then written back to the same file.

    Args:
        folder_name: Name of the folder the document belongs to; used as the
            prefix of the key in the returned mapping.
        document_path: Path of the UTF-8 text file to process.

    Returns:
        A one-entry dict ``{"<folder_name>_<file basename>": total}`` where
        ``total`` is the sum of all word counts for the document.
    """
    with open(document_path, mode='r+', encoding='utf8') as handle:
        tally = count_words(handle.read())
        # The read above consumed the whole file, so this write lands after
        # the existing content.
        report_lines = [f'{word} {tally[word]}' for word in sorted(tally)]
        handle.write('\n'.join(report_lines))

    result_key = f'{folder_name}_{os.path.basename(document_path)}'
    total_words = sum(tally.values())
    return {result_key: total_words}
@timecost
def process_single_folder(folder_path):
    """Process every entry of ``folder_path`` one after another.

    Each directory entry is passed to ``process_single_document`` and the
    per-document results are merged into a single dict. The number of merged
    entries is printed before returning.

    Args:
        folder_path: Directory whose entries should be processed.

    Returns:
        A dict combining the single-entry dicts returned for each document.
    """
    category = os.path.basename(folder_path)
    merged = {}
    for entry in os.listdir(folder_path):
        per_document = process_single_document(category, os.path.join(folder_path, entry))
        merged.update(per_document)
    print(f'file counts: {len(merged)}')
    return merged
@timecost
def process_single_folder_coroutine(folder_path):
    """Process every entry of ``folder_path`` using one greenlet per document.

    A greenlet running ``process_single_document`` is spawned for each
    directory entry; once all of them have finished, their results are merged
    into a single dict and the number of merged entries is printed.

    NOTE: greenlets only switch when the running code yields to gevent.
    The per-document work here is a plain ``open``/``read``/``write`` plus
    regex-based counting, which (without gevent monkey-patching, and being
    CPU-bound anyway) does not yield, so the greenlets effectively run one
    after another. That matches the recorded timings, which show no gain over
    the sequential version. For CPU-bound work like this, a process pool is
    the usual way to get a real speed-up.

    Args:
        folder_path: Directory whose entries should be processed.

    Returns:
        A dict combining the single-entry dicts returned for each document.

    Raises:
        Exception: Whatever a document's processing raised (for example
            ``OSError`` or ``UnicodeDecodeError``), re-raised from the join.
    """
    folder_name = os.path.basename(folder_path)
    greenlets = [
        gevent.spawn(process_single_document, folder_name, os.path.join(folder_path, file))
        for file in os.listdir(folder_path)
    ]
    # Without raise_error=True a failed greenlet leaves ``value`` as None and
    # the merge below dies with an unrelated TypeError, hiding the real error.
    gevent.joinall(greenlets, raise_error=True)
    folder_res = {}
    for greenlet in greenlets:
        folder_res.update(greenlet.value)
    print(f'file counts: {len(folder_res)}')
    return folder_res
Here is the result:
--- type: athletics ---
file counts: 101
process_single_folder cost time: 7.486446142196655
process_single_folder_coroutine cost time: 8.679980754852295
--- type: cricket ---
file counts: 124
process_single_folder cost time: 10.89174485206604
process_single_folder_coroutine cost time: 11.055976390838623
--- type: football ---
file counts: 265
process_single_folder cost time: 20.97000026702881
process_single_folder_coroutine cost time: 20.88500189781189
--- type: rugby ---
file counts: 147
process_single_folder cost time: 12.401747703552246
process_single_folder_coroutine cost time: 12.073007106781006
--- type: tennis ---
file counts: 100
process_single_folder cost time: 6.553194522857666
process_single_folder_coroutine cost time: 6.651681184768677