I'm running image processing on a huge dataset with multiprocessing, and I'm wondering whether running a ThreadPoolExecutor inside a Pool provides any benefit versus simply running Pool on all items.
The dataset contains multiple folders, each containing images, so my initial thought was to assign each folder to a process and each image in that folder to a thread. The other way would be to just collect every image and run each one as a process.
For instance, each folder as a process and each image as a thread:
from concurrent import futures
from multiprocessing import Pool
from pathlib import Path
def handle_image(image_path: Path) -> None:
    """Process a single image (placeholder; the example does no real work).

    Args:
        image_path: Path of the image to process.
    """
def handle_folder(folder_path: Path) -> None:
    """Run ``handle_image`` on every direct child of a folder using a thread pool.

    Args:
        folder_path: Folder whose entries (``folder_path.glob("*")``) are
            handed to ``handle_image``.

    Raises:
        Exception: Whatever ``handle_image`` raised for the first failing
            entry encountered while collecting results.
    """
    with futures.ThreadPoolExecutor() as e:
        # Executor.map only re-raises a worker's exception when the matching
        # result is retrieved, so the returned iterator must be drained;
        # discarding it would hide every failure in handle_image.
        for _ in e.map(handle_image, folder_path.glob("*")):
            pass
    # No explicit e.shutdown() is needed: leaving the with-block already waits
    # for outstanding work and shuts the executor down.
if __name__ == '__main__':
    # Root directory; each entry directly below it is dispatched to one
    # worker process via handle_folder.
    dataset_folder = Path("Folder")
    with Pool() as p:
        # imap_unordered returns an iterator of results. An exception raised
        # in a worker is only re-raised here when that result is fetched, so
        # consume the iterator rather than dropping it.
        for _ in p.imap_unordered(handle_folder, dataset_folder.iterdir()):
            pass
        # Let the workers exit cleanly before the with-block's terminate().
        p.close()
        p.join()
Versus each image as a process:
from multiprocessing import Pool
from pathlib import Path
def handle_image(image_path: Path) -> None:
    """Process a single image, ignoring paths that are not regular files.

    Args:
        image_path: Candidate path; anything for which ``is_file()`` is
            false (for example a directory) is skipped.
    """
    if not image_path.is_file():
        return
    # Actual image processing would go here; the example leaves it empty.
if __name__ == '__main__':
    # Root directory; every path found recursively below it becomes one task.
    dataset_folder = Path("Folder")
    with Pool() as p:
        # Tasks are sent to the workers in batches of 100 (chunksize).
        # The result iterator is consumed so that an exception raised in a
        # worker is re-raised here instead of being silently dropped.
        results = p.imap_unordered(
            handle_image, dataset_folder.glob("**/*"), chunksize=100
        )
        for _ in results:
            pass
        # Let the workers exit cleanly before the with-block's terminate().
        p.close()
        p.join()