I am trying to download a number of random articles (let's say 5000) listed in an index file. I wrote a multiprocessing version of this program, but it runs more than twice as slow as the regular single-process version. Why is this?
Code attached:
import time
import bs4
import requests
import os
import urllib.parse
import re
import random
from multiprocessing import Pool

list_of_files = os.listdir("..\\sample_set")

# Read the index file into memory
with open("C:\\Users\\useradmin\\Desktop\\index.txt", "r", encoding="utf-8") as index:
    lines = index.readlines()

# Pick 5000 random line numbers and pull the article title out of each indexed line
my_randoms = random.sample(range(1, 18458000), 5000)
titles = []
for num in my_randoms:
    titles.append(lines[num].split(":")[2].strip("\n"))

start_time = time.perf_counter()  # time.clock() was removed in Python 3.8

def job(title):
    # Only download titles that are not already in the sample set
    if not any(title in file for file in list_of_files):
        pass  # download the article with requests.get() (omitted)

if __name__ == '__main__':
    pool = Pool()
    pool.map(job, titles)
    print(time.perf_counter() - start_time, "seconds")
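For reference, here is roughly what the regular single-process version I am comparing against looks like (same setup as above, with the actual download logic inside job() still stubbed out):

import os
import random
import time

list_of_files = os.listdir("..\\sample_set")

with open("C:\\Users\\useradmin\\Desktop\\index.txt", "r", encoding="utf-8") as index:
    lines = index.readlines()

my_randoms = random.sample(range(1, 18458000), 5000)
titles = [lines[num].split(":")[2].strip("\n") for num in my_randoms]

start_time = time.perf_counter()

def job(title):
    # Only download titles that are not already in the sample set
    if not any(title in file for file in list_of_files):
        pass  # download the article with requests.get() (omitted)

# Plain loop in place of Pool.map()
for title in titles:
    job(title)

print(time.perf_counter() - start_time, "seconds")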
Thank you!