I would like to go through a list that contains maybe 10,000 lines and do so with a fixed number of threads (say X). I am wondering how I would tell the threads to take Line1 to LineX and, when those threads are finished, start new threads from LineX onward, and so on. Right now the code below starts as many threads as there are entries in my pagenumbers list (a bounded-pool sketch of what I'm after follows the code).
Any hints or suggestions are appreciated; I am still learning Python! Thanks in advance.
import re
import threading
import time

import requests
from bs4 import BeautifulSoup
start = time.time()
dork = input('Input Search term: ')
pagenumbers = ["0", "10", "20", "30", "40", "50"]
def startscrape(search, page):
    # Fetch one page of search results and append the extracted URLs to urls.txt.
    with requests.Session() as req:
        headersx = {'Host': 'www.url.com',
                    'Connection': 'keep-alive',
                    'Upgrade-Insecure-Requests': '1',
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
                    'Sec-Fetch-User': '?1',
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
                    'Sec-Fetch-Site': 'none',
                    'Sec-Fetch-Mode': 'navigate',
                    }
        url = 'https://www.url.com/' + search + '&first=' + page
        resp = req.get(url, headers=headersx)
        soup = BeautifulSoup(resp.text, 'html.parser')
        # Pull the result-title divs, then extract the href targets from them.
        divfound = soup.find_all("div", {"class": "b_title"})
        urlsfound = re.findall(r'href="(.*?)">', str(divfound), re.S)
        with open('urls.txt', 'a', errors='ignore') as f:
            for urls in urlsfound:
                if 'microsofttranslator' not in urls:
                    f.write(urls + "\n")
# This is the part I mean: one thread per entry in pagenumbers, all started at once.
threads = [threading.Thread(target=startscrape, args=(dork, page)) for page in pagenumbers]
for thread in threads:
thread.start()
for thread in threads:
thread.join()
print("Elapsed Time: %s" % (time.time() - start))
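From what I've read, the standard library's concurrent.futures.ThreadPoolExecutor seems to do exactly this: you hand it the whole list, and it keeps at most max_workers threads running, feeding each idle thread the next item as soon as one finishes, so there is no need to chunk the list into Line1..LineX batches by hand. Here is a minimal sketch of that idea, reusing the startscrape function and pagenumbers list from above (MAX_WORKERS is just a placeholder for my X):

from concurrent.futures import ThreadPoolExecutor

MAX_WORKERS = 4  # the "X": at most this many threads run at once

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    # Submit one task per page; the pool queues them and hands
    # each idle thread the next page until the list is exhausted.
    futures = [executor.submit(startscrape, dork, page) for page in pagenumbers]
    for future in futures:
        future.result()  # blocks until done; re-raises any worker exception

One caveat I'm aware of: with several threads appending to urls.txt at the same time, lines can interleave, so wrapping the write in a threading.Lock, or having startscrape return the URLs and writing them from the main thread, would avoid that.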