My main objective is to scrape as many profile links as possible on Khan Academy. And then scrape some specific data on each of these profiles.
My goal with this question is to use threading to make my script work much faster.
So I will present my code in two parts: the first part without threading and the second part with threading.
This is the original code without threading:
import csv
import re

from bs4 import BeautifulSoup
from requests_html import HTMLSession
from selenium import webdriver
from selenium.common.exceptions import TimeoutException,StaleElementReferenceException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# Fetch the course landing page, let requests_html render it, and parse the
# rendered HTML.
session = HTMLSession()
r = session.get('https://www.khanacademy.org/computing/computer-programming/programming#intro-to-programming')
r.html.render(sleep=5)
soup = BeautifulSoup(r.html.html, 'html.parser')

# find course steps links
courses_links = soup.find_all(class_='link_1uvuyao-o_O-nodeStyle_cu2reh-o_O-nodeStyleIcon_4udnki')

# Maps each course step title to its absolute URL.
list_courses = {}
for course_anchor in courses_links:
    # Read the attributes in place; there is no need to detach the tag from
    # the parsed tree with extract() just to look at it.
    link_course = course_anchor.get('href')
    title_course = course_anchor.find(class_='nodeTitle_145jbuf')
    # Skip anchors without an href or without the expected title markup
    # instead of aborting the whole run on one unexpected element.
    if link_course is None or title_course is None or title_course.span is None:
        continue
    text_span = title_course.span.text.strip()
    list_courses[text_span] = 'https://www.khanacademy.org' + link_course
# Loop over every course link collected in list_courses, gather the profile
# links shown on that course page, then scrape each profile into one CSV file.

CSV_FILENAME = "khanscrapetry1.csv"
# Header line, written verbatim so the column names stay exactly as before.
CSV_HEADER = "link, date_joined, points, videos, questions, votes, answers, flags, project_request, project_replies, comments, tips_thx, last_date\n"
SHOW_MORE_CLASS = 'button_1eqj1ga-o_O-shared_1t8r4tr-o_O-default_9fm203'
# Labels looked up in the discussion statistics, in CSV column order.
# might change answers to answer because when it's 1 it's putting NA instead
DISCUSSION_KEYS = ['questions', 'votes', 'answers', 'flags raised', 'project help requests', 'project help replies', 'comments', 'tips and thanks']


def _expand_page(driver, timeout=15):
    """Click the "show more" button until it no longer shows up."""
    while True:
        try:
            button = WebDriverWait(driver, timeout).until(
                EC.presence_of_element_located((By.CLASS_NAME, SHOW_MORE_CLASS)))
            button.click()
        except (TimeoutException, StaleElementReferenceException):
            break


def _profile_links(page_source):
    """Return the distinct absolute profile URLs found in *page_source*."""
    page = BeautifulSoup(page_source, 'html.parser')
    found = []
    for anchor in page.find_all(href=re.compile("/profile/kaid")):
        href = anchor['href']
        # Drop the trailing "discussion" segment only when it is really
        # there; a blind [:-10] would mangle any other profile link.
        if href.endswith('discussion'):
            href = href[:-len('discussion')]
        found.append('https://www.khanacademy.org' + href)
    # dict.fromkeys removes duplicates while keeping first-seen order.
    return list(dict.fromkeys(found))


def _profile_row(page_source, link):
    """Build one CSV row (a list of strings) from a rendered profile page.

    Any value that cannot be found on the page is reported as 'NA'.
    """
    page = BeautifulSoup(page_source, 'html.parser')

    dates = points = videos = 'NA'
    stats_table = page.find('table', class_='user-statistics-table')
    if stats_table is not None:
        cells = [tr.find_all('td') for tr in stats_table.find_all('tr')]
        values = [tds[1].text for tds in cells if len(tds) > 1]
        # Only unpack when the table has exactly the three expected rows.
        if len(values) == 3:
            dates, points, videos = values

    data = {}
    for stat in page.find_all('div', class_='discussion-stat'):
        label = stat.find('span')
        if label is None:
            continue
        number = label.previousSibling
        # The count is the text node right before the label span.
        if isinstance(number, str):
            data[label.text.strip()] = number.strip()

    last_activity_date = 'NA'
    calendar = page.find('div', class_='streak-calendar-scroll-container')
    if calendar is not None:
        last_activity = calendar.find('span', class_='streak-cell filled')
        if last_activity is not None:
            last_activity_date = last_activity.get('title', 'NA')

    row = [link, dates, points.replace(",", ""), videos]
    row.extend(data.get(key, 'NA') for key in DISCUSSION_KEYS)
    row.append(last_activity_date)
    return row


# Open the CSV once, outside the course loop: reopening it with "w" for every
# course would throw away the rows written for the previous courses.
with open(CSV_FILENAME, "w") as f:
    f.write(CSV_HEADER)
    # csv.writer quotes fields that contain commas; "\n" keeps the same line
    # endings the hand-written rows had.
    writer = csv.writer(f, lineterminator="\n")
    # to avoid scraping the same profile multiple times across courses
    scraped_profiles = set()

    for courses_step in list_courses.values():
        driver = webdriver.Chrome()
        try:
            driver.get(courses_step)
            _expand_page(driver)
            profile_list = _profile_links(driver.page_source)

            # print number of profiles we got
            print('in this link:')
            print(courses_step)
            print('we have this number of profiles:')
            print(len(profile_list))

            # for each profile link, scrape the specific data and store it
            for link in profile_list:
                if link in scraped_profiles:
                    continue
                scraped_profiles.add(link)
                print("Scraping ", link)
                driver.get(link)
                # wait for content to load; if profile does not exist skip
                try:
                    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, '//*[@id="widget-list"]/div[1]/div[1]')))
                except TimeoutException:
                    continue
                writer.writerow(_profile_row(driver.page_source, link))
        finally:
            # Always close the browser, even if scraping this course failed.
            driver.quit()
This code should work fine. But the problem is: it's taking way too much time.
And here is the script that includes threading:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException,StaleElementReferenceException
from bs4 import BeautifulSoup
import re
from requests_html import HTMLSession
import concurrent.futures
# Fetch the course landing page, let requests_html render it, and parse the
# rendered HTML.
session = HTMLSession()
r = session.get('https://www.khanacademy.org/computing/computer-programming/programming#intro-to-programming')
r.html.render(sleep=5)
soup = BeautifulSoup(r.html.html, 'html.parser')

# find course steps links
courses_links = soup.find_all(class_='link_1uvuyao-o_O-nodeStyle_cu2reh-o_O-nodeStyleIcon_4udnki')

# Maps each course step title to its absolute URL.
list_courses = {}
for course_anchor in courses_links:
    # Read the attributes in place; extract() would detach the tag from the
    # parsed tree for no benefit.
    link_course = course_anchor.get('href')
    title_course = course_anchor.find(class_='nodeTitle_145jbuf')
    # Skip anchors without an href or without the expected title markup
    # instead of aborting the whole run on one unexpected element.
    if link_course is None or title_course is None or title_course.span is None:
        continue
    text_span = title_course.span.text.strip()
    list_courses[text_span] = 'https://www.khanacademy.org' + link_course
#that's my driver function
def showmore(url, timeout):
    """Open *url* in Chrome, expand it fully and return the page HTML.

    The "show more" button is clicked repeatedly until it has not appeared
    within *timeout* seconds (or the located element went stale).

    Args:
        url: Address of the course page to load.
        timeout: Seconds to wait for the button before giving up.

    Returns:
        The page source of the expanded page, as reported by the driver.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        while True:
            try:
                # Named "button" so the local does not shadow this function.
                button = WebDriverWait(driver, timeout).until(
                    EC.presence_of_element_located(
                        (By.CLASS_NAME, 'button_1eqj1ga-o_O-shared_1t8r4tr-o_O-default_9fm203')))
                button.click()
            except (TimeoutException, StaleElementReferenceException):
                break
        return driver.page_source
    finally:
        # The driver is local to this call; quit it so each worker does not
        # leave a Chrome process behind.
        driver.quit()
#that's my pool
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    # Submit one showmore() call per course link and remember which URL each
    # future belongs to.
    future_to_url = {executor.submit(showmore, url, 20): url for url in list_courses.values()}
    # Collect the futures as they finish. Without calling result(), an
    # exception raised inside a worker would be stored on the future and
    # never reported.
    for future in concurrent.futures.as_completed(future_to_url):
        url = future_to_url[future]
        try:
            future.result()
        except Exception as exc:
            print('%r generated an exception: %s' % (url, exc))
As you can see the second script is not doing everything yet. I still have to add the whole data scraping / writing process.
My question is: How do I create threads for the scrape and write parts? In what order should these threads run?
More broadly: How can I make my script run as fast as possible?