I'm currently web-scraping about 10,000 products from a retail website (e.g., Amazon) every day to keep track of their price history.
Scraping consists of two parts. First, I collect the product "listings", where I get the basic information (product name, price, ID, and URL of each listing) and save it to a pandas DataFrame. Second, using each product's URL, I collect more detailed information about the product and attach it one by one to additional columns of the DataFrame. I have no issue with the first part (it takes under 10 minutes), but the second part usually takes more than 15 hours to complete.
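A rough breakdown of where the time goes: each product needs two POST requests plus 2 × 0.3 s of sleep, so assuming (my guess) about 2 s of network latency per request, the serial loop costs roughly 10,000 × (2 × 2 s + 0.6 s) ≈ 46,000 s, or close to 13 hours. Almost all of that is spent waiting on the network rather than doing CPU work.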
Below is a sample code. It is not the real code, but the actual code is just a longer version of it.
import requests
import re
import time
import json
import pandas as pd
from user_agent import generate_user_agent
df_dic = {
    "product_name": ['product1', 'product2', 'product3', 'product4', 'product5'],
    "product_price": ['500', '800', '300', '700', '1000'],
    "product_id": ['1000', '1001', '1002', '1003', '1004'],
    "product_url": ['url1', 'url2', 'url3', 'url4', 'url5'],
}
# df is the data scraped from the first part
df = pd.DataFrame(df_dic)
# the real code has more than 50 of these characteristic columns
for n in range(1, 11):
    df[f'product_chracter{n}'] = ""
# Below is the beginning of the second part, where detailed product characteristics
# (more than 50 in the real code) are attached to the DataFrame
for i_url in df['product_url']:
    try:
        mask = df['product_url'] == i_url
        product_id = df.loc[mask, 'product_id'].iloc[0]
        params = {'productSeq': product_id}
        headers = {'User-Agent': generate_user_agent(device_type='smartphone', navigator='chrome')}
        baseline_url = r'https://www.something.com'
        html = requests.post(baseline_url, headers=headers, params=params).text
        time.sleep(0.3)
        # assign via df.loc[mask, col] so the update hits the original DataFrame
        for n in range(1, 6):
            df.loc[mask, f'product_chracter{n}'] = re.findall(rf'var product_chracter{n} = "(.+?)";', html)[0]
        baseline_url_2 = r'https://www.something_2.com'
        html = requests.post(baseline_url_2, headers=headers, params=params).text
        time.sleep(0.3)
        for n in range(6, 11):
            df.loc[mask, f'product_chracter{n}'] = re.findall(rf'var product_chracter{n} = "(.+?)";', html)[0]
    except Exception as e:
        print(f"Some error happened at {i_url}")
        print(e)
        continue
filename = 'site_date'
df.to_pickle(f'{filename}.pkl')
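As an aside, every requests.post call above opens a fresh TCP/TLS connection. A requests.Session reuses connections to the same host, which could shave some time off each iteration even before any parallelism. A minimal sketch (the header and parameter values here are illustrative, not my real ones):

import requests

session = requests.Session()  # pooled connections are kept alive across calls

# drop-in replacement for the bare requests.post calls in the loop above
html = session.post(r'https://www.something.com',
                    headers={'User-Agent': 'Mozilla/5.0'},
                    params={'productSeq': '1000'},
                    timeout=10).text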
I used try and except because some products are sold out while scraping or do not show some characteristics, which causes errors. I'm only using requests.get and requests.post, not Selenium. My question is: how can I use multiprocessing or multithreading in Python to make my code run faster? I read that ThreadPoolExecutor from the concurrent.futures library can help, but I really have no idea how to implement it in my case.
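For reference, the generic pattern I came across in the concurrent.futures documentation looks roughly like this (fetch and urls are placeholders, not my real code); I just do not see how to map my DataFrame updates onto it:

import concurrent.futures
import requests

def fetch(url):
    # placeholder worker: download one page
    return requests.get(url, timeout=10).text

urls = ['url1', 'url2', 'url3']
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
    # submit one task per URL and map each future back to its URL
    future_to_url = {executor.submit(fetch, u): u for u in urls}
    for future in concurrent.futures.as_completed(future_to_url):
        url = future_to_url[future]
        try:
            data = future.result()
        except Exception as exc:
            print(f'{url} generated an exception: {exc}')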
Any help, different approaches, or comments would be greatly appreciated. Thank you for your time and consideration.
Updated: 2022-07-02
Thanks to @GohKohHan's comment, I was able to apply ThreadPoolExecutor to my code, and it now takes almost 1/max_workers of the time the previous code took. I know my code is not perfect, and a lot of people here are Python experts, so I hope you will point out any mistakes or anything that could be improved in the following code.
import requests
import re
import time
import pandas as pd
from user_agent import generate_user_agent
import concurrent.futures

# df is the DataFrame produced by the first part of the scrape
start_time = time.time()
urls = df['product_url']
def return_dataframe(i_url):
    try:
        mask = df['product_url'] == i_url
        product_id = df.loc[mask, 'product_id'].iloc[0]
        params = {'productSeq': product_id}
        headers = {'User-Agent': generate_user_agent(device_type='smartphone', navigator='chrome')}
        row = {'unique': i_url}
        baseline_url = r'https://www.something.com'
        html = requests.post(baseline_url, headers=headers, params=params).text
        time.sleep(0.3)
        for n in range(1, 6):
            row[f'product_chracter{n}'] = re.findall(rf'var product_chracter{n} = "(.+?)";', html)[0]
        baseline_url_2 = r'https://www.something_2.com'
        html = requests.post(baseline_url_2, headers=headers, params=params).text
        time.sleep(0.3)
        for n in range(6, 11):
            row[f'product_chracter{n}'] = re.findall(rf'var product_chracter{n} = "(.+?)";', html)[0]
        # one-row DataFrame for this product
        return pd.DataFrame([row])
    except Exception as e:
        print(f"Some error happened at {i_url}")
        print(e)
        return None  # signal failure for this URL
detailed_agg = []
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
    future_to_url = {executor.submit(return_dataframe, i_url): i_url for i_url in urls}
    for future in concurrent.futures.as_completed(future_to_url):
        try:
            result = future.result()
            if result is not None:  # skip products whose scrape failed
                detailed_agg.append(result)
        except Exception as exc:
            print(f'{future_to_url[future]} generated an exception: {exc}')
df_detailed = pd.concat(detailed_agg, ignore_index=True)
df_agg = pd.merge(df, df_detailed, how='left', left_on=['product_url'], right_on=['unique'])
# How long did it take?
print(f"--- {(time.time() - start_time) / 60:.1f} minutes ---")
Any suggestions would be greatly appreciated!