I have a lot of web scraping to do, so I switched to a headless browser hoping that would speed things up, but it didn't improve the speed by much.
I looked at the Stack Overflow post "Is Selenium slow, or is my code wrong?", but I don't understand the answer someone wrote there.
Here is my slow code:
# followed this tutorial https://medium.com/@stevennatera/web-scraping-with-selenium-and-chrome-canary-on-macos-fc2eff723f9e
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import requests
from bs4 import BeautifulSoup

options = webdriver.ChromeOptions()
options.binary_location = '/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary'
options.add_argument('window-size=800,841')  # Chrome expects width,height separated by a comma, not an x
options.add_argument('headless')  # run Chrome without a visible window
driver = webdriver.Chrome(options=options)  # chrome_options= is deprecated; newer Selenium uses options=
driver.get('https://poshmark.com/search?')
xpath = '//input[@id="user-search-box"]'
# find_element_by_xpath was removed in Selenium 4; find_element(By.XPATH, ...) works in 3 and 4
search_box = driver.find_element(By.XPATH, xpath)
brand = "anthropology"
style = "headband"
search_box.send_keys(' '.join([brand, style]))
# equivalent of hitting the Enter key
search_box.send_keys(Keys.ENTER)
url = driver.current_url
print(url)
response = requests.get(url)
print(response)
print(response.text)
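# note: at this point the search page has been fetched twice:
# once by headless Chrome (driver.get) and again by requests.get(url)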
# using Beautiful Soup to grab the listings
html = response.content
soup = BeautifulSoup(html, 'html.parser')
# 'a' as in links, i.e. anchor tags
anchor_tags = soup.find_all('a')
# href is the hyperlink target
hyper_links = [link.get("href") for link in anchor_tags]
# (better visualized like this)
# for link in anchor_tags:
#     print(link.get("href"))
# keep links that contain "listing"; some hrefs are None, which is why the `listing and` check is needed
clothing_listings = {listing for listing in hyper_links if listing and "listing" in listing}
# using a set because some of the links are repeated
print(len(clothing_listings))
print(clothing_listings)
# for some reason links containing "unlike" also show up; counting those instead gives the right number of items
clothing_listings = {listing for listing in hyper_links if listing and "unlike" in listing}
print(len(clothing_listings))  # this matches the number of clothing items returned by the search
driver.quit()
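To figure out which step is actually slow, I'm thinking of timing each phase separately, something like this rough sketch (the time.perf_counter calls are my addition, and it reuses the options object defined above):

import time

t0 = time.perf_counter()
driver = webdriver.Chrome(options=options)  # reuses the options defined above
print('browser startup took', time.perf_counter() - t0, 'seconds')

t0 = time.perf_counter()
driver.get('https://poshmark.com/search?')
print('page load took', time.perf_counter() - t0, 'seconds')

t0 = time.perf_counter()
response = requests.get(driver.current_url)
print('requests fetch took', time.perf_counter() - t0, 'seconds')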
Why is it taking so long to scrape things?
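I also wondered whether I could skip the browser entirely, since all Selenium does here is produce a search URL. A minimal sketch of that idea, assuming the search endpoint accepts a query parameter (the parameter name is my guess from the URL printed above, so it may be wrong):

from urllib.parse import urlencode
import requests

params = {'query': 'anthropology headband'}  # assumed parameter name, guessed from the printed URL
search_url = 'https://poshmark.com/search?' + urlencode(params)
response = requests.get(search_url)
print(response.status_code)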