I'm on Windows 10 with Python 3.7, and I've been reading about how to scrape without opening a Firefox browser window for every URL in the urls list. The code below throws an error, and I'm fairly sure it has to do with how PhantomJS is being invoked; I just don't know what specifically.
I've read that PhantomJS used with Selenium was a solution for this. I installed PhantomJS, added it to my PATH, and it appears to run from the command line, but I'm not completely sure how to call it correctly from the code itself.
The line driver = webdriver.PhantomJS(executable_path=r"C:\phantomjs") is the one attempting to start PhantomJS. The code worked just fine before with driver = webdriver.Firefox().
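From what I've read, executable_path may need to point at the PhantomJS executable itself rather than at its folder. A minimal sketch of what I mean, where the bin\phantomjs.exe location is just my assumption based on the default download layout (adjust to wherever the .exe actually lives):

from selenium import webdriver

# assumption: full path to the .exe itself, not just the install folder
driver = webdriver.PhantomJS(executable_path=r"C:\phantomjs\bin\phantomjs.exe")
driver.get("https://example.com")
print(driver.title)
driver.quit()

Is that the right way to point it at the executable?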
import csv

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from selenium import webdriver

# build the twelve paginated URLs (Nao=0, 90, ..., 990) instead of one huge literal list
base_url = ("https://www.guitarcenter.com/Used/Bass.gc#pageName=used-page"
            "&N=18171+1076&Nao={}&recsPerPage=90&postalCode=02494"
            "&radius=100&profileCountryCode=US&profileCurrencyCode=USD")
urls = [base_url.format(offset) for offset in range(0, 1080, 90)]
user_agent = UserAgent()
#make csv file
csv_file = open("gcscrape.csv", "w", newline="")  # newline="" added 5.17.20 to stop blank rows from being written
csv_writer = csv.writer(csv_file)
csv_writer.writerow(["bass_name","bass_price"])
for url in urls:
    web_r = requests.get(url)
    web_soup = BeautifulSoup(web_r.text, "html.parser")
    #print(web_soup.findAll("li", class_="product-container"))      # all of the grid items on the page - price, photo, details and all
    #print(len(web_soup.findAll("li", class_="product-container"))) # number of product containers found
    #driver = webdriver.Firefox()
    driver = webdriver.PhantomJS(executable_path=r"C:\phantomjs")   # <-- the line that errors
    driver.get(url)
    html = driver.execute_script("return document.documentElement.outerHTML")  # JavaScript call to get the rendered outer HTML of the page
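    # (aside: I believe driver.page_source would return the same rendered
    # HTML without the explicit JavaScript call, noting it here in case
    # execute_script is part of the problem)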
    sel_soup = BeautifulSoup(html, "html.parser")
    driver.quit()  # close this PhantomJS instance before moving to the next URL
    for content in sel_soup.findAll("li", class_="product-container"):
        #print(content)
        bass_name = content.find("div", class_="productTitle").text.strip()  # pulls the bass guitar name
        print(bass_name)
        prices_new = []
        for i in content.find("span", class_="productPrice").text.split("$"):
            prices_new.append(i.strip())
        bp = prices_new[1]  # the text after the first "$" is the price
        print(bp)
        # write the row to the csv file
        csv_writer.writerow([bass_name, bp])

csv_file.close()
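For what it's worth, I've also read that newer Selenium releases deprecated PhantomJS support entirely, so in case that's the dead end here, this is the headless Firefox approach I've been considering instead - a minimal sketch, assuming geckodriver is on my PATH (the same Firefox setup my working version used):

from selenium import webdriver
from selenium.webdriver.firefox.options import Options

options = Options()
options.add_argument("--headless")  # run Firefox without opening a visible window
driver = webdriver.Firefox(options=options)
driver.get("https://example.com")
html = driver.execute_script("return document.documentElement.outerHTML")
driver.quit()

Would that be the better route, or is there a way to make the PhantomJS call above work?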