I want to scrape prices from a webpage. I first wrote the code block by block before merging everything into one script, and it worked fine that way, especially the price part that uses .text.strip():
!pip install selenium

from selenium import webdriver
import time
from bs4 import BeautifulSoup

driver = webdriver.Chrome(r'D:\chromedriver.exe')
url = "https://www.fashionvalet.com/catalogsearch/result/?q=duck"
driver.get(url)
driver.maximize_window()
time.sleep(3)

btn = driver.find_element_by_xpath('/html/body/main/div/header/div[5]/div[1]/div[1]/div')
btn.click()
time.sleep(5)

soup = BeautifulSoup(driver.page_source, "html.parser")

p_price = card.select_one('.fvPLPProductPrice > strong').text.strip()
#"strong").select_one("strong").text.strip()
print(p_price)
MYR50.00
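In that block-by-block test, card was already a single product tile from an earlier cell, roughly like this (a minimal sketch, assuming the same .fvPLPProducts > li selector I use in the merged script below):

# Assumed earlier cell: pick one product tile so the price lookup has a card to work on
card = soup.select_one('.fvPLPProducts > li')

p_price = card.select_one('.fvPLPProductPrice > strong').text.strip()
print(p_price)   # e.g. MYR50.00, as shown above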
Unfortunately, when I merge all the code, the error comes from the .text.strip() on the price part:
!pip install selenium

from selenium import webdriver
import time
import pandas as pd
from bs4 import BeautifulSoup

def get_url(product_name):
    product_name = product_name.replace(' ', '+')
    url_template = "https://www.fashionvalet.com/catalogsearch/result/?q={}"
    url = url_template.format(product_name)
    return url

def product_info(card):
    # name
    p_name = card.find('h3').text.strip()

    # price
    #p_rice = card.find("p", "fvPLPProductPrice").select("strong")
    p_price = card.select_one('.fvPLPProductPrice > strong').text.strip()

    # image
    p_image = card.find('img')
    p_img = p_image['src']

    # brand
    p_brand = card.find('p', "fvPLPProductBrand").text.strip()

    # discount percent
    p_dis = card.find('p', "fvPLPProductMeta").text.strip()

    info = (p_name, p_price, p_img, p_brand, p_dis)
    return info

def main(product):
    records = []
    url = get_url(product)                              # 1 -- generate the URL
    driver = webdriver.Chrome(r'D:\chromedriver.exe')   # 2 -- open the browser
    driver.get(url)                                     # 3 -- open the URL
    driver.maximize_window()
    time.sleep(5)

    # BUTTON
    btn = driver.find_element_by_xpath('/html/body/main/div/header/div[5]/div[1]/div[1]/div')
    btn.click()
    time.sleep(5)

    # AUTO-SCROLLING
    # -- keep scrolling until the scroll position stops changing, so the whole page is loaded before parsing
    temp_height = 0
    while True:
        driver.execute_script("window.scrollBy(0,1000)")
        time.sleep(10)
        check_height = driver.execute_script("return document.documentElement.scrollTop || window.pageYOffset || document.body.scrollTop;")
        if check_height == temp_height:
            break
        temp_height = check_height
    time.sleep(5)
    # AUTO-SCROLL end

    soup = BeautifulSoup(driver.page_source, "html.parser")
    product_card = soup.select('.fvPLPProducts > li')

    for allproduct in product_card:
        productDetails = product_info(allproduct)
        records.append(productDetails)

    col = ['Name', 'Price', 'Image', 'Brand', 'Discount']
    all_data = pd.DataFrame(records, columns=col)
    all_data.to_csv('D:\\FASHION-{}.csv'.format(product))
This is the output after I ran main("duck"); the error comes out like this:
AttributeError Traceback (most recent call last)
<ipython-input-7-7b75c58eb0da> in <module>
----> 1 main("duck")
<ipython-input-6-7d068e5049f6> in main(product)
70
71 for allproduct in product_card:
---> 72 productDetails = product_info(allproduct)
73 records.append(productDetails)
74
<ipython-input-6-7d068e5049f6> in product_info(card)
20
21 #p_rice = card.find("p", "fvPLPProductPrice").select("strong")
---> 22 p_price = card.select_one('.fvPLPProductPrice > strong').text.strip()
23
24 # image
AttributeError: 'NoneType' object has no attribute 'text'
I have tried removing the .text.strip(); that runs fine, but then the output includes the HTML tag itself, which is not what I want.
In conclusion, .text.strip() works when the code is run in separate blocks, but it raises this error once I merge everything.
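My guess is that a few of the .fvPLPProducts > li items in the merged run simply do not contain a .fvPLPProductPrice > strong element, so select_one() returns None for them. The None-safe sketch below is what I am considering as a workaround (the empty-string fallback is my own assumption, not something the site provides):

# Sketch of a None-safe price lookup inside product_info(card):
# only call .text.strip() when the element actually exists
price_tag = card.select_one('.fvPLPProductPrice > strong')
p_price = price_tag.text.strip() if price_tag is not None else ''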
Can anyone help me? Thank you.