I have started scraping reviews from an e-commerce platform, performing sentiment analysis on them, and sharing the results on my blog so that people can understand everything about a product in just one article. I am using Python packages like selenium and bs4. Here is my code:
from selenium import webdriver
from selenium.webdriver.common.by import By
from contextlib import closing
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver import Firefox
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import time
import requests
import re
from bs4 import BeautifulSoup
def remove_non_ascii_1(text):
    """Return *text* with every non-ASCII character replaced by a single space.

    ASCII characters (code point < 128) pass through unchanged, so the
    result always has the same length as the input.
    """
    return ''.join(ch if ord(ch) < 128 else ' ' for ch in text)
with closing(Firefox()) as browser:
    site = "https://www.flipkart.com/honor-8-pro-midnight-black-128-gb/product-reviews/itmeymafrghbjcpf?page=1&pid=MOBEWXHMVYBBMZGJ"
    browser.get(site)
    # 'with' guarantees review.txt is closed (and buffers flushed) even if
    # scraping raises mid-run — the original leaked the handle on exceptions.
    with open("review.txt", "w") as file:
        for count in range(1, 100):
            # Locate the pagination button whose numeric label equals the
            # target page number.
            nav_btns = browser.find_elements_by_class_name('_33m_Yg')
            button = None
            for btn in nav_btns:
                if int(btn.text) == count:
                    button = btn
                    break
            if button is None:
                # No button for this page (end of pagination, or the site
                # served its throttling page) — stop cleanly instead of
                # crashing on ''.send_keys(...).
                break
            button.send_keys(Keys.RETURN)
            # Wait for the review titles of the new page to render.
            WebDriverWait(browser, timeout=10).until(
                EC.presence_of_all_elements_located((By.CLASS_NAME, "_2xg6Ul")))
            # Expand every truncated review so the full text is in the DOM.
            read_more_btns = browser.find_elements_by_class_name('_1EPkIx')
            for rm in read_more_btns:
                browser.execute_script("return arguments[0].scrollIntoView();", rm)
                # Nudge up past the fixed header so the click lands on the button.
                browser.execute_script("window.scrollBy(0, -150);")
                rm.click()
            soup = BeautifulSoup(browser.page_source, "lxml")
            for tag in soup.find_all("div", class_="_3DCdKt"):
                # Normalize curly quotes, then strip remaining non-ASCII.
                # (The original also called title.encode('ascii','ignore') and
                # discarded the result — a no-op, removed here.)
                title = str(tag.find("p", class_="_2xg6Ul").string) \
                    .replace(u"\u2018", "'").replace(u"\u2019", "'")
                title = remove_non_ascii_1(title)
                content = tag.find("div", class_="qwjRop").div.prettify() \
                    .replace(u"\u2018", "'").replace(u"\u2019", "'")
                content = remove_non_ascii_1(content)
                # Trim prettify()'s wrapping "<div>\n" / "\n</div>\n" markup.
                content = content[15:-7]
                # Guard: some reviews render without vote counters.
                votes = tag.find_all("span", class_="_1_BQL8")
                upvotes = int(votes[0].string) if len(votes) > 0 else 0
                downvotes = int(votes[1].string) if len(votes) > 1 else 0
                file.write("Review Title : %s\n\n" % title)
                file.write("Upvotes : " + str(upvotes) + "\n\nDownvotes : " + str(downvotes) + "\n\n")
                file.write("Review Content :\n%s\n\n\n\n" % content)
            # Polite delay between pages: Flipkart throttles rapid paging
            # (the "Someting is Wrong!!!" page after ~14 requests), which is
            # what makes the subsequent WebDriverWait time out.
            time.sleep(5)
The code works fine on platforms like Amazon, but on Flipkart, after crawling 14 pages I get an error page saying "Someting is Wrong!!!" and the crawling stops. On the command line I get this error:
C:\Users\prate\Desktop\Crawler\Git_Crawler\New>python scrape.py Traceback (most recent call last): File "scrape.py", line 37, in WebDriverWait(browser, timeout=10).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "_2xg6Ul"))) File "C:\Users\prate\AppData\Local\Programs\Python\Python36\lib\site-packages\selenium\webdriver\support\wait.py", line 80, in until raise TimeoutException(message, screen, stacktrace) selenium.common.exceptions.TimeoutException: Message:
No error message is printed either. I think that if I increase the interval between requests to the platform, it might let me crawl. What should I do?