I want to scrape news headlines from this page: https://www.forexfactory.com/news
while scrolling down and clicking on more button.
I tried requests and bs4 but didn't return data:
import requests
from bs4 import BeautifulSoup
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:76.0) Gecko/20100101 Firefox/76.0'}
u = 'https://www.forexfactory.com/news'
session = requests.Session()
r = session.get(u, timeout=30, headers=headers) # print(r.status_code)
soup = BeautifulSoup(r.content, 'html.parser')
soup.select('.flexposts__item.flexposts__story') # return []
print(r.status_code) #return 503
I checked Network button on the console and find other urls which return raw response data:
I tried using requests but same response: 503
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:76.0) Gecko/20100101 Firefox/76.0'}
u = 'https://www.forexfactory.com/flex.php?more=2'
session = requests.Session()
r = session.get(u, timeout=30, headers=headers)
soup = BeautifulSoup(r.content, 'html.parser')
print(r.status_code) #return 503
print(r.text) #return html but without the headlines content
soup.select('.flexposts__item.flexposts__story') # return []
I also tried selenium but same, don't return headlines elements
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(executable_path=r"C:/chromedriver.exe", options=options)
u = 'https://www.forexfactory.com/news'
driver.get(u)
driver.implicitly_wait(60)
driver.find_elements(By.CSS_SELECTOR, '.flexposts__item.flexposts__story') # return []
soup = BeautifulSoup(driver.page_source, 'html.parser')
soup.select('.flexposts__item.flexposts__story') # return []
driver.quit()