The problem with using the POST requests as suggested is that the request needs an authorization token, which has an expiry time. You can see the POST request in Chrome or Firefox if you right click on the page -> select Inspect
-> select Network
then select an Industry,
click on the POST request and click on Cookies.
There is a cookie, password_grant_custom.client.expires,
which holds the timestamp after which the authorization will no longer work.
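You can also check this from a Selenium session. A minimal sketch (assuming the cookie is set as soon as the page loads and that its value is a Unix timestamp in milliseconds; both are assumptions worth verifying in DevTools):
from datetime import datetime
from selenium import webdriver

driver = webdriver.Chrome()
driver.get('https://www.gurufocus.com/insider/summary')
# Look for the expiry cookie seen in DevTools and print when the token stops working.
for cookie in driver.get_cookies():
    if cookie['name'] == 'password_grant_custom.client.expires':
        # Assumption: the value is a Unix timestamp in milliseconds.
        print('Authorization expires:', datetime.fromtimestamp(int(cookie['value']) / 1000))
driver.quit()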
However, you can use Selenium to scrape the data from all the pages.
First install Selenium:
`sudo pip3 install selenium` on Linux or `pip install selenium` on Windows
Then get a driver from https://sites.google.com/a/chromium.org/chromedriver/downloads,
pick the right one for your version of Chrome and extract it from the zip file.
Note that on Windows you will need to add the path to your chromedriver to
driver = webdriver.Chrome(options=options)
On Linux, copy chromedriver to /usr/local/bin/chromedriver.
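If you do need to pass the driver path explicitly (for example on Windows), a minimal sketch with Selenium 4 looks like this; older Selenium versions accept executable_path= on webdriver.Chrome() instead, and the path below is only a placeholder for wherever you extracted chromedriver.exe:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
# Placeholder path - change it to wherever you extracted the driver.
service = Service(r"C:\path\to\chromedriver.exe")
driver = webdriver.Chrome(service=service, options=options)
The full script for Chrome, which collects the tickers from every page, is: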
from selenium import webdriver
from selenium.webdriver.common.by import By
import selenium.webdriver.support.expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
import time
# Start with the driver maximised to see the drop down menus properly
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
driver = webdriver.Chrome(options=options)
driver.get('https://www.gurufocus.com/insider/summary')
# Set the page size to 100 to reduce page loads
driver.find_element(By.XPATH, "//span[contains(text(),'40 / Page')]").click()
wait = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((
        By.XPATH,
        "//div[contains(text(),'100')]"))
)
driver.find_element(By.XPATH, "//div[contains(text(),'100')]").click()
# Wait for the page to load and don't overload the server
time.sleep(2)
# select Industry
driver.find_element(By.XPATH, "//span[contains(text(),'Industry')]").click()
# Select Financial Services
element = WebDriverWait(driver, 5).until(
    EC.presence_of_element_located((
        By.XPATH,
        "//span[contains(text(),'Financial Services')]"))
)
element.click()
ticker = []
while True:
    # Wait for the page to load and don't overload the server
    time.sleep(6)
    # Parse the HTML
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    for tk in soup.find_all('td', {'class': 'table-stock-info', 'data-column': 'Ticker'}):
        ticker.append(tk.text)
    try:
        # Move to the next page
        element = WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.CLASS_NAME, 'btn-next')))
        element.click()
    except TimeoutException as ex:
        # No more pages so break
        break
driver.quit()
print(len(ticker))
print(ticker)
Output
4604
['PUB ', 'ARES ', 'EIM ', 'CZNC ', 'SSB ', 'CNA ', 'TURN ', 'FNF ', 'EGIF ', 'NWPP etc...
UPDATED
If you want to scrape all the data from all the pages and/or write it to a CSV, use pandas:
from selenium import webdriver
from selenium.webdriver.common.by import By
import selenium.webdriver.support.expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
import pandas as pd
import time
# Start with the driver maximised to see the drop down menus properly
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
driver = webdriver.Chrome(options=options)
driver.get('https://www.gurufocus.com/insider/summary')
# Set the page size to 100 to reduce page loads
driver.find_element(By.XPATH, "//span[contains(text(),'40 / Page')]").click()
wait = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((
        By.XPATH,
        "//div[contains(text(),'100')]"))
)
driver.find_element(By.XPATH, "//div[contains(text(),'100')]").click()
# Wait for the page to load and don't overload the server
time.sleep(2)
# select Industry
driver.find_element(By.XPATH, "//span[contains(text(),'Industry')]").click()
# Select Financial Services
element = WebDriverWait(driver, 5).until(
    EC.presence_of_element_located((
        By.XPATH,
        "//span[contains(text(),'Financial Services')]"))
)
element.click()
columns = [
    'Ticker', 'Links', 'Company', 'Price1', 'Insider Name', 'Insider Position',
    'Date', 'Buy/Sell', 'Insider Trading Shares', 'Shares Change', 'Price2',
    'Cost(000)', 'Final Share', 'Price Change Since Insider Trade (%)',
    'Dividend Yield %', 'PE Ratio', 'Market Cap ($M)', 'None'
]
df = pd.DataFrame(columns=columns)
while True:
    # Wait for the page to load and don't overload the server
    time.sleep(6)
    # Parse the HTML table on the current page and add it to the DataFrame
    df = pd.concat([df, pd.read_html(driver.page_source, attrs={'class': 'data-table'})[0]], ignore_index=True)
    try:
        # Move to the next page
        element = WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.CLASS_NAME, 'btn-next')))
        element.click()
    except TimeoutException as ex:
        # No more pages so break
        break
driver.quit()
# Write to csv
df.to_csv("Financial_Services.csv", encoding='utf-8', index=False)
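To sanity-check the output you can read the CSV straight back into pandas:
import pandas as pd

# Load the file written above and take a quick look at it.
df = pd.read_csv("Financial_Services.csv")
print(df.shape)
print(df.head())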
Updated in response to comments:
First download the Firefox driver, geckodriver, from https://github.com/mozilla/geckodriver/releases and extract it. Again, on Windows you will need to add the path to your geckodriver to driver = webdriver.Firefox(),
or on Linux copy geckodriver to /usr/local/bin/geckodriver.
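As with Chrome, a minimal sketch for passing the geckodriver path explicitly with Selenium 4 (older versions accept executable_path= on webdriver.Firefox(); the path below is only a placeholder):
from selenium import webdriver
from selenium.webdriver.firefox.service import Service

# Placeholder path - change it to wherever you extracted geckodriver.
service = Service(r"C:\path\to\geckodriver.exe")
driver = webdriver.Firefox(service=service)
The Firefox version of the script, which also stops after a set number of pages, is: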
from selenium import webdriver
from selenium.webdriver.common.by import By
import selenium.webdriver.support.expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
import pandas as pd
import time
# Start with the driver maximised to see the drop down menus properly
driver = webdriver.Firefox()
driver.maximize_window()
driver.get('https://www.gurufocus.com/insider/summary')
# Set the page size to 100 to reduce page loads
driver.find_element(By.XPATH, "//span[contains(text(),'40 / Page')]").click()
wait = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((
        By.XPATH,
        "//div[contains(text(),'100')]"))
)
driver.find_element(By.XPATH, "//div[contains(text(),'100')]").click()
# Wait for the page to load and don't overload the server
time.sleep(2)
# select Industry
driver.find_element(By.XPATH, "//span[contains(text(),'Industry')]").click()
# Select Financial Services
element = WebDriverWait(driver, 5).until(
    EC.presence_of_element_located((
        By.XPATH,
        "//span[contains(text(),'Financial Services')]"))
)
element.click()
columns = [
    'Ticker', 'Links', 'Company', 'Price1', 'Insider Name', 'Insider Position',
    'Date', 'Buy/Sell', 'Insider Trading Shares', 'Shares Change', 'Price2',
    'Cost(000)', 'Final Share', 'Price Change Since Insider Trade (%)',
    'Dividend Yield %', 'PE Ratio', 'Market Cap ($M)', 'None'
]
df = pd.DataFrame(columns=columns)
page_limit = 5
page = 0
while True:
    # Wait for the page to load and don't overload the server
    time.sleep(6)
    # Parse the HTML table on the current page and add it to the DataFrame
    df = pd.concat([df, pd.read_html(driver.page_source, attrs={'class': 'data-table'})[0]], ignore_index=True)
    # Stop after the page limit is reached.
    page = page + 1
    if page >= page_limit:
        break
    try:
        # Move to the next page
        element = WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.CLASS_NAME, 'btn-next')))
        element.click()
    except TimeoutException as ex:
        # No more pages so break
        break
driver.quit()
# Write to csv
df.to_csv("Financial_Services.csv", encoding='utf-8', index=False)