I have been trying to webscrape two websites for data but facing issues. I will be extremely glad if anyone can help in resolving the problem
1.https://online.capitalcube.com/ The website requires one to login. I came up with the following code after watching tutorials on youtube for the last 2 days.
from bs4 import BeautifulSoup
import pandas as pd
import requests
URL = 'https://online.capitalcube.com/'
LOGIN_ROUTE = '/login'
import requests
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:93.0) Gecko/20100101 Firefox/93.0',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'none',
'Sec-Fetch-User': '?1',
'TE': 'trailers',
}
s = requests.session()
login_payload = {
'email': '<intentionally removed it>',
'password': '<intentionally removed it>'
}
login_req = s.post(URL + LOGIN_ROUTE, headers = headers, data = login_payload)
print(login_req.status_code)
The error i am getting is as follows
*Traceback (most recent call last): File "/Users/rafatsiddiqui/opt/anaconda3/envs/scientificProject/lib/python3.9/site-packages/urllib3/connectionpool.py", line 699, in urlopen httplib_response = self._make_request( File "/Users/rafatsiddiqui/opt/anaconda3/envs/scientificProject/lib/python3.9/site-packages/urllib3/connectionpool.py", line 382, in _make_request self._validate_conn(conn) File "/Users/rafatsiddiqui/opt/anaconda3/envs/scientificProject/lib/python3.9/site-packages/urllib3/connectionpool.py", line 1010, in validate_conn conn.connect() File "/Users/rafatsiddiqui/opt/anaconda3/envs/scientificProject/lib/python3.9/site-packages/urllib3/connection.py", line 416, in connect self.sock = ssl_wrap_socket( File "/Users/rafatsiddiqui/opt/anaconda3/envs/scientificProject/lib/python3.9/site-packages/urllib3/util/ssl.py", line 449, in ssl_wrap_socket ssl_sock = ssl_wrap_socket_impl( File "/Users/rafatsiddiqui/opt/anaconda3/envs/scientificProject/lib/python3.9/site-packages/urllib3/util/ssl.py", line 493, in _ssl_wrap_socket_impl return ssl_context.wrap_socket(sock, server_hostname=server_hostname) File "/Users/rafatsiddiqui/opt/anaconda3/envs/scientificProject/lib/python3.9/ssl.py", line 500, in wrap_socket return self.sslsocket_class._create( File "/Users/rafatsiddiqui/opt/anaconda3/envs/scientificProject/lib/python3.9/ssl.py", line 1040, in _create self.do_handshake() File "/Users/rafatsiddiqui/opt/anaconda3/envs/scientificProject/lib/python3.9/ssl.py", line 1309, in do_handshake self._sslobj.do_handshake() ssl.SSLCertVerificationError: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1129) During handling of the above exception, another exception occurred: Traceback (most recent call last): File "/Users/rafatsiddiqui/opt/anaconda3/envs/scientificProject/lib/python3.9/site-packages/requests/adapters.py", line 439, in send resp = conn.urlopen( File "/Users/rafatsiddiqui/opt/anaconda3/envs/scientificProject/lib/python3.9/site-packages/urllib3/connectionpool.py", line 755, in urlopen retries = retries.increment( File "/Users/rafatsiddiqui/opt/anaconda3/envs/scientificProject/lib/python3.9/site-packages/urllib3/util/retry.py", line 574, in increment raise MaxRetryError(_pool, url, error or ResponseError(cause)) urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='online.capitalcube.com', port=443): Max retries exceeded with url: //login (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1129)'))) During handling of the above exception, another exception occurred: Traceback (most recent call last): File "", line 30, in File "/Users/rafatsiddiqui/opt/anaconda3/envs/scientificProject/lib/python3.9/site-packages/requests/sessions.py", line 590, in post return self.request('POST', url, data=data, json=json, **kwargs) File "/Users/rafatsiddiqui/opt/anaconda3/envs/scientificProject/lib/python3.9/site-packages/requests/sessions.py", line 542, in request resp = self.send(prep, **send_kwargs) File "/Users/rafatsiddiqui/opt/anaconda3/envs/scientificProject/lib/python3.9/site-packages/requests/sessions.py", line 655, in send r = adapter.send(request, *kwargs) File "/Users/rafatsiddiqui/opt/anaconda3/envs/scientificProject/lib/python3.9/site-packages/requests/adapters.py", line 514, in send raise SSLError(e, request=request) requests.exceptions.SSLError: HTTPSConnectionPool(host='online.capitalcube.com', port=443): Max retries exceeded with url: //login (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1129)')))
- The other website I am trying is stockedge.com I have come up with the following code
import requests
from bs4 import BeautifulSoup
import pandas as pd
import requests
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:93.0) Gecko/20100101 Firefox/93.0',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
'Upgrade-Insecure-Requests': '1',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'none',
'Sec-Fetch-User': '?1',
'Connection': 'keep-alive',
}
ticker = 'hdfc-bank/5051?'
urls = {}
urls['balancesheet consolidated'] = f"https://web.stockedge.com/share/{ticker}section=balance-sheet"
urls['balancesheet standalone'] = f"https://web.stockedge.com/share/{ticker}section=balance-sheet&statement-type=standalone"
urls['profitloss consolidated'] = f"https://web.stockedge.com/share/{ticker}section=profit-loss&statement-type=consolidated"
urls['profitloss standalone'] = f"https://web.stockedge.com/share/{ticker}section=profit-loss&statement-type=standalone"
urls['cashflow consolidated'] = f"https://web.stockedge.com/share/{ticker}section=cash-flow"
urls['cashflow standalone'] = f"https://web.stockedge.com/share/{ticker}section=cash-flow&statement-type=standalone"
urls['quarterlyresults consolidated'] = f"https://web.stockedge.com/share/{ticker}section=results"
urls['quarterlyresults standalone'] = f"https://web.stockedge.com/share/{ticker}section=results&active-statement-type=Standalone"
urls['shareholding pattern'] = f"https://web.stockedge.com/share/{ticker}section=pattern"
urls['return ratios'] = f"https://web.stockedge.com/share/{ticker}section=ratios&ratio-id=roe"
urls['efficiency ratios'] = f"https://web.stockedge.com/share/{ticker}section=ratios&ratio-id=roe&ratio-category=efficiencyratios"
urls['growth ratios'] = f"https://web.stockedge.com/share/{ticker}section=ratios&ratio-id=roe&ratio-category=growthratios"
urls['solvency ratios'] = f"https://web.stockedge.com/share/{ticker}section=ratios&ratio-id=net_sales_growth&ratio-category=solvencyratios"
urls['cashflow ratios'] = f"https://web.stockedge.com/share/{ticker}section=ratios&ratio-id=net_sales_growth&ratio-category=cashflowratios"
urls['valuation ratios'] = f"https://web.stockedge.com/share/{ticker}section=ratios&ratio-id=net_sales_growth&ratio-category=valuationratios"
xlwriter = pd.ExcelWriter(f'financial statements ({ticker}).xlsx', engine='xlsxwriter')
for key in urls.keys():
response = requests.get(urls[key], headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')
df = pd.read_html(str(soup), attrs={'class': 'background md list-md hydrated'})[0]
df.to_excel(xlwriter, sheet_name=key, index=False)
xlwriter.save()
The error I am getting is
runfile('/Users/rafatsiddiqui/Downloads/scientificProject/Company Financial Webscrape.py', wdir='/Users/rafatsiddiqui/Downloads/scientificProject') Traceback (most recent call last): File "", line 1, in File "/Applications/PyCharm.app/Contents/plugins/python/helpers/pydev/_pydev_bundle/pydev_umd.py", line 198, in runfile pydev_imports.execfile(filename, global_vars, local_vars) # execute the script File "/Applications/PyCharm.app/Contents/plugins/python/helpers/pydev/_pydev_imps/_pydev_execfile.py", line 18, in execfile exec(compile(contents+"\n", file, 'exec'), glob, loc) File "/Users/rafatsiddiqui/Downloads/scientificProject/Company Financial Webscrape.py", line 36, in xlwriter = pd.ExcelWriter(f'financial statements ({ticker}).xlsx', engine='xlsxwriter') File "/Users/rafatsiddiqui/opt/anaconda3/envs/scientificProject/lib/python3.9/site-packages/pandas/io/excel/_xlsxwriter.py", line 191, in init super().init( File "/Users/rafatsiddiqui/opt/anaconda3/envs/scientificProject/lib/python3.9/site-packages/pandas/io/excel/_base.py", line 925, in init self.handles = get_handle( File "/Users/rafatsiddiqui/opt/anaconda3/envs/scientificProject/lib/python3.9/site-packages/pandas/io/common.py", line 711, in get_handle handle = open(handle, ioargs.mode) FileNotFoundError: [Errno 2] No such file or directory: 'financial statements (hdfc-bank/5051?).xlsx'