I have written code that scrapes data from a site, converts it into a pandas DataFrame, cleans it up, and then sends it to AWS S3, where the data is stored.
import requests
import pandas as pd
from datetime import datetime
from datetime import date
import json
import smart_open
def save_to_s3(s3_file_location, data_item):
    """Write *data_item* to S3 as JSON Lines (one JSON object per line).

    Args:
        s3_file_location: Full S3 URI (e.g. "s3://bucket/key") understood by
            smart_open.
        data_item: A pandas DataFrame (one line per row) or any iterable of
            JSON-serializable items (one line per item).

    Note: iterating a DataFrame directly yields its COLUMN LABELS, not its
    rows — the original code therefore wrote only column names to S3. We
    convert a DataFrame to a list of row dicts first so each announcement
    becomes one JSON object.
    """
    if isinstance(data_item, pd.DataFrame):
        records = data_item.to_dict(orient="records")
    else:
        records = data_item
    with smart_open.open(s3_file_location, "w") as out_file:
        for record in records:
            out_file.write(json.dumps(record))
            out_file.write("\n")
    print(f"Data saved to {s3_file_location}")
def pulling_corporate_announcements(event=None, context=None):
    """AWS Lambda handler: fetch today's NSE corporate announcements,
    clean them with pandas, and upload the result to S3 as JSON Lines.

    Args:
        event, context: Standard Lambda invocation arguments; unused.
    """
    print("Started Pulling")
    currentd = date.today()

    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'
    }
    session = requests.Session()
    # Hit the NSE homepage first: the site sets session cookies that the
    # JSON API requires. (The original made each request twice and threw
    # the first response away — one request per URL is enough.)
    session.get('https://www.nseindia.com/', headers=headers)

    today = datetime.now().strftime('%d-%m-%Y')
    api_url = (
        'https://www.nseindia.com/api/corporate-announcements'
        f'?index=equities&from_date={today}&to_date={today}'
    )
    resp = session.get(api_url, headers=headers).json()

    result = pd.DataFrame(resp)
    # Drop internal/bookkeeping columns we never use downstream.
    result.drop(
        ['difference', 'dt', 'exchdisstime', 'csvName', 'old_new', 'orgid',
         'seq_id', 'sm_isin', 'bflag', 'symbol', 'sort_date'],
        axis=1, inplace=True,
    )
    result.rename(
        columns={'an_dt': 'DateandTime', 'attchmntFile': 'Source',
                 'attchmntText': 'Topic', 'desc': 'Type',
                 'smIndustry': 'Sector', 'sm_name': 'Company Name'},
        inplace=True,
    )
    result[['Date', 'Time']] = result.DateandTime.str.split(expand=True)

    # Filter out announcement categories that are noise for our use case.
    result['Type'] = result['Type'].astype(str)
    result['Type'].replace(
        "Certificate under SEBI (Depositories and Participants) Regulations, 2018",
        'Junk', inplace=True,
    )
    noise = result['Type'].str.contains(
        "Loss of Share Certificates|Copy of Newspaper Publication|Junk|Trading Window"
    )
    result = result[~noise]
    result.drop_duplicates(subset='Source', keep='first', inplace=True)

    # Reformat the timestamp into separate display columns.
    result['Temporary'] = pd.to_datetime(result['Date'] + ' ' + result['Time'])
    result['Date'] = result['Temporary'].dt.strftime('%b %d, %Y')
    result['Time'] = result['Temporary'].dt.strftime('%R %p')
    # (Original assigned DateTime twice; only the formatted string survives.)
    result['DateTime'] = result['Temporary'].dt.strftime('%m/%d/%Y %I:%M %p')
    result.drop(['DateandTime', 'Temporary'], axis=1, inplace=True)

    # NOTE(review): filename has day-month but no year, and a .csv suffix
    # although the payload is JSON Lines — kept as-is for compatibility.
    file_name = str(currentd.day) + '-' + str(currentd.month) + '-' + 'CA.csv'
    s3_location = "s3://corpanc/" + file_name
    save_to_s3(s3_location, result)
    print('Saved the CSV File')
This code works perfectly on my local Windows 10 machine, but when I uploaded it to AWS Lambda it gave me the error below. I have tried every way of installing numpy, making a ZIP file, and uploading it, but it is still not working. I have also tried adding a numpy layer, yet the error persists. Error message -
[ERROR] Runtime.ImportModuleError: Unable to import module 'coranc': Unable to import required dependencies:
numpy:
IMPORTANT: PLEASE READ THIS FOR ADVICE ON HOW TO SOLVE THIS ISSUE!
Importing the numpy C-extensions failed. This error can happen for
many reasons, often due to issues with your setup or how NumPy was
installed.
We have compiled some common reasons and troubleshooting tips at:
https://numpy.org/devdocs/user/troubleshooting-importerror.html
Please note and check the following:
* The Python version is: Python3.9 from "/var/lang/bin/python3.9"
* The NumPy version is: "1.22.3"
and make sure that they are the versions you expect.
Please carefully study the documentation linked above for further help.
Original error was: No module named 'numpy.core._multiarray_umath
'