I'm building a web scraper to automate the process of downloading tweet data using selenium and the headless chrome browser.
I've written a class which logs into Twitter, navigates to the analytics page, and downloads the CSV file — but is there any way to use the pandas.read_csv function to read the CSV from the source directly, without downloading it as an intermediate step? I'm pushing the data to a SQL database and eventually want to schedule this on AWS Lambda, so it would be good if I could eliminate the need to create new files.
The code is as follows (`twt` is the name I've given the `TwitterBrowser()` instance in the `if __name__ == "__main__":` block):
class TwitterBrowser:
    """Automate Twitter login and export of the tweet-analytics CSV via Chrome.

    Credentials are read from the ``[TWITTER]`` section of ``apikeys.ini``
    (keys ``USERNAME`` and ``PASSWORD``).
    """

    def __init__(self):
        # NOTE(review): globals are kept for backward compatibility with the
        # __main__ block (which may read `browser` directly); instance
        # attributes would be the cleaner design.
        global LOGIN, PASSWORD, browser
        chrome_options = Options()
        chrome_options.add_argument("--incognito")
        # Fix: os.path.join is portable; the original hard-coded a
        # Windows-style "\\chromedriver.exe" concatenation.
        chrome_driver = os.path.join(os.getcwd(), "chromedriver.exe")
        browser = webdriver.Chrome(chrome_options=chrome_options,
                                   executable_path=chrome_driver)
        parser = ConfigParser()
        parser.read("apikeys.ini")
        LOGIN = parser.get('TWITTER', 'USERNAME')
        PASSWORD = parser.get('TWITTER', 'PASSWORD')

    def get_url(self, url, sec):
        """Navigate to *url*, waiting up to *sec* seconds for the page to load.

        Prints 'TIMED OUT!' (and continues) if the page is not ready in time.
        """
        browser.get(url)
        try:
            # Bug fix: constructing WebDriverWait without .until() never
            # waits at all. Poll until the document reports it is loaded.
            WebDriverWait(browser, timeout=sec).until(
                lambda drv: drv.execute_script(
                    "return document.readyState") == "complete")
        except TimeoutException:
            print('TIMED OUT!')
        # browser.get() returns None, so the original `return load_page`
        # conveyed nothing; no explicit return is needed.

    def _wait_and_click(self, xpath, sec):
        """Wait up to *sec* seconds for the element at *xpath*, then click it."""
        # .until() retries find_element_by_xpath (NoSuchElementException is
        # ignored by default) and returns the element once present.
        element = WebDriverWait(browser, sec).until(
            lambda drv: drv.find_element_by_xpath(xpath))
        element.click()

    def login(self):
        """Log into twitter.com with the credentials loaded in __init__."""
        # Bug fix: call self.get_url rather than the global instance name
        # `twt`, which only exists when the script is run from __main__.
        self.get_url('https://twitter.com/login', 5)
        browser.find_element_by_xpath(
            '//*[@id="page-container"]/div/div[1]/form/fieldset/div[1]/input'
        ).send_keys(LOGIN)
        browser.find_element_by_xpath(
            '//*[@id="page-container"]/div/div[1]/form/fieldset/div[2]/input'
        ).send_keys(PASSWORD)
        # Bug fix: the bare WebDriverWait(browser, 5) here was a no-op;
        # actually wait for the submit button before clicking.
        self._wait_and_click(
            '//*[@id="page-container"]/div/div[1]/form/div[2]/button', 5)

    def tweet_analytics(self):
        """Open the analytics page and trigger the tweet-data CSV export."""
        self.get_url('https://analytics.twitter.com/user/' + LOGIN + '/tweets', 5)
        # Bug fix: each bare WebDriverWait(...) below was a no-op; wait for
        # each element to appear before clicking it.
        self._wait_and_click('/html/body/div[2]/div/div[2]/div', 5)
        self._wait_and_click('/html/body/div[5]/div[4]/ul/li[1]', 5)
        self._wait_and_click('//*[@id="export"]/button/span[2]', 10)
        # WebDriverWait cannot observe a file download; sleep to give the
        # CSV time to finish downloading before the browser is torn down.
        import time
        time.sleep(10)