I'm trying to scrape https://arxiv.org/search/?query=healthcare&searchtype=all with Selenium and Python. The for loop takes too long to execute. I tried scraping with headless browsers and PhantomJS, but they don't scrape the abstract field (I need the abstract field expanded, i.e. with the "More" button clicked).
"""Scrape arXiv search results for "healthcare" with Selenium.

For every result the script collects title, arXiv id, abstract/PDF links,
authors, the fully expanded abstract, submission date, subject tags and
DOI, then assembles one row per article into a pandas DataFrame.
"""
import re
import time

import pandas as pd
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import Firefox
from selenium.webdriver.common.by import By

browser = Firefox()
url_healthcare = 'https://arxiv.org/search/?query=healthcare&searchtype=all'
browser.get(url_healthcare)

dfs = []  # one [title, id, link, pdf, authors, abstract, date, tag, doi] per article
try:
    for i in range(1, 39):  # arXiv currently shows 38 result pages for this query
        # Expand every truncated abstract ONCE per page.  The old code searched
        # the whole document again for each article, which always clicked the
        # *first* unexpanded "More" link and re-scanned the DOM N times per
        # page -- that repeated lookup was the main source of slowness.
        for more_link in browser.find_elements(By.LINK_TEXT, '▽ More'):
            try:
                more_link.click()
            except Exception:
                # Already expanded or momentarily not clickable; the abstract
                # text is still readable either way, so best-effort is fine.
                pass

        for article in browser.find_elements(By.CSS_SELECTOR, 'li.arxiv-result'):
            title = article.find_element(By.CSS_SELECTOR, 'p.title.is-5.mathjax').text
            # First <a> is the abstract page (carries the "arXiv:NNNN" text),
            # second <a> is the PDF link; fetch the list once instead of twice.
            links = article.find_elements(By.TAG_NAME, 'a')
            arxiv_id = links[0].text.replace('arXiv:', '')
            arxiv_link = links[0].get_attribute('href')
            pdf_link = links[1].get_attribute('href')
            authors = article.find_element(By.CSS_SELECTOR, 'p.authors').text.replace('Authors:', '')
            abstract = article.find_element(By.CSS_SELECTOR, 'p.abstract.mathjax').text
            date = article.find_element(By.CSS_SELECTOR, 'p.is-size-7').text
            # Keep only the text between "Submitted" and the first ";".
            date = re.split(r"Submitted|;", date)[1]
            tag = article.find_element(By.CSS_SELECTOR, 'div.tags.is-inline-block').text.replace('\n', ',')
            try:
                doi = article.find_element(By.CSS_SELECTOR, 'div.tags.has-addons').text
                doi = re.split(r'\s', doi)[1]
            except NoSuchElementException:
                doi = 'None'  # not every article carries a DOI badge
            dfs.append([title, arxiv_id, arxiv_link, pdf_link, authors,
                        abstract, date, tag, doi])

        print('Finished Extracting Page:', i)

        try:
            browser.find_element(By.CLASS_NAME, 'pagination-next').click()
        except NoSuchElementException:
            break  # last page reached -- stop instead of looping uselessly
        time.sleep(0.1)  # give the next page a moment to render
finally:
    # Original had `browser.close` (attribute access, never called -- the
    # browser leaked).  quit() also shuts down the geckodriver process.
    browser.quit()

# pandas was imported but never used; materialise the collected rows.
df = pd.DataFrame(dfs, columns=['title', 'arxiv_id', 'arxiv_link', 'pdf_link',
                                'authors', 'abstract', 'date', 'tag', 'doi'])