This website https://findmasa.com/city/los-angeles/ contains many murals. I want to use python and extract information from the subpages that pop up when clicking the address button, such as https://findmasa.com/view/map#b1cc410b. The information I want to get includes mural id, artist, address, city, latitude, longitude, and link.
When I run the code below, it worked for the first four subpages but stopped at the fifth at this sublink https://findmasa.com/view/map#1456a64a and gave me an error message selenium.common.exceptions.InvalidSelectorException: Message: invalid selector: An invalid or illegal selector was specified (Session info: chrome=114.0.5735.199)
. Can anyone help me identify the problem and provide a solution? Thank you.
from requests_html import HTMLSession
import warnings
import csv
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
import selenium.webdriver.support.expected_conditions as EC
warnings.filterwarnings("ignore", category=DeprecationWarning) ## ignore the Deprecation warning message
s = HTMLSession()
## define a function to get mural links from different categories
def get_mural_links(page):
url = f'https://findmasa.com/city/los-angeles/{page}'
links = []
r = s.get(url)
artworks = r.html.find('ul.list-works-cards div.top p')
for item in artworks:
links.append(item.find('a', first=True).attrs['href'])
return links
## define a function to get interested info from a list of links
def parse_mural(url):
## get mural id
spl = '#'
id = url.partition(spl)[2]
## create a Chrome driver instance
driver = Chrome()
driver.get(url)
# wait for the li element to be present on the page
li_element = WebDriverWait(driver, 60).until(EC.presence_of_element_located((By.CSS_SELECTOR, f'li#{id}')))
data_lat = li_element.get_attribute('data-lat')
data_lng = li_element.get_attribute('data-lng')
city = li_element.find_elements(By.TAG_NAME, 'p')[2].text
link = url
try:
artist = li_element.find_element(By.TAG_NAME, 'a').text
except:
artist = 'No Data'
try:
address = li_element.find_elements(By.TAG_NAME, 'p')[1].text
except:
address = 'No Data'
info = {
'ID': id,
'ARTIST': artist,
'LOCATION': address,
'CITY': city,
'LATITUDE': data_lat,
'LONGITUDE': data_lng,
'LINK': link,
}
return info
## define a function to save the results to a csv file
def save_csv(results):
keys = results[0].keys()
with open('LAmural_MASA.csv', 'w', newline='') as f: ## newline='' helps remove the blank rows in b/t each mural
dict_writer = csv.DictWriter(f, keys)
dict_writer.writeheader()
dict_writer.writerows(results)
## define the main function for this file to export results
def main():
results = []
for x in range(1, 3):
urls = get_mural_links(x)
for url in range(len(urls)):
results.append(parse_mural(urls[url]))
save_csv(results)
## won't run/import to other files
if __name__ == '__main__':
main()