I am making a function to print a list of links so I can add them to a list of companies and job titles. However, I am having difficulties navigating tag sub-contents. I am looking to list all the 'href' in 'a' in 'div' like so:
from bs4 import BeautifulSoup
import re
import pandas as pd
import requests
page = "https://www.indeed.com/q-software-developer-l-San-Francisco-jobs.html"
headers = {'User-Agent':'Mozilla/5.0'}
def get_soup():
session = requests.Session()
pageTree = session.get(page, headers=headers)
return BeautifulSoup(pageTree.content, 'html.parser')
pageSoup = get_soup()
def print_links():
"""this function scrapes the job title links"""
jobLink = [div.a for div in pageSoup.find_all('div', class_='title')]
for div in jobLink:
print(div['href'])
I am trying to make a list but my result is simply text and does not seem to be a link like so:
/pagead/clk?mo=r&ad=-6NYlbfkN0DhVAxkc_TxySVbUOs6bxWYWOfhmDTNcVTjFFBAY1FXZ2RjSBnfHw4gS8ZdlOOq-xx2DHOyKEivyG9C4fWOSDdPgVbQFdESBaF5zEV59bYpeWJ9R8nSuJEszmv8ERYVwxWiRnVrVe6sJXmDYTevCgexdm0WsnEsGomjLSDeJsGsHFLAkovPur-rE7pCorqQMUeSz8p08N_WY8kARDzUa4tPOVSr0rQf5czrxiJ9OU0pwQBfCHLDDGoyUdvhtXy8RlOH7lu3WEU71VtjxbT1vPHPbOZ1DdjkMhhhxq_DptjQdUk_QKcge3Ao7S3VVmPrvkpK0uFlA0tm3f4AuVawEAp4cOUH6jfWSBiGH7G66-bi8UHYIQm1UIiCU48Yd_pe24hfwv5Hc4Gj9QRAAr8ZBytYGa5U8z-2hrv2GaHe8I0wWBaFn_m_J10ikxFbh6splYGOOTfKnoLyt2LcUis-kRGecfvtGd1b8hWz7-xYrYkbvs5fdUJP_hDAFGIdnZHVJUitlhjgKyYDIDMJ-QL4aPUA-QPu-KTB3EKdHqCgQUWvQud4JC2Fd8VXDKig6mQcmHhZEed-6qjx5PYoSifi5wtRDyoSpkkBx39UO3F918tybwIbYQ2TSmgCHzGm32J4Ny7zPt8MPxowRw==&p=0&fvj=1&vjs=3
Additionally, here is my attempt at making a list with the links:
def get_job_titles():
"""this function scrapes the job titles"""
jobs = []
jobTitle = pageSoup.find_all('div', class_='title')
for span in jobTitle:
link = span.find('href')
if link:
jobs.append({'title':link.text,
'href':link.attrs['href']})
else:
jobs.append({'title':span.text, 'href':None})
return jobs