I am using Python and Beautiful Soup to scrape the following page:
# Imports must come before their first use; otherwise the calls below
# raise NameError.
from bs4 import BeautifulSoup
import requests

# Page to scrape
url = "https://www.healthline.com/nutrition/13-anti-inflammatory-foods"

# Send an HTTP request to the URL and get the HTML content.
# The timeout stops the script from hanging forever on an unresponsive
# server, and raise_for_status() surfaces HTTP errors (4xx/5xx) instead of
# silently parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
html_content = response.text

# Parse the HTML content with BeautifulSoup
soup = BeautifulSoup(html_content, "html.parser")

# Find all anchor tags <a> and paragraph tags <p> on the page
anchor_tags = soup.find_all('a')
paragraph_tags = soup.find_all('p')
# Function to recursively extract text from a tag with nested tags
def extract_tag_text(tag):
    """Return the concatenated text of *tag* and all of its nested tags.

    Args:
        tag: An object exposing a ``contents`` sequence whose items are
            either strings or further objects with their own ``contents``.

    Returns:
        All string pieces joined in document order, with leading and
        trailing whitespace removed from the final result only.
    """

    def _collect(node):
        # Yield raw string pieces depth-first. Whitespace inside nested tags
        # is kept on purpose: stripping at every level would glue adjacent
        # words together (e.g. "Hello <b>world </b>again").
        for element in node.contents:
            if isinstance(element, str):
                yield element
            else:
                yield from _collect(element)

    # Strip once, at the outermost level.
    return "".join(_collect(tag)).strip()
# Pull the text out of every <a> tag first, then every <p> tag
anchor_texts = [extract_tag_text(tag) for tag in anchor_tags]
paragraph_texts = [extract_tag_text(tag) for tag in paragraph_tags]

# Keep only entries that contain something other than whitespace,
# anchors first and paragraphs second
extracted_text = [
    text for text in anchor_texts + paragraph_texts if text.strip()
]

# Join everything into one newline-separated string and print it
all_text = "\n".join(extracted_text)
print(all_text)
While the above code that I have written gives me the text on this page, I would like to correctly match each header with its respective text. Lists should appear as lists, and if there is no header, then just the text should be shown.