I'm trying to get the URLs (lnk) and the paragraphs (txt) extracted by the Python script below into a CSV with pandas.
For some reason the generated CSV contains the headers (lnk and txt) and the URLs, but not the corresponding paragraphs. The CSV currently looks like this:
lnk | txt
url 1 |
url 2 |
What I need is:
lnk | txt
url 1 | text 1
url 2 | text 2
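In other words, I'm expecting url_txt to end up as one [lnk, txt] pair per URL, so that (with placeholder values standing in for the scraped data) something like this would produce the table above:

import pandas as pd

# placeholder rows standing in for the scraped link and paragraph text
url_txt = [['url 1', 'text 1'], ['url 2', 'text 2']]
print(pd.DataFrame(url_txt, columns=['lnk', 'txt']))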
Both the URLs and the paragraphs do get printed in the cmd console, though.
Why don't the paragraphs get exported into the CSV as well, and what would be a working fix? Thanks.
(sorry for the long code, I'm new to Python)
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd

url_txt = []

# GET TEXT
def getPageText(url):
    # given a url, get page content
    data = urlopen(url).read()
    # parse as html structured document
    soup = BeautifulSoup(data, 'html.parser')
    # kill javascript content
    for s in soup(["script", "style"]):
        s.replace_with('')
    # remove the text from this class
    soup.find('p', {"class": "tiny-off"}).decompose()
    # remove the text from this div id
    soup.find('div', id='action-bar-top').decompose()
    # remove the text from this div id
    soup.find('div', id='main-content-sidebar').decompose()
    # remove the text from this class
    soup.find('div', {"class": "legal"}).decompose()
    # get the 1st paragraph (which is a link)
    for p in soup.find_all('p')[0]:
        lnk = p.get_text()
        print(lnk)
    # remove the 1st paragraph (the link) from the following combined paragraphs
    soup.find('p', id='current_url').decompose()
    # extract all paragraphs save the 1st (the link)
    for p in soup.find_all('p'):
        txt = p.get_text().replace("\r", "").replace("\n", "")
        print(txt)
    # Compiling the info
    lnktxt_data = [lnk, txt]
    # Append the info to the complete dataset
    url_txt.append(lnktxt_data)

# Get text from multiple urls
def main():
    urls = [
        'https://stackoverflow.com/questions/63400153/how-to-export-pandas-dataframe-into-csv-file',  # dummy page
        'https://stackoverflow.com/questions/52716762/how-to-join-newlines-into-a-paragraph-in-python'  # dummy page
    ]
    txt = [getPageText(url) for url in urls]
    for t in txt:
        print(t)

if __name__ == "__main__":
    main()

# FRAME DATA
# Making the dataframe
url_txt = pd.DataFrame(url_txt, columns=['lnk', 'txt'])
url_txt.head()

# CREATE A FILE
# Save as CSV File
url_txt.to_csv('url_txt.csv', index=False)
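Edit: from what I can tell, txt gets overwritten on every pass of the last loop, so by the time lnktxt_data = [lnk, txt] runs it only holds the final paragraph, which seems to be empty on my pages. Here is a minimal sketch of the fix I'm considering: join all remaining paragraphs into one string and append a single [lnk, txt] row per URL. (I've left out the site-specific decompose() calls, skipped the link paragraph with [1:] instead of decomposing current_url, and returned the row instead of using the global list; not sure this is the idiomatic way.)

from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd

def getPageText(url):
    # fetch and parse the page, stripping script/style content as before
    soup = BeautifulSoup(urlopen(url).read(), 'html.parser')
    for s in soup(["script", "style"]):
        s.replace_with('')
    # the first paragraph is the link
    lnk = soup.find_all('p')[0].get_text()
    # join every remaining paragraph into ONE string instead of
    # overwriting txt on each iteration
    txt = ' '.join(
        p.get_text().replace("\r", "").replace("\n", "")
        for p in soup.find_all('p')[1:]
    )
    return [lnk, txt]

def main():
    urls = [
        'https://stackoverflow.com/questions/63400153/how-to-export-pandas-dataframe-into-csv-file',  # dummy page
        'https://stackoverflow.com/questions/52716762/how-to-join-newlines-into-a-paragraph-in-python'  # dummy page
    ]
    # one [lnk, txt] row per url, built from the return values
    rows = [getPageText(url) for url in urls]
    df = pd.DataFrame(rows, columns=['lnk', 'txt'])
    df.to_csv('url_txt.csv', index=False)

if __name__ == "__main__":
    main()

This way each URL contributes exactly one row and the txt column actually gets filled. Is this the right approach?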