
I'm trying to get the URLs (lnk) and the paragraphs (txt) extracted by the Python script below into a CSV with pandas.

For some reason the generated CSV contains the headers (lnk and txt) and the URLs, but not the corresponding paragraphs. The CSV file currently looks like this:

lnk    | txt
url 1  | 
url 2  |

What I need is:

lnk    | txt
url 1  | text 1
url 2  | text 2
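
In other words, the end goal is a list of [lnk, txt] rows that pandas writes out, e.g. (a minimal sketch with made-up values):

import pandas as pd

# each row pairs a url with its combined paragraph text
rows = [['url 1', 'text 1'], ['url 2', 'text 2']]
pd.DataFrame(rows, columns=['lnk', 'txt']).to_csv('url_txt.csv', index=False)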

Both the URLs and the paragraphs do get printed in the cmd console, though.

Why don't the paragraphs get exported into the CSV as well? What would be a working fix for this problem? Thanks.

(sorry for the long code, I'm new to Python)

from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd

url_txt = []


#GET TEXT
def getPageText(url):
    # given a url, get page content
    data = urlopen(url).read()
    # parse as html structured document
    soup = BeautifulSoup(data, 'html.parser')
    # kill javascript content
    for s in soup(["script", "style"]):
        s.replace_with('')
    #remove the text from this class
    soup.find('p', {"class":"tiny-off"}).decompose()
    #remove the text from this div id
    soup.find('div', id = 'action-bar-top').decompose()
    #remove the text from this div id
    soup.find('div', id = 'main-content-sidebar').decompose()
    #remove the text from this class
    soup.find('div', {"class":"legal"}).decompose()
    #get the 1st paragraph (which is a link)
    for p in soup.find_all('p')[0]:
        lnk = p.get_text()
        print(lnk)
    #remove the 1st paragraph (the link) from the following combined paragraphs
    soup.find('p', id = 'current_url').decompose()
    #extract all paragraphs save the 1st (the link)
    for p in soup.find_all('p'):
        txt = p.get_text().replace("\r", "").replace("\n", "")
        print(txt)
    
    # Compiling the info
    lnktxt_data = [lnk, txt]

    # Append the info to the complete dataset
    url_txt.append(lnktxt_data)
    
#Get text from multiple urls    
def main():
    urls = [
        'https://stackoverflow.com/questions/63400153/how-to-export-pandas-dataframe-into-csv-file',       #dummy page
        'https://stackoverflow.com/questions/52716762/how-to-join-newlines-into-a-paragraph-in-python'     #dummy page
    ]
    txt = [getPageText(url) for url in urls]
    for t in txt:
        print(t)
    
if __name__=="__main__":
    main()
    
#FRAME DATA
# Making the dataframe
url_txt = pd.DataFrame(url_txt, columns = ['lnk', 'txt'])
 
url_txt.head()
    
#CREATE A FILE
# Save as CSV File
url_txt.to_csv('url_txt.csv',index=False)
1 Answer


I've found a simpler way that works (with room for improvement), with help from these two previous answers:

How to join newlines into a paragraph in python

How to scrape web news and combine paragraphs into each article

Please let me know below how you would improve it if you find a better way.

from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd


#GET TEXT
def getPageText(url):
    # given a url, get page content
    data = urlopen(url).read()
    # parse as html structured document
    soup = BeautifulSoup(data, 'html.parser')
    # kill javascript content
    for s in soup(["script", "style"]):
        s.replace_with('')
    # get the 2nd paragraph (which contains the link)
    for p in soup.find_all('p')[1]:
        lnk = p.get_text()
        print(lnk)
    # find the article body div and extract its text in one go
    p = soup.find("div", attrs={'class': 'article-content retro-folders'})
    x = p.get_text()
    y = x.replace("\r", "").replace("\n", "")
    print(y)
    

    # Compiling the info
    lnktxt_data = [lnk, y]

    # Append the info to the complete dataset
    url_txt.append(lnktxt_data)


url_txt = []


#Get text from multiple urls    
def main():
    urls = [
        'https://stackoverflow.com/questions/63400153/how-to-export-pandas-dataframe-into-csv-file',       #dummy page
        'https://stackoverflow.com/questions/52716762/how-to-join-newlines-into-a-paragraph-in-python'     #dummy page
    ]
    txt = [getPageText(url) for url in urls]
    for t in txt:
        print(t)
    
if __name__=="__main__":
    main()
    
#FRAME DATA
# Making the dataframe
url_txt = pd.DataFrame(url_txt, columns = ['lnk', 'txt'])
 
url_txt.head()
    
#CREATE A FILE
# Save as CSV File
url_txt.to_csv('url_txt.csv',index=False)
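
One improvement I have in mind (an untested sketch, with getPageRow as a hypothetical variant of the getPageText above): return each row from the function instead of appending to the module-level url_txt list, so main() builds the DataFrame directly:

from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd

def getPageRow(url):
    # same parsing idea as getPageText above, but returns the row
    soup = BeautifulSoup(urlopen(url).read(), 'html.parser')
    # kill javascript and css content
    for s in soup(["script", "style"]):
        s.replace_with('')
    # the 2nd paragraph holds the link; fall back to the url itself
    paragraphs = soup.find_all('p')
    lnk = paragraphs[1].get_text() if len(paragraphs) > 1 else url
    # the article body div; guard against pages that lack it
    body = soup.find("div", attrs={'class': 'article-content retro-folders'})
    txt = body.get_text().replace("\r", "").replace("\n", "") if body else ''
    return [lnk, txt]

def main():
    urls = ['https://example.com/a', 'https://example.com/b']  # placeholder urls
    rows = [getPageRow(url) for url in urls]
    df = pd.DataFrame(rows, columns=['lnk', 'txt'])
    df.to_csv('url_txt.csv', index=False)

if __name__ == "__main__":
    main()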
  • New script with delays and user-agent version: https://web.archive.org/web/20220219104129/https://pastebin.com/3GLRG4rM – Lod Feb 19 '22 at 10:42
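
For reference, a minimal sketch of what a delays-plus-user-agent fetch can look like with urllib (an assumption on my part; the archived pastebin above is the actual script):

from urllib.request import Request, urlopen
import time

def fetch(url, delay=2.0):
    # send a browser-like User-Agent header, since some sites block the urllib default
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    html = urlopen(req).read()
    # wait between requests so the target site isn't hammered
    time.sleep(delay)
    return html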