1
import pandas as pd
import glob
import csv
import re
from bs4 import BeautifulSoup
links_with_text = []

# Collect every link that has visible anchor text from all .html files in the
# current directory and write them, one URL per line, into a_file.txt.
# A context manager guarantees the output file is flushed and closed
# (the original opened it with "w" and never closed it).
with open("a_file.txt", "w") as textfile:
    for filename in glob.iglob('*.html'):
        with open(filename) as f:
            # Explicit parser avoids bs4's "no parser specified" warning and
            # keeps results consistent across machines.
            soup = BeautifulSoup(f, 'html.parser')

        # Keep only anchors that carry both an href and non-empty text.
        links_with_text = [a['href'] for a in soup.find_all('a', href=True) if a.text]

        print(links_with_text)

        for element in links_with_text:
            textfile.write(element + "\n")
    

Sample output:

file name:

  • link1
  • link2
  • link3

file name2:

  • link1
  • link2
  • link3

file name3:

  • link1
  • link2
  • link3

I found a post somewhat related to mine, but there the output is printed to multiple text files. Here I would like to have all the file names, each followed by its links, in one text file.

BeautifulSoup on multiple .html files

Please suggest. Thank you in advance

Revanth
  • 51
  • 1
  • 1
  • 10

3 Answers

1

I made a similar thing but with img maybe it will help you:

# Scrape every .jpg image referenced by a page: record the src URLs in
# cache.txt, then download each one to a local file.
link = input('Url is: ')
html = urlopen(link)
bs = BeautifulSoup(html, 'html.parser')
images = bs.find_all('img', {'src': re.compile('.jpg')})

# Close (and therefore flush) the cache file before reading it back.
# The original left it open, so buffered writes could be invisible to
# the read loop below.
with open("cache.txt", "w+") as f:
    for image in images:
        url = ('https:' + image['src'] + '\n')
        f.write(url)

with open('cache.txt') as f:
    for line in f:
        url = line
        # Name the local file after the last path component of the URL.
        path = 'image' + url.split('/', -1)[-1]
        urllib.request.urlretrieve(url, path.rstrip('\n'))
1

try this

# Append each .html file's name followed by its links into one output file.
with open("a_file.txt", "a") as textfile:  # "a" to append string
    for filename in glob.iglob('*.html'):
        with open(filename) as f:
            # Explicit parser avoids bs4's "no parser specified" warning.
            soup = BeautifulSoup(f, 'html.parser')
            # Only anchors that have both an href and non-empty text.
            links_with_text = [a['href'] for a in soup.find_all('a', href=True) if a.text]
            links_with_text = "\n".join(links_with_text)
            # The original post writes the source file's name above its links;
            # "(unknown)" in the scraped text was a broken {filename} substitution.
            textfile.write(f"{filename}\n{links_with_text}\n")
uingtea
  • 6,002
  • 2
  • 26
  • 40
1

To have the filename at the top of each block, just add another .write() line as follows:

from bs4 import BeautifulSoup
import glob
import csv

links_with_text = []

# Write every .html file's name, followed by its links (one per indented
# line), into a single output file — one block per source file.
with open("a_file.txt", "w") as textfile:
    for filename in glob.iglob('*.html'):
        # The file name heads each block so links are grouped by source file;
        # "(unknown)" in the scraped text was a broken {filename} substitution.
        textfile.write(f"{filename}:\n")

        with open(filename) as f:
            # Explicit parser avoids bs4's "no parser specified" warning.
            soup = BeautifulSoup(f, 'html.parser')
            # Only anchors that have both an href and non-empty text.
            links_with_text = [a['href'] for a in soup.find_all('a', href=True) if a.text]

            for element in links_with_text:
                textfile.write(f"  {element}\n")
Martin Evans
  • 45,791
  • 17
  • 81
  • 97