0

I am doing a course which requires me to parse this using BeautifulSoup: http://python-data.dr-chuck.net/known_by_Fikret.html

The instructions are: Find the link at position 3 (the first name is 1). Follow that link. Repeat this process 4 times. The answer is the last name that you retrieve.

This is the code I have so far:

import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import re

url = input('Enter - ')
html = urllib.request.urlopen(url).read()
soup = BeautifulSoup(html, 'html.parser')

count = int(input('Enter count: '))
pos = int(input('Enter position: ')) - 1  # convert the 1-based position to a 0-based index
urllist = list()
taglist = list()

tags = soup('a')  # NOTE(review): anchors are fetched ONCE here and never refreshed inside the loop

for i in range(count):
    for tag in tags:  # appends the SAME first-page anchors on every pass
        taglist.append(tag)
    url = taglist[pos].get('href', None)  # BUG: taglist[pos] always resolves to the first page's link
    print('Retrieving: ', url)
    urllist.append(url)
print('Last URL: ', urllist[-1])

This is my output:

Retrieving:  http://python-data.dr-chuck.net/known_by_Fikret.html 
Retrieving:  http://python-data.dr-chuck.net/known_by_Montgomery.html
Retrieving:  http://python-data.dr-chuck.net/known_by_Montgomery.html
Retrieving:  http://python-data.dr-chuck.net/known_by_Montgomery.html
Retrieving:  http://python-data.dr-chuck.net/known_by_Montgomery.html
Last URL:  http://python-data.dr-chuck.net/known_by_Montgomery.html

This is the output that I am supposed to get:

Retrieving: http://python-data.dr-chuck.net/known_by_Fikret.html
Retrieving: http://python-data.dr-chuck.net/known_by_Montgomery.html
Retrieving: http://python-data.dr-chuck.net/known_by_Mhairade.html
Retrieving: http://python-data.dr-chuck.net/known_by_Butchi.html
Retrieving: http://python-data.dr-chuck.net/known_by_Anayah.html
Last URL:  http://python-data.dr-chuck.net/known_by_Anayah.html

I've been working on this for a while but I still have not been able to get the code to loop correctly. I am new to coding and I'm just looking for some help to point me in the right direction. Thanks.

century530
  • 23
  • 1
  • 2

12 Answers

1
def get_html(url):
    """Fetch *url* and return its parsed BeautifulSoup tree."""
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, 'html.parser')
    return soup

url = input('Enter - ')
count = int(input('Enter count: '))
pos = int(input('Enter position: ')) - 1  # 1-based position -> 0-based index

urllist = list()

# Fixed the stray leading spaces on the `for`, assignment and final `print`
# lines of the original paste, which made it a SyntaxError as posted.
for i in range(count):
    taglist = list()

    # Re-fetch and re-parse the CURRENT url each pass so the anchors are fresh.
    for tag in get_html(url)('a'):
        taglist.append(tag)

    url = taglist[pos].get('href', None)  # follow the link at the requested position

    print('Retrieving: ', url)
    urllist.append(url)

print('Last URL: ', urllist[-1])
1

Try this way:

import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup

url = input("Enter url:")

count = int(input('Enter count:'))
pos = int(input('Enter position:')) - 1

urllist = list()

for _ in range(count):
    # Parse the page we are currently on, announce it, then hop onward.
    soup = BeautifulSoup(urllib.request.urlopen(url), 'html.parser')
    print('Retrieveing:', url)
    # Collect every anchor's href and follow the one at the wanted index.
    hrefs = [anchor.get('href', None) for anchor in soup('a')]
    url = hrefs[pos]
    urllist.append(url)

print("Last Url:", urllist[-2])
Pingolin
  • 3,161
  • 6
  • 25
  • 40
0

You are getting the link at the same pos position multiple times. Use the i loop counter for the offset, replace:

url = taglist[pos].get('href', None)

with:

url = taglist[pos + i].get('href', None)
alecxe
  • 462,703
  • 120
  • 1,088
  • 1,195
0

The reason you do not get the proper answer is the following: You do not open the link.

After finding the right url in the first page you have to open the url you found with urllib.request.urlopen(URL).read(), and look for the new link there. You have to repeat this three times. I'd recommend a while loop for this.

this code does the trick:

url =  'http://python-data.dr-chuck.net/known_by_Fikret.html'
count = 5
pos = 2
urllist = []
taglist = []

connections = 0
while connections < count:  # connect `count` times instead of a hard-coded 5
    print('Retrieving: ', url)
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, 'html.parser')
    tags = soup('a')

    # One pass over the anchors is enough.  The original wrapped this in an
    # extra `for i in range(count)` loop, appending the same tags five times
    # per page; taglist[pos] still worked, but only by accident.
    taglist = [tag for tag in tags]

    url = taglist[pos].get('href', None)
    urllist.append(url)

    connections = connections + 1
print ("last url:", url)
0

Try this one:

import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl

# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

def parse(url, count=7, pos=17):
    """Follow the link at 0-based index *pos* on each page, *count* times.

    The defaults reproduce the original hard-coded behaviour (7 hops,
    18th link).  Prints each url as it is followed; returns None.
    """
    hops = 0
    while hops < count:
        html = urllib.request.urlopen(url, context=ctx).read()
        soup = BeautifulSoup(html, 'html.parser')
        hrefs = [tag.get('href', None) for tag in soup('a')]
        url = hrefs[pos]  # hop to the link at the requested index
        hops += 1
        print ('Retreiving:', url)

print (parse('http://py4e-data.dr-chuck.net/known_by_Lorenz.html'))

That's my output:

Retreiving: http://py4e-data.dr-chuck.net/known_by_Cadyn.html
Retreiving: http://py4e-data.dr-chuck.net/known_by_Phebe.html
Retreiving: http://py4e-data.dr-chuck.net/known_by_Cullen.html
Retreiving: http://py4e-data.dr-chuck.net/known_by_Alessandro.html
Retreiving: http://py4e-data.dr-chuck.net/known_by_Gurveer.html
Retreiving: http://py4e-data.dr-chuck.net/known_by_Anureet.html
Retreiving: http://py4e-data.dr-chuck.net/known_by_Sandie.html
None
0
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl

# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

urllist = list()
url = input('Enter - ')
count = int(input('Enter count: '))
pos = int(input('Enter position: ')) - 1  # 1-based position -> 0-based index

for i in range(count):
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, 'html.parser')
    tags = soup('a')

    # BUG FIX: rebuild taglist for EVERY page.  The original appended into a
    # single module-level taglist without clearing it, so taglist[pos] always
    # indexed into the first page's links and the same url was followed on
    # every iteration.
    taglist = list()
    for tag in tags:
        taglist.append(tag)

    print('Retrieving: ', url)
    url = taglist[pos].get('href', None)
    urllist.append(url)

print('Retrieving: ', urllist[-1])
LazyMark2
  • 25
  • 3
0
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl


# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

urllist = list()
url = input('Enter - ')
count = int(input('Enter count: '))
pos = int(input('Enter position: ')) - 1

for _ in range(count):
    # Parse the current page and jump straight to the anchor at `pos`.
    page = urllib.request.urlopen(url).read()
    anchors = BeautifulSoup(page, 'html.parser')('a')
    url = anchors[pos].get('href', None)
    print('Retrieving: ', url)
    urllist.append(url)


print('Retrieving: ', urllist[-1])

MdxSh
  • 1
  • 3
0

For me it was necessary to make a change because of an error caused by a change in the library of python 3.10+. This is the link where i found the solution.

Error "AttributeError 'collections' has no attribute 'Callable' " using Beautiful Soup

With this solution it's not necessary to create a new list containing all the urls and then select the url you need according to the "position" parameter. Imagine a page with 1 million urls: building a list of 1 million urls just to select, for example, the tenth one is absolutely unnecessary. So I create a counter (actpos) and, once I reach the defined position, I exit the loop immediately, store the new url to be opened ("newurl") in the variable "myurl", and then restart the loop using the updated "myurl" variable. This is repeated the number of times defined by the parameter "count".

import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl

#collections.Callable has been moved to collections.abc.Callable in python 3.10+.
#Added the reference back to collections before importing the problem library.
import collections
collections.Callable = collections.abc.Callable

# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

def getsoup(url):
    """Download *url* (ignoring certificate problems) and return the parsed soup."""
    return BeautifulSoup(urllib.request.urlopen(url, context=ctx).read(), 'html.parser')

myurl = input('Enter - ')
count = int(input('Enter count: '))
position = int(input('Enter position: '))

print('Retrieving: ',myurl)
for _ in range(count):
    # Walk the anchors and stop as soon as the 1-based position is reached;
    # no full list of links is ever materialised.
    for seen, newurl in enumerate(getsoup(myurl)('a'), start=1):
        if seen >= position:
            break
    myurl = newurl.get('href', None)    # follow the link we stopped on
    print('Retrieving: ',myurl)
print('Last url: ',myurl)
0
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl

# Ignore SSL certificate errors.
# FIX: the original line read ``ctx`enter code here` = ...`` — a pasted-in
# editor artifact that made the snippet a SyntaxError.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

url = input('Enter URL: ')
count = int(input("Enter count: "))
pos = int(input("Enter position: "))

for i in range(count):
    html = urllib.request.urlopen(url, context=ctx).read()
    soup = BeautifulSoup(html, 'html.parser')
    tags = soup('a')
    name = tags[pos-1].string   # the link text (the person's name)
    url = tags[pos-1]['href']   # follow the link at the 1-based position
    print("Retrieving: ", url)

print(name)
  • 1
    Your answer could be improved with additional supporting information. Please [edit] to add further details, such as citations or documentation, so that others can confirm that your answer is correct. You can find more information on how to write good answers [in the help center](/help/how-to-answer). – Community Jul 09 '23 at 17:02
0

easy and simple using a nested loop

import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl

ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

url = input('Enter - ')
# BUG FIX: input() returns str.  The original kept count/position as strings,
# so `position==end` (str vs int) was never true and `count-=1` raised
# TypeError on the first pass.
count=int(input('Enter count: '))
position=int(input('Enter position: '))

while count!=0:
    end=0
    html = urllib.request.urlopen(url, context=ctx).read()
    soup = BeautifulSoup(html, 'html.parser')
    tags = soup('a')
    for tag in tags:
        new_tag=tag.get('href',None)
        end+=1
        if position==end:  # reached the wanted 1-based position
            print('Retrieving:',new_tag)
            url=new_tag
            break
    count-=1
-1
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup

url = input("Enter url:")
count = int(input('Enter count:'))
pos = int(input('Enter position:')) - 1
urllist = list()

for _ in range(count):
    # Open and parse the page we are currently on.
    soup = BeautifulSoup(urllib.request.urlopen(url), 'html.parser')
    print('Retrieveing:', url)
    # Gather every href, then hop to the one at the requested index.
    hrefs = [anchor.get('href', None) for anchor in soup('a')]
    url = hrefs[pos]
    urllist.append(url)

print("Last Url:", urllist[-1])
-1
#assignment2
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl

count = 7
position = 18

# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

#run1

idea = ['https://py4e-data.dr-chuck.net/known_by_Lynn.html']

empty = []

for _ in range(count+1):
    url = idea[-1]                      # most recently discovered url
    print("retrieving:", url)
    html = urllib.request.urlopen(url, context=ctx).read()
    soup = BeautifulSoup(html, 'html.parser')

    # Collect every href on the page into the scratch list, keep the one at
    # the 1-based `position`, then reset the scratch list for the next hop.
    for anchor in soup('a'):
        empty.append(anchor.get('href', None))
    idea.append(empty[position-1])
    empty.clear()
Nallath
  • 2,100
  • 20
  • 37