0

I am doing a course which requires me to parse this using BeautifulSoup: http://python-data.dr-chuck.net/known_by_Fikret.html

The instructions are: Find the link at position 3 (the first name is 1). Follow that link. Repeat this process 4 times. The answer is the last name that you retrieve.

This is the code I have so far:

import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import re

url = input('Enter - ')
html = urllib.request.urlopen(url).read()
soup = BeautifulSoup(html, 'html.parser')

count = int(input('Enter count: '))
pos = int(input('Enter position: ')) - 1  # convert the 1-based position to a 0-based index
urllist = list()
taglist = list()

tags = soup('a')  # NOTE(review): anchors are fetched ONCE here and never refreshed inside the loop

for i in range(count):
    for tag in tags:  # appends the SAME first-page anchors on every pass
        taglist.append(tag)
    url = taglist[pos].get('href', None)  # BUG: taglist[pos] always resolves to the first page's link
    print('Retrieving: ', url)
    urllist.append(url)
print('Last URL: ', urllist[-1])

This is my output:

Retrieving:  http://python-data.dr-chuck.net/known_by_Fikret.html 
Retrieving:  http://python-data.dr-chuck.net/known_by_Montgomery.html
Retrieving:  http://python-data.dr-chuck.net/known_by_Montgomery.html
Retrieving:  http://python-data.dr-chuck.net/known_by_Montgomery.html
Retrieving:  http://python-data.dr-chuck.net/known_by_Montgomery.html
Last URL:  http://python-data.dr-chuck.net/known_by_Montgomery.html

This is the output that I am supposed to get:

Retrieving: http://python-data.dr-chuck.net/known_by_Fikret.html
Retrieving: http://python-data.dr-chuck.net/known_by_Montgomery.html
Retrieving: http://python-data.dr-chuck.net/known_by_Mhairade.html
Retrieving: http://python-data.dr-chuck.net/known_by_Butchi.html
Retrieving: http://python-data.dr-chuck.net/known_by_Anayah.html
Last URL:  http://python-data.dr-chuck.net/known_by_Anayah.html

I've been working on this for a while but I still have not been able to get the code to loop correctly. I am new to coding and I'm just looking for some help to point me in the right direction. Thanks.

century530
  • 23
  • 1
  • 2

12 Answers

1
def get_html(url):
    """Fetch *url* and return its parsed BeautifulSoup tree."""
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, 'html.parser')
    return soup

url = input('Enter - ')
count = int(input('Enter count: '))
pos = int(input('Enter position: ')) - 1  # 1-based position -> 0-based index

urllist = list()

# Fixed the stray leading spaces on the `for`, assignment and final `print`
# lines of the original paste, which made it a SyntaxError as posted.
for i in range(count):
    taglist = list()

    # Re-fetch and re-parse the CURRENT url each pass so the anchors are fresh.
    for tag in get_html(url)('a'):
        taglist.append(tag)

    url = taglist[pos].get('href', None)  # follow the link at the requested position

    print('Retrieving: ', url)
    urllist.append(url)

print('Last URL: ', urllist[-1])
1

Try this way:

import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup

url = input("Enter url:")

count = int(input('Enter count:'))
pos = int(input('Enter position:')) - 1

urllist = list()

for _ in range(count):
    # Parse the page we are currently on, announce it, then hop onward.
    soup = BeautifulSoup(urllib.request.urlopen(url), 'html.parser')
    print('Retrieveing:', url)
    # Collect every anchor's href and follow the one at the wanted index.
    hrefs = [anchor.get('href', None) for anchor in soup('a')]
    url = hrefs[pos]
    urllist.append(url)

print("Last Url:", urllist[-2])
Pingolin
  • 3,161
  • 6
  • 25
  • 40
0

You are getting the link at the same pos position multiple times. Use the i loop counter for the offset, replace:

url = taglist[pos].get('href', None)

with:

url = taglist[pos + i].get('href', None)
alecxe
  • 462,703
  • 120
  • 1,088
  • 1,195
0

The reason you do not get the proper answer is the following: You do not open the link.

After finding the right url in the first page you have to open the url you found with urllib.request.urlopen(URL).read(), and look for the new link there. You have to repeat this three times. I'd recommend a while loop for this.

this code does the trick:

url =  'http://python-data.dr-chuck.net/known_by_Fikret.html'
count = 5
pos = 2
urllist = []
taglist = []

connections = 0
while connections < count:  # connect `count` times instead of a hard-coded 5
    print('Retrieving: ', url)
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, 'html.parser')
    tags = soup('a')

    # One pass over the anchors is enough.  The original wrapped this in an
    # extra `for i in range(count)` loop, appending the same tags five times
    # per page; taglist[pos] still worked, but only by accident.
    taglist = [tag for tag in tags]

    url = taglist[pos].get('href', None)
    urllist.append(url)

    connections = connections + 1
print ("last url:", url)
0

Try this one:

import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl

# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

def parse(url, count=7, pos=17):
    """Follow the link at 0-based index *pos* on each page, *count* times.

    The defaults reproduce the original hard-coded behaviour (7 hops,
    18th link).  Prints each url as it is followed; returns None.
    """
    hops = 0
    while hops < count:
        html = urllib.request.urlopen(url, context=ctx).read()
        soup = BeautifulSoup(html, 'html.parser')
        hrefs = [tag.get('href', None) for tag in soup('a')]
        url = hrefs[pos]  # hop to the link at the requested index
        hops += 1
        print ('Retreiving:', url)

print (parse('http://py4e-data.dr-chuck.net/known_by_Lorenz.html'))

That's my output:

Retreiving: http://py4e-data.dr-chuck.net/known_by_Cadyn.html
Retreiving: http://py4e-data.dr-chuck.net/known_by_Phebe.html
Retreiving: http://py4e-data.dr-chuck.net/known_by_Cullen.html
Retreiving: http://py4e-data.dr-chuck.net/known_by_Alessandro.html
Retreiving: http://py4e-data.dr-chuck.net/known_by_Gurveer.html
Retreiving: http://py4e-data.dr-chuck.net/known_by_Anureet.html
Retreiving: http://py4e-data.dr-chuck.net/known_by_Sandie.html
None
0
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl

# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

urllist = list()
url = input('Enter - ')
count = int(input('Enter count: '))
pos = int(input('Enter position: ')) - 1  # 1-based position -> 0-based index

for i in range(count):
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, 'html.parser')
    tags = soup('a')

    # BUG FIX: rebuild taglist for EVERY page.  The original appended into a
    # single module-level taglist without clearing it, so taglist[pos] always
    # indexed into the first page's links and the same url was followed on
    # every iteration.
    taglist = list()
    for tag in tags:
        taglist.append(tag)

    print('Retrieving: ', url)
    url = taglist[pos].get('href', None)
    urllist.append(url)

print('Retrieving: ', urllist[-1])
LazyMark2
  • 25
  • 3
0
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl


# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

urllist = list()
url = input('Enter - ')
count = int(input('Enter count: '))
pos = int(input('Enter position: ')) - 1

for _ in range(count):
    # Parse the current page and jump straight to the anchor at `pos`.
    page = urllib.request.urlopen(url).read()
    anchors = BeautifulSoup(page, 'html.parser')('a')
    url = anchors[pos].get('href', None)
    print('Retrieving: ', url)
    urllist.append(url)


print('Retrieving: ', urllist[-1])

MdxSh
  • 1
  • 3
0

For me it was necessary to make a change because of an error caused by a change in the library of python 3.10+. This is the link where i found the solution.

Error "AttributeError 'collections' has no attribute 'Callable' " using Beautiful Soup

With this solution it's not necessary to create a new list containing all the urls and then select the url you need according to the "position" parameter. Imagine a page with 1 million urls: building a list of 1 million urls just to select, for example, the tenth one is absolutely unnecessary. So I create a counter (actpos) and, once I reach the defined position, I exit the loop immediately, store the new url to be opened ("newurl") in the variable "myurl", and then restart the loop using the updated "myurl" variable. This is repeated the number of times defined by the parameter "count".

import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl

#collections.Callable has been moved to collections.abc.Callable in python 3.10+.
#Added the reference back to collections before importing the problem library.
import collections
collections.Callable = collections.abc.Callable

# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

def getsoup(url):
    """Download *url* (ignoring certificate problems) and return the parsed soup."""
    return BeautifulSoup(urllib.request.urlopen(url, context=ctx).read(), 'html.parser')

myurl = input('Enter - ')
count = int(input('Enter count: '))
position = int(input('Enter position: '))

print('Retrieving: ',myurl)
for _ in range(count):
    # Walk the anchors and stop as soon as the 1-based position is reached;
    # no full list of links is ever materialised.
    for seen, newurl in enumerate(getsoup(myurl)('a'), start=1):
        if seen >= position:
            break
    myurl = newurl.get('href', None)    # follow the link we stopped on
    print('Retrieving: ',myurl)
print('Last url: ',myurl)
0
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl

# Ignore SSL certificate errors.
# FIX: the original line read ``ctx`enter code here` = ...`` — a pasted-in
# editor artifact that made the snippet a SyntaxError.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

url = input('Enter URL: ')
count = int(input("Enter count: "))
pos = int(input("Enter position: "))

for i in range(count):
    html = urllib.request.urlopen(url, context=ctx).read()
    soup = BeautifulSoup(html, 'html.parser')
    tags = soup('a')
    name = tags[pos-1].string   # the link text (the person's name)
    url = tags[pos-1]['href']   # follow the link at the 1-based position
    print("Retrieving: ", url)

print(name)
  • 1
    Your answer could be improved with additional supporting information. Please [edit] to add further details, such as citations or documentation, so that others can confirm that your answer is correct. You can find more information on how to write good answers [in the help center](/help/how-to-answer). – Community Jul 09 '23 at 17:02
0

easy and simple using a nested loop

import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl

ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

url = input('Enter - ')
# BUG FIX: input() returns str.  The original kept count/position as strings,
# so `position==end` (str vs int) was never true and `count-=1` raised
# TypeError on the first pass.
count=int(input('Enter count: '))
position=int(input('Enter position: '))

while count!=0:
    end=0
    html = urllib.request.urlopen(url, context=ctx).read()
    soup = BeautifulSoup(html, 'html.parser')
    tags = soup('a')
    for tag in tags:
        new_tag=tag.get('href',None)
        end+=1
        if position==end:  # reached the wanted 1-based position
            print('Retrieving:',new_tag)
            url=new_tag
            break
    count-=1
-1
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup

url = input("Enter url:")
count = int(input('Enter count:'))
pos = int(input('Enter position:')) - 1
urllist = list()

for _ in range(count):
    # Open and parse the page we are currently on.
    soup = BeautifulSoup(urllib.request.urlopen(url), 'html.parser')
    print('Retrieveing:', url)
    # Gather every href, then hop to the one at the requested index.
    hrefs = [anchor.get('href', None) for anchor in soup('a')]
    url = hrefs[pos]
    urllist.append(url)

print("Last Url:", urllist[-1])
-1
#assignment2
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl

count = 7
position = 18

# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

#run1

idea = ['https://py4e-data.dr-chuck.net/known_by_Lynn.html']

empty = []

for _ in range(count+1):
    url = idea[-1]                      # most recently discovered url
    print("retrieving:", url)
    html = urllib.request.urlopen(url, context=ctx).read()
    soup = BeautifulSoup(html, 'html.parser')

    # Collect every href on the page into the scratch list, keep the one at
    # the 1-based `position`, then reset the scratch list for the next hop.
    for anchor in soup('a'):
        empty.append(anchor.get('href', None))
    idea.append(empty[position-1])
    empty.clear()
Nallath
  • 2,100
  • 20
  • 37