-1
#Import Needed Libraries    
import requests
from bs4 import BeautifulSoup 
import pprint


# Fetch the Hacker News front page once and pre-select the two parallel
# result sets: '.titlelink' (one tag per story title) and '.subtext'
# (the metadata row under each title that holds the '.score' element).
res = requests.get('https://news.ycombinator.com/news')
soup = BeautifulSoup(res.text, 'html.parser')
links = soup.select('.titlelink')
subtext = soup.select('.subtext')


def sort_stories_by_votes(hnlist):
    """Return the story dicts ordered by their 'votes' value, highest first."""
    def votes_of(story):
        return story['votes']

    return sorted(hnlist, key=votes_of, reverse=True)


def create_custom_hn(links, subtext):
    """Build {'title', 'link', 'votes'} dicts for stories with 100+ points,
    returned sorted by votes descending.

    `links` and `subtext` are the parallel BeautifulSoup result sets for the
    '.titlelink' and '.subtext' selectors; an entry without a '.score'
    element (e.g. a job posting) is skipped.
    """
    stories = []
    for idx, link_tag in enumerate(links):
        score_tags = subtext[idx].select('.score')
        if not len(score_tags):
            continue  # no score element under this story -> skip it
        points = int(score_tags[0].getText().replace(' points', ''))
        if points > 99:  # keep only stories at 100 points or more
            stories.append({
                'title': link_tag.getText(),
                'link': link_tag.get('href', None),
                'votes': points,
            })
    return sort_stories_by_votes(stories)


pprint.pprint(create_custom_hn(links, subtext))

My question is that this only scrapes the first page, which has just 30 stories.

How would I apply my web-scraping method to subsequent pages — say the next 10 pages — while keeping the formatted code above?

Ram
  • 4,724
  • 2
  • 14
  • 22
TimMTech93
  • 9
  • 1
  • 4
  • Would I need to put this entire code in a for loop with a range from 1-20? Then using the .format method? – TimMTech93 Oct 26 '21 at 21:56
  • Have you tried putting it in a loop using the .format method with range from 1-20? I tried it and it works for me – chickity china chinese chicken Oct 27 '21 at 05:24
  • e.g. wrap your code in `for i in range(20): res = requests.get('https://news.ycombinator.com/news?p={page}'.format(page=i))` same as [How can I loop scraping data for multiple pages in a website using python and beautifulsoup4](https://stackoverflow.com/questions/31062435/how-can-i-loop-scraping-data-for-multiple-pages-in-a-website-using-python-and-be) – chickity china chinese chicken Oct 27 '21 at 06:37

1 Answer

0

The URL for each page is like this

https://news.ycombinator.com/news?p=<page_number>

Use a for-loop to scrape content from each page. See the code below.

Here is the code that prints the contents of the first two pages. You can change the range of page_no depending on your needs.

import requests
from bs4 import BeautifulSoup 
import pprint

def sort_stories_by_votes(hnlist):
    """Sort the scraped story dicts by 'votes', most-voted first."""
    def by_votes(story):
        return story['votes']

    return sorted(hnlist, key=by_votes, reverse=True)


def create_custom_hn(links, subtext, page_no=None):  # Creates a list of links and subtext
    """Collect stories with more than 99 points and return them sorted by
    votes, descending.

    Args:
        links: BeautifulSoup tags matched by '.titlelink', one per story.
        subtext: parallel tags matched by '.subtext' (the score/author row).
        page_no: unused by this function; now optional (default None) and
            kept only for backward compatibility with callers that pass
            the page number.

    Returns:
        list of {'title', 'link', 'votes'} dicts, highest votes first.
    """
    hn = []
    for idx, item in enumerate(links):  # Need to use this because not every link has a lot of votes
        title = item.getText()
        href = item.get('href', None)
        # Job ads have no '.score' element, so guard before reading it.
        vote = subtext[idx].select('.score')
        if len(vote):
            points = int(vote[0].getText().replace(' points', ''))
            if points > 99:  # Only appends stories that are over 100 points
                hn.append({'title': title, 'link': href, 'votes': points})
    return sort_stories_by_votes(hn)


# Scrape pages 1..2 of Hacker News; widen the range() stop to cover more pages.
for page_no in range(1, 3):
    print(f'Page: {page_no}')
    response = requests.get(f'https://news.ycombinator.com/news?p={page_no}')
    page_soup = BeautifulSoup(response.text, 'html.parser')
    story_links = page_soup.select('.titlelink')
    story_subtext = page_soup.select('.subtext')
    pprint.pprint(create_custom_hn(story_links, story_subtext, page_no))
Page: 1

[{'link': 'https://www.thisworddoesnotexist.com/',
  'title': 'This word does not exist',
  'votes': 904},
 {'link': 'https://www.sparkfun.com/news/3970',
  'title': 'A patent troll backs off',
  'votes': 662},
.
.

Page: 2

[{'link': 'https://www.vice.com/en/article/m7vqkv/how-fbi-gets-phone-data-att-tmobile-verizon',
  'title': "The FBI's internal guide for getting data from AT&T, T-Mobile, "
           'Verizon',
  'votes': 802},
 {'link': 'https://www.dailymail.co.uk/news/article-10063665/Government-orders-Google-track-searching-certain-names-addresses-phone-numbers.html',
  'title': 'Feds order Google to track people searching certain names or '
           'details',
  'votes': 733},
.
.
Ram
  • 4,724
  • 2
  • 14
  • 22