
I have to modify this code so the scraper keeps only the links that contain a specific keyword. In my case I'm scraping a newspaper page to find news related to the term 'Brexit'.

I've tried modifying the parse_links method so it keeps only the links (or 'a' tags) that contain 'Brexit' in them, but it doesn't seem to work.

Where should I place the condition?

import requests
from bs4 import BeautifulSoup
from queue import Queue, Empty
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urljoin, urlparse

class MultiThreadScraper:
 
    def __init__(self, base_url):
 
        self.base_url = base_url
        self.root_url = '{}://{}'.format(urlparse(self.base_url).scheme, urlparse(self.base_url).netloc)
        self.pool = ThreadPoolExecutor(max_workers=20)
        self.scraped_pages = set()
        self.to_crawl = Queue(10)
        self.to_crawl.put(self.base_url)
 
    def parse_links(self, html):
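        # Collect links on the page and queue same-site URLs for crawling.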
        soup = BeautifulSoup(html, 'html.parser')
        links = soup.find_all('a', href=True)
        for link in links:
            url = link['href']
            if url.startswith('/') or url.startswith(self.root_url):
                url = urljoin(self.root_url, url)
                if url not in self.scraped_pages:
                    self.to_crawl.put(url)
 
    def scrape_info(self, html):
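        # Placeholder: per-page data extraction would go here.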
        return
 
    def post_scrape_callback(self, res):
        result = res.result()
        if result and result.status_code == 200:
            self.parse_links(result.text)
            self.scrape_info(result.text)
 
    def scrape_page(self, url):
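        # Fetch a page, returning None on any request error.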
        try:
            res = requests.get(url, timeout=(3, 30))
            return res
        except requests.RequestException:
            return
 
    def run_scraper(self):
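        # Pull URLs from the queue and hand them to the thread pool;
        # exits once the queue has stayed empty for 60 seconds.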
        while True:
            try:
                target_url = self.to_crawl.get(timeout=60)
                if target_url not in self.scraped_pages:
                    print("Scraping URL: {}".format(target_url))
                    self.scraped_pages.add(target_url)
                    job = self.pool.submit(self.scrape_page, target_url)
                    job.add_done_callback(self.post_scrape_callback)
            except Empty:
                return
            except Exception as e:
                print(e)
                continue

if __name__ == '__main__':
    s = MultiThreadScraper("https://elpais.com/")
    s.run_scraper()

3 Answers


You need to import the re module to match on the link text. Try the code below.

import re

links = soup.find_all('a', text=re.compile("Brexit"))

This should return only the links whose text contains 'Brexit'.
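
For reference, a minimal sketch of where this filter could slot into the question's parse_links (keeping the question's href=True as well, so link['href'] is always present; the rest of the method is unchanged):

import re

def parse_links(self, html):
    soup = BeautifulSoup(html, 'html.parser')
    # Keep only anchors that have an href and whose text matches "Brexit".
    links = soup.find_all('a', href=True, text=re.compile("Brexit"))
    for link in links:
        url = link['href']
        if url.startswith('/') or url.startswith(self.root_url):
            url = urljoin(self.root_url, url)
            if url not in self.scraped_pages:
                self.to_crawl.put(url)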

  • @Kajal, I'm just curious, will this also work if the text has "brexit" in all lower case? In other words, would using `re.compile("Brexit")` be case sensitive? – chitown88 Feb 28 '19 at 13:54
  • 3
    re.compile() is case sensitive. But you can add re.IGNORECASE inside re.compile(). – KunduK Feb 28 '19 at 14:01
  • 1
    @chitown88 you can pass re.IGNORECASE to the flags param of search match or sub. https://stackoverflow.com/questions/500864/case-insensitive-regular-expression-without-re-compile – CarlosT Feb 28 '19 at 14:06
  • Oh nice, I'll have to remember that. Thanks Kajal and Carlos, that will be useful for me in the future. – chitown88 Feb 28 '19 at 14:09
  • @KajalKundu is there a method to limit the number of links I get? Or should I just make an array? Let's say I just want 20 links. – CarlosT Feb 28 '19 at 14:10
  • 1
    you can use `range(len(links))` and looping through it and check once it reach 20 jump out from loop using `break`. – KunduK Feb 28 '19 at 14:15
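
Putting the comment suggestions together, a minimal sketch of a case-insensitive match capped at 20 links (soup is the parsed page as above; the urls list is just for illustration):

import re

# re.IGNORECASE makes the pattern match "Brexit", "brexit", "BREXIT", etc.
links = soup.find_all('a', href=True, text=re.compile("brexit", re.IGNORECASE))

# Stop after 20 links by breaking out of the loop, as suggested above.
urls = []
for i in range(len(links)):
    if i == 20:
        break
    urls.append(links[i]['href'])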

You can get the text of the element using the getText() method and check whether the string actually contains "Brexit":

if "Brexit" in link.getText().split():
     url = link["href"]
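
Note that .split() breaks the text into whitespace-separated words, so this matches "Brexit" only as a standalone word; a token like "Brexit," with trailing punctuation would not match. Dropping .split() and testing `"Brexit" in link.getText()` gives a looser substring match.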

I added a check in this function. See if that does the trick for you:

def parse_links(self, html):
    soup = BeautifulSoup(html, 'html.parser')
    links = soup.find_all('a', href=True)
    for link in links:
        if 'BREXIT' in link.text.upper():  #<------ new if statement
            url = link['href']
            if url.startswith('/') or url.startswith(self.root_url):
                url = urljoin(self.root_url, url)
                if url not in self.scraped_pages:
                    self.to_crawl.put(url)
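
Since both sides of the comparison are upper-cased, this check is case-insensitive ('brexit' and 'BREXIT' both match), and because it tests link.text it also catches anchors whose text sits inside nested child tags, which the text= filter from the first answer can miss.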