A am learning how to use BeautifulSoup and I have run into an issue with double printing in a loop I have written.
Any insight would be greatly appreciated!
from bs4 import BeautifulSoup
import requests
import re
page = 'https://news.google.com/news/headlines?gl=US&ned=us&hl=en' #main page
#url = raw_input("Enter a website to extract the URL's from: ")
r = requests.get(page) #requests html document
data = r.text #set data = to html text
soup = BeautifulSoup(data, "html.parser") #parse data with BS
for link in soup.find_all('a'):
#if contains /news/
if ('/news/' in link.get('href')):
print(link.get('href'))
Examples:
for link in soup.find_all('a'):
#if contains cointelegraph/news/
#if ('https://cointelegraph.com/news/' in link.get('href')):
url = link.get('href') #local var store url
if '/news/' in url:
print(url)
print(count)
count += 1
if count == 5:
break
output:
https://cointelegraph.com/news/woman-in-denmark-imprisoned-for-hiring-hitman-using-bitcoin
0
https://cointelegraph.com/news/ethereum-price-hits-all-time-high-of-750-following-speed-boost
1
https://cointelegraph.com/news/ethereum-price-hits-all-time-high-of-750-following-speed-boost
2
https://cointelegraph.com/news/senior-vp-says-ebay-seriously-considering-bitcoin-integration
3
https://cointelegraph.com/news/senior-vp-says-ebay-seriously-considering-bitcoin-integration
4
For some reason my code keeps printing out the same url twice...