You could do the whole thing with BeautifulSoup and requests. The text-extraction code is by @nmgeek; the same question there has other methods to choose from. I am guessing you can then handle the text with nltk. The method is nice because you can play with which selectors you add to the list; you can achieve something similar with a selector list passed to select,
i.e. [item.text for item in soup.select('selector list goes here')]
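For instance, using classes that actually appear on this site ('.post-title' turns up later in this answer; '.post-content p' is my guess at a paragraph selector, so adjust to whatever the page really uses):

import requests
from bs4 import BeautifulSoup as bs

r = requests.get('https://teonite.com/blog/')
soup = bs(r.content, 'lxml')
# grab titles plus body paragraphs in one pass via a comma-separated selector list
texts = [item.text for item in soup.select('.post-title, .post-content p')]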
Edit: The code below gets you all the links, but it seems the website blocks you after a while. Have a look at rotating IPs and/or User-Agents in the loop over all_links; a rough sketch of the User-Agent part follows the first script below.
If you have to resort to selenium, at least you have the list of all article links, which you can loop over and .get with selenium.
import requests
from bs4 import BeautifulSoup as bs

url = 'https://teonite.com/blog/page/{}/index.html'
all_links = []

headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'User-Agent': 'Mozilla/5.0'
}

with requests.Session() as s:
    # collect the article links from page 1, then from the remaining pages
    r = s.get('https://teonite.com/blog/')
    soup = bs(r.content, 'lxml')
    article_links = ['https://teonite.com' + item['href'][2:] for item in soup.select('.post-content a')]
    all_links.append(article_links)
    num_pages = int(soup.select_one('.page-number').text.split('/')[1])

    for page in range(2, num_pages + 1):
        r = s.get(url.format(page))
        soup = bs(r.content, 'lxml')
        article_links = ['https://teonite.com' + item['href'][2:] for item in soup.select('.post-content a')]
        all_links.append(article_links)

    all_links = [item for i in all_links for item in i]  # flatten list of lists

    for article in all_links:
        #print(article)
        r = s.get(article, headers=headers)
        soup = bs(r.content, 'lxml')
        # strip non-visible elements before extracting the text
        [t.extract() for t in soup(['style', 'script', '[document]', 'head', 'title'])]
        visible_text = soup.getText()  # taken from https://stackoverflow.com/a/19760007/6241235 @nmgeek
        # here I think you need to consider IP rotation/User-Agent changing
        try:
            print(soup.select_one('.post-title').text)
        except:
            # .post-title missing - probably a blocked/error page
            print(article)
            print(soup.select_one('h1').text)
            break
        # do something with text
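As a rough illustration of the User-Agent rotation mentioned above (the UA strings here are placeholders to swap for a longer, realistic list, and real IP rotation would need a proxy pool, which I haven't shown):

from itertools import cycle

import requests

# placeholder User-Agent strings - substitute a realistic list
user_agents = cycle([
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15)',
    'Mozilla/5.0 (X11; Linux x86_64)',
])

with requests.Session() as s:
    for article in all_links:  # all_links built as in the script above
        r = s.get(article, headers={'User-Agent': next(user_agents)})  # new UA each request
        # parse r.content with BeautifulSoup as before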
Adding in selenium seems to definitely solve the bad-request problem of being blocked:
import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver

url = 'https://teonite.com/blog/page/{}/index.html'
all_links = []

with requests.Session() as s:
    r = s.get('https://teonite.com/blog/')
    soup = bs(r.content, 'lxml')
    article_links = ['https://teonite.com' + item['href'][2:] for item in soup.select('.post-content a')]
    all_links.append(article_links)
    num_pages = int(soup.select_one('.page-number').text.split('/')[1])

    for page in range(2, num_pages + 1):
        r = s.get(url.format(page))
        soup = bs(r.content, 'lxml')
        article_links = ['https://teonite.com' + item['href'][2:] for item in soup.select('.post-content a')]
        all_links.append(article_links)

all_links = [item for i in all_links for item in i]

d = webdriver.Chrome()

for article in all_links:
    d.get(article)
    soup = bs(d.page_source, 'lxml')
    [t.extract() for t in soup(['style', 'script', '[document]', 'head', 'title'])]
    visible_text = soup.getText()  # taken from https://stackoverflow.com/a/19760007/6241235 @nmgeek
    try:
        print(soup.select_one('.post-title').text)
    except:
        print(article)
        print(soup.select_one('h1').text)
        break  # for debugging
    # do something with text

d.quit()
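(If you keep the selenium version, wrapping the loop in try/finally so d.quit() always runs would be a bit safer.) As for the "do something with text" step, here is a minimal sketch of the nltk handling I guessed at above, assuming a word-frequency count over visible_text is the goal (the download calls only need to run once):

from collections import Counter

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('punkt')      # one-off downloads
nltk.download('stopwords')

visible_text = 'text pulled from an article page'  # use the visible_text from the loop above
# keep alphabetic tokens, lower-cased, minus English stopwords
words = [w.lower() for w in word_tokenize(visible_text) if w.isalpha()]
words = [w for w in words if w not in stopwords.words('english')]
print(Counter(words).most_common(10))  # ten most common content words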