I would recommend that you review the newspaper overview document that I published on GitHub. The document has multiple extraction examples and other techniques that might be useful.
Concerning your question...
Newspaper3K will parse certain websites nearly flawlessly. But there are plenty of websites that will require reviewing a page's navigational structure to determine how to parse the article elements correctly.
For instance, https://www.marketwatch.com has individual article elements, such as title, publish date and other items, stored within the meta tag section of the page.
The newspaper example below will parse the elements correctly. I noted that you might need to do some data cleaning of the keyword or tag output.
"""Scrape article elements from MarketWatch with newspaper3k.

Most article elements (title, publish date, authors, keywords, tags,
summary) are stored in the page's meta tags, so they are extracted
from ``Article.meta_data`` rather than from the parsed article body.
"""
import newspaper
from newspaper import Article
from newspaper import Config

USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:78.0) Gecko/20100101 Firefox/78.0'

config = Config()
config.browser_user_agent = USER_AGENT
config.request_timeout = 10

base_url = 'https://www.marketwatch.com'
article_urls = set()
marketwatch = newspaper.build(base_url, config=config, memoize_articles=False, language='en')

for sub_article in marketwatch.articles:
    # Skip duplicates *before* downloading, so a repeated URL does not
    # cost an extra network fetch and parse.
    if sub_article.url in article_urls:
        continue
    article_urls.add(sub_article.url)

    article = Article(sub_article.url, config=config, memoize_articles=False, language='en')
    article.download()
    article.parse()

    # The majority of the article elements are located within the
    # meta data section of the page's navigational structure.
    article_meta_data = article.meta_data

    # Each meta key is unique, so these sets hold at most one value;
    # join() flattens them to plain strings.
    published_date = {value for (key, value) in article_meta_data.items() if key == 'parsely-pub-date'}
    article_published_date = " ".join(str(x) for x in published_date)

    authors = sorted({value for (key, value) in article_meta_data.items() if key == 'parsely-author'})
    article_author = ', '.join(authors)

    title = {value for (key, value) in article_meta_data.items() if key == 'parsely-title'}
    article_title = " ".join(str(x) for x in title)

    keywords = ''.join({value for (key, value) in article_meta_data.items() if key == 'keywords'})
    keywords_list = sorted(keywords.lower().split(','))
    article_keywords = ', '.join(keywords_list)

    tags = ''.join({value for (key, value) in article_meta_data.items() if key == 'parsely-tags'})
    tag_list = sorted(tags.lower().split(','))
    article_tags = ', '.join(tag_list)

    summary = {value for (key, value) in article_meta_data.items() if key == 'description'}
    article_summary = " ".join(str(x) for x in summary)

    # the replace is used to remove newlines
    article_text = article.text.replace('\n', '')
    print(article_text)
https://www.euronews.com is similar to https://www.marketwatch.com,
except that some of the article elements are located in the main body of the page and other items are within the meta tag section.
"""Scrape article elements from euronews.com with newspaper3k.

Some elements (e.g. the title) come straight from the parsed article
body, while the rest are pulled out of ``Article.meta_data``.
"""
import newspaper
from newspaper import Article
from newspaper import Config

USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:78.0) Gecko/20100101 Firefox/78.0'

config = Config()
config.browser_user_agent = USER_AGENT
config.request_timeout = 10

base_url = 'https://www.euronews.com'
article_urls = set()
euronews = newspaper.build(base_url, config=config, memoize_articles=False, language='en')

for sub_article in euronews.articles:
    # Guard clause: skip URLs that were already processed.
    if sub_article.url in article_urls:
        continue
    article_urls.add(sub_article.url)

    article = Article(sub_article.url, config=config, memoize_articles=False, language='en')
    article.download()
    article.parse()

    # The majority of the article elements are located within the
    # meta data section of the page's navigational structure.
    meta = article.meta_data

    # Meta keys are unique, so each lookup yields at most one value.
    article_published_date = " ".join(str(v) for k, v in meta.items() if k == 'date.created')
    article_title = article.title
    article_summary = " ".join(str(v) for k, v in meta.items() if k == 'description')

    raw_keywords = ''.join({v for k, v in meta.items() if k == 'keywords'})
    article_keywords = ', '.join(sorted(raw_keywords.lower().split(','))).strip()

    # the replace is used to remove newlines
    article_text = article.text.replace('\n', '')