I need a parser for rbc. I found a solution, i'm not sure if the parser logic works, but the code from two years ago does not work. I don't know what is the problem here. What is the problem with json? Thanks.
import requests as rq
from bs4 import BeautifulSoup as bs
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from IPython import display
class rbc_parser:
def __init__(self):
pass
def _get_url(self, param_dict: dict) -> str:
url = 'https://www.rbc.ru/v10/search/ajax/?\
project={0}&\
category={1}&\
dateFrom={2}&\
dateTo={3}&\
offset={4}&\
limit={5}&\
query={6}&\
material={7}'.format(param_dict['project'],
param_dict['category'],
param_dict['dateFrom'],
param_dict['dateTo'],
param_dict['offset'],
param_dict['limit'],
param_dict['query'],
param_dict['material'])
return url
def _get_search_table(self, param_dict: dict,
includeText: bool = True) -> pd.DataFrame:
url = self._get_url(param_dict)
r = rq.get(url)
search_table = pd.DataFrame(r.json()['items'])
if includeText and not search_table.empty:
get_text = lambda x: self._get_article_data(x['fronturl'])
search_table[['overview', 'text']] = search_table.apply(get_text,
axis=1).tolist()
return search_table.sort_values('publish_date_t', ignore_index=True)
def _get_article_data(self, url: str):
r = rq.get(url)
soup = bs(r.text, features="lxml") # features="lxml" чтобы не было warning
div_overview = soup.find('div', {'class': 'article__text__overview'})
if div_overview:
overview = div_overview.text.replace('<br />','\n').strip()
else:
overview = None
p_text = soup.find_all('p')
if p_text:
text = ' '.join(map(lambda x:
x.text.replace('<br />','\n').strip(),
p_text))
else:
text = None
return overview, text
def get_articles(self,
param_dict,
time_step = 7,
save_every = 5,
save_excel = True) -> pd.DataFrame:
param_copy = param_dict.copy()
time_step = timedelta(days=time_step)
dateFrom = datetime.strptime(param_copy['dateFrom'], '%d.%m.%Y')
dateTo = datetime.strptime(param_copy['dateTo'], '%d.%m.%Y')
if dateFrom > dateTo:
raise ValueError('dateFrom should be less than dateTo')
out = pd.DataFrame()
save_counter = 0
while dateFrom <= dateTo:
param_copy['dateTo'] = (dateFrom + time_step).strftime("%d.%m.%Y")
if dateFrom + time_step > dateTo:
param_copy['dateTo'] = dateTo.strftime("%d.%m.%Y")
print('Parsing articles from ' + param_copy['dateFrom'] + ' to ' + param_copy['dateTo'])
out = out.append(self._get_search_table(param_copy), ignore_index=True)
dateFrom += time_step + timedelta(days=1)
param_copy['dateFrom'] = dateFrom.strftime("%d.%m.%Y")
save_counter += 1
if save_counter == save_every:
display.clear_output(wait=True)
out.to_excel("/tmp/checkpoint_table.xlsx")
print('Checkpoint saved!')
save_counter = 0
if save_excel:
out.to_excel("rbc_{}_{}.xlsx".format(
param_dict['dateFrom'],
param_dict['dateTo']))
print('Finish')
return out
query = 'rbc'
project = "rbcnews"
category = "TopRbcRu_economics"
material = ""
dateFrom = '2021-01-01'
dateTo = "2021-02-28"
offset = 0
limit = 100
param_dict = {'query' : query,
'project' : project,
'category': category,
'dateFrom': datetime.
strptime(dateFrom, '%Y-%m-%d').
strftime('%d.%m.%Y'),
'dateTo' : datetime.
strptime(dateTo, '%Y-%m-%d').
strftime('%d.%m.%Y'),
'offset' : str(offset),
'limit' : str(limit),
'material': material}
parser = rbc_parser()
tbl = parser._get_search_table(param_dict, includeText = True)
print(len(tbl))
tbl.head()
table = parser.get_articles(param_dict=param_dict,
time_step = 7,
save_every = 5,
save_excel = True)
print(len(table))
table.head()
Traceback:
\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\requests\models.py", line 900, in json
return complexjson.loads(self.text, **kwargs)
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.9_3.9.3568.0_x64__qbz5n2kfra8p0\lib\json\__init__.py", line 346, in loads
return _default_decoder.decode(s)
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.9_3.9.3568.0_x64__qbz5n2kfra8p0\lib\json\decoder.py", line 337, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.9_3.9.3568.0_x64__qbz5n2kfra8p0\lib\json\decoder.py", line 355, in raw_decode
raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
Unfortunately I don't understand what is the json error. Empty query question? Or perhaps there are working parsers? Unfortunately, this is one of the newest ones I've found.