I need a parser for rbc.ru. I found a solution, but I'm not sure whether the parser logic still works: the code is two years old and no longer runs, and I can't tell what the problem is. What is the problem with the JSON? Thanks.

import requests as rq
from bs4 import BeautifulSoup as bs
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from IPython import display

class rbc_parser:
    def __init__(self):
        pass
    def _get_url(self, param_dict: dict) -> str:
        url = 'https://www.rbc.ru/v10/search/ajax/?\
        project={0}&\
        category={1}&\
        dateFrom={2}&\
        dateTo={3}&\
        offset={4}&\
        limit={5}&\
        query={6}&\
        material={7}'.format(param_dict['project'],
                            param_dict['category'],
                            param_dict['dateFrom'],
                            param_dict['dateTo'],
                            param_dict['offset'],
                            param_dict['limit'],
                            param_dict['query'],
                            param_dict['material'])
        
        return url
    def _get_search_table(self, param_dict: dict,
                          includeText: bool = True) -> pd.DataFrame:
        url = self._get_url(param_dict)
        r = rq.get(url)
        search_table = pd.DataFrame(r.json()['items'])
        if includeText and not search_table.empty:
            get_text = lambda x: self._get_article_data(x['fronturl'])
            search_table[['overview', 'text']] = search_table.apply(get_text,
                                                                    axis=1).tolist()
            
        return search_table.sort_values('publish_date_t', ignore_index=True)
    
    def _get_article_data(self, url: str):

        r = rq.get(url)
        soup = bs(r.text, features="lxml")  # features="lxml" to avoid a parser warning
        div_overview = soup.find('div', {'class': 'article__text__overview'})
        if div_overview:
            overview = div_overview.text.replace('<br />','\n').strip()
        else:
            overview = None
        p_text = soup.find_all('p')
        if p_text:
            text = ' '.join(map(lambda x:
                                x.text.replace('<br />','\n').strip(),
                                p_text))
        else:
            text = None
        
        return overview, text 
    
    def get_articles(self,
                     param_dict,
                     time_step = 7,
                     save_every = 5,
                     save_excel = True) -> pd.DataFrame:

        param_copy = param_dict.copy()
        time_step = timedelta(days=time_step)
        dateFrom = datetime.strptime(param_copy['dateFrom'], '%d.%m.%Y')
        dateTo = datetime.strptime(param_copy['dateTo'], '%d.%m.%Y')
        if dateFrom > dateTo:
            raise ValueError('dateFrom should be less than dateTo')
        
        out = pd.DataFrame()
        save_counter = 0

        while dateFrom <= dateTo:
            param_copy['dateTo'] = (dateFrom + time_step).strftime("%d.%m.%Y")
            if dateFrom + time_step > dateTo:
                param_copy['dateTo'] = dateTo.strftime("%d.%m.%Y")
            print('Parsing articles from ' + param_copy['dateFrom'] +  ' to ' + param_copy['dateTo'])
            out = out.append(self._get_search_table(param_copy), ignore_index=True)
            dateFrom += time_step + timedelta(days=1)
            param_copy['dateFrom'] = dateFrom.strftime("%d.%m.%Y")
            save_counter += 1
            if save_counter == save_every:
                display.clear_output(wait=True)
                out.to_excel("/tmp/checkpoint_table.xlsx")
                print('Checkpoint saved!')
                save_counter = 0
        
        if save_excel:
            out.to_excel("rbc_{}_{}.xlsx".format(
                param_dict['dateFrom'],
                param_dict['dateTo']))
        print('Finish')
        return out

query = 'rbc'
project = "rbcnews"
category = "TopRbcRu_economics"
material = ""
dateFrom = '2021-01-01'
dateTo = "2021-02-28"
offset = 0
limit = 100

param_dict = {'query'   : query,
              'project' : project,
              'category': category,
              'dateFrom': datetime.strptime(dateFrom, '%Y-%m-%d').strftime('%d.%m.%Y'),
              'dateTo'  : datetime.strptime(dateTo, '%Y-%m-%d').strftime('%d.%m.%Y'),
              'offset'  : str(offset),
              'limit'   : str(limit),
              'material': material}

parser = rbc_parser()
tbl = parser._get_search_table(param_dict, includeText = True)
print(len(tbl))
tbl.head()

table = parser.get_articles(param_dict=param_dict,
                             time_step = 7,
                             save_every = 5,
                             save_excel = True)
print(len(table))
table.head()

Traceback:

  \PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\requests\models.py", line 900, in json
    return complexjson.loads(self.text, **kwargs)
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.9_3.9.3568.0_x64__qbz5n2kfra8p0\lib\json\__init__.py", line 346, in loads
    return _default_decoder.decode(s)
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.9_3.9.3568.0_x64__qbz5n2kfra8p0\lib\json\decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.9_3.9.3568.0_x64__qbz5n2kfra8p0\lib\json\decoder.py", line 355, in raw_decode
    raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)

Unfortunately I don't understand what this JSON error means. Is it because the query comes back empty? Or perhaps there are working parsers out there? Unfortunately, this is one of the newest ones I've found.

  • The first step would be to print out the actual text of the response, before trying to decode it as JSON. My guess is that it contains an error message, due to your malformed URL (you're putting in a lot of unwanted spaces, from the indentation on the following lines of the string); see the sketch after these comments. – jasonharper Mar 12 '23 at 18:14
  • Yes, I believe that is the problem. The request was not fully successful, therefore the response does not actually contain json, therefore `r.json()` fails. – John Gordon Mar 12 '23 at 18:42
  • Search for the error message, there are literally hundreds of similar questions here. Also, [tour] and [ask]. – Ulrich Eckhardt Mar 12 '23 at 20:24
  • Does this answer your question? [JSONDecodeError: Expecting value: line 1 column 1 (char 0)](https://stackoverflow.com/questions/16573332/jsondecodeerror-expecting-value-line-1-column-1-char-0) – Ulrich Eckhardt Mar 12 '23 at 20:24
  • Please trim your code to make it easier to find your problem. Follow these guidelines to create a [minimal reproducible example](https://stackoverflow.com/help/minimal-reproducible-example). – Community Mar 13 '23 at 00:50
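
Applying these suggestions takes only a few lines before the .json() call; a minimal sketch, reusing the parser and param_dict defined above:

r = rq.get(parser._get_url(param_dict))
print(r.status_code)                  # a 404 here would mean the endpoint is gone
print(r.headers.get('Content-Type'))  # a JSON API should answer with application/json
print(r.text[:500])                   # the first bytes show what .json() actually choked on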

1 Answer


My bad: I trusted a link to a page that no longer exists, so the JSON came back empty. It was only necessary to change the request to

url = 'https://www.rbc.ru/search/ajax/?\
project={0}&\
category={1}&\
dateFrom={2}&\
dateTo={3}&\
offset={4}&\
limit={5}&\
query={6}&\
material={7}'.format(param_dict['project'], param_dict['category'],
                     param_dict['dateFrom'], param_dict['dateTo'],
                     param_dict['offset'], param_dict['limit'],
                     param_dict['query'], param_dict['material'])

i.e. the same query string as before, just without the /v10 path segment (and without the indentation leaking into the string).
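
A cleaner variant, offered here only as a sketch (fetch_search_items is a hypothetical helper, not part of the original answer): pass the parameters as a dict and let requests build the query string, which also rules out the stray-indentation problem the commenters pointed out.

import requests as rq

def fetch_search_items(param_dict: dict) -> list:
    # requests URL-encodes and joins the parameters itself, so no manual
    # string formatting (and no leaked indentation) can corrupt the URL
    r = rq.get('https://www.rbc.ru/search/ajax/', params=param_dict)
    r.raise_for_status()  # surface 404/500 instead of decoding an HTML error page
    return r.json()['items']

items = fetch_search_items(param_dict)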