0

I am running a Python script in Jupyter Notebook that connects to a MongoDB database to process text documents (news articles). The script runs fine for the first few batches of data, but then terminates with the following error:

AutoReconnect: localhost:27017: [WinError 10054] An existing connection was forcibly closed by the remote host (full error below).

I have already tried deleting the mongod.lock file, as suggested in "Pymongo keeps refusing the connection at 27017"; however, this hasn't solved the issue.

This is my function (it also calls a few helper functions that are not included here):

    data_list = []
    for collection in mongo_collections_dir:
        mongo_collection = mongo_collections_dir[collection]
        filter_dict = {"file_info.source": source}
        if filter_year:
            filter_dict["extracted.publication_date.year"] = filter_year
        elif min_year:
            print('filter year not found') # - note added by dror for debug 24/10/2019
            filter_dict["extracted.publication_date.year"] = {"$gt": min_year}
        source_count = mongo_collection.count(filter_dict)
        print("{} articles found in collection {} {}".format(source_count, collection, filter_year))
        if source_count == "0":
            continue
        docs = mongo_collection.find(filter_dict, no_cursor_timeout=True)
        if not docs:
            print("source {} was not found in collection {}".format(source, collection))
            continue
        for pos, doc in enumerate(docs):
            if pos % 100000 == 0:
                print("processed {} articles out of {} from {}".format(pos, source_count, source))
            try:
                text = doc["body"]["content"]
            except KeyError:
#                 print('no body')
                continue  
            if clean_text:
                clean_text = mpd_clean_text(text, stop_words)
            else:
                clean_text = ''
            try:
                title = doc['body']['head']['hedline']
#                 author = doc['body']['head']['byline']
                temp_dir = {"collection": mongo_collection.name, "source": doc["file_info"]["source"], 
                            "urn": doc["urn"],  "title": title,
                            'unit_text': text, 'clean text': clean_text, 
                           }
            except KeyError:
                temp_dir = {"collection": mongo_collection.name, "source": doc["file_info"]["source"], "urn": doc["urn"],
                           'unit_text': text, 'clean text': clean_text}  

            try:
                publication_date = get_dt(doc["extracted"]["publication_date"])
                temp_dir['publication_date'] = publication_date
            except KeyError:
                print('no extracted')
            try:
                temp_dir['section'] = doc['extracted']['section']
            except KeyError:
                pass
            try:
                temp_dir['publication_name'] = doc['extracted']['publication_name']
            except KeyError:
                pass

            if temp_dir:
#                 temp_dir['section'] = section
                data_list.append(temp_dir)
#     df = pd.DataFrame(data_list)
#     df['section'] = section
    return pd.DataFrame(data_list)

Full error:

~\Anaconda3\envs\py35\lib\site-packages\pymongo\pool.py in command(self, dbname, spec, slave_ok, read_preference, codec_options, check, allowable_errors, check_keys, read_concern, write_concern, parse_write_concern_error, collation, session, client, retryable_write, publish_events)
    578                            use_op_msg=self.op_msg_enabled,
--> 579                            unacknowledged=unacknowledged)
    580         except OperationFailure:

~\Anaconda3\envs\py35\lib\site-packages\pymongo\network.py in command(sock, dbname, spec, slave_ok, is_mongos, read_preference, codec_options, session, client, check, allowable_errors, address, check_keys, listeners, max_bson_size, read_concern, parse_write_concern_error, collation, compression_ctx, use_op_msg, unacknowledged)
    140         else:
--> 141             reply = receive_message(sock, request_id)
    142             unpacked_docs = reply.unpack_response(codec_options=codec_options)

~\Anaconda3\envs\py35\lib\site-packages\pymongo\network.py in receive_message(sock, request_id, max_message_size)
    172     length, _, response_to, op_code = _UNPACK_HEADER(
--> 173         _receive_data_on_socket(sock, 16))
    174     # No request_id for exhaust cursor "getMore".

~\Anaconda3\envs\py35\lib\site-packages\pymongo\network.py in _receive_data_on_socket(sock, length)
    231             try:
--> 232                 chunk_length = sock.recv_into(mv[bytes_read:])
    233             except (IOError, OSError) as exc:

ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host

During handling of the above exception, another exception occurred:

AutoReconnect                             Traceback (most recent call last)
<ipython-input-14-37a7effc9859> in <module>
----> 1 collect_df_by_sources(collections_dir, sources, newspapers_df, filter_year='2008')

<ipython-input-8-bbd9e966e95c> in collect_df_by_sources(collections_dir, sources, newspapers_df, return_df, filter_year, min_year, override)
     20 
     21         print("collect articles from: {}, {}".format(source_name, source_id))
---> 22         source_df = collect_df_by_source(collections_dir, source_id, filter_year=filter_year, min_year=min_year)
     23         source_df.to_csv(source_path, encoding='utf8')
     24         if return_df:

<ipython-input-7-82d129ff329c> in collect_df_by_source(mongo_collections_dir, source, clean_text, filter_year, min_year)
      9             print('filter year not found') # - note added by dror for debug 24/10/2019
     10             filter_dict["extracted.publication_date.year"] = {"$gt": min_year}
---> 11         source_count = mongo_collection.count(filter_dict)
     12         print("{} articles found in collection {} {}".format(source_count, collection, filter_year))
     13         if source_count == "0":

~\Anaconda3\envs\py35\lib\site-packages\pymongo\collection.py in count(self, filter, session, **kwargs)
   1764         collation = validate_collation_or_none(kwargs.pop('collation', None))
   1765         cmd.update(kwargs)
-> 1766         return self._count(cmd, collation, session)
   1767 
   1768     def create_indexes(self, indexes, session=None, **kwargs):

~\Anaconda3\envs\py35\lib\site-packages\pymongo\collection.py in _count(self, cmd, collation, session)
   1570                 read_concern=self.read_concern,
   1571                 collation=collation,
-> 1572                 session=session)
   1573         if res.get("errmsg", "") == "ns missing":
   1574             return 0

~\Anaconda3\envs\py35\lib\site-packages\pymongo\collection.py in _command(self, sock_info, command, slave_ok, read_preference, codec_options, check, allowable_errors, read_concern, write_concern, collation, session, retryable_write)
    242                 session=s,
    243                 client=self.__database.client,
--> 244                 retryable_write=retryable_write)
    245 
    246     def __create(self, options, collation, session):

~\Anaconda3\envs\py35\lib\site-packages\pymongo\pool.py in command(self, dbname, spec, slave_ok, read_preference, codec_options, check, allowable_errors, check_keys, read_concern, write_concern, parse_write_concern_error, collation, session, client, retryable_write, publish_events)
    582         # Catch socket.error, KeyboardInterrupt, etc. and close ourselves.
    583         except BaseException as error:
--> 584             self._raise_connection_failure(error)
    585 
    586     def send_message(self, message, max_doc_size):

~\Anaconda3\envs\py35\lib\site-packages\pymongo\pool.py in _raise_connection_failure(self, error)
    741         self.close()
    742         if isinstance(error, socket.error):
--> 743             _raise_connection_failure(self.address, error)
    744         else:
    745             raise error

~\Anaconda3\envs\py35\lib\site-packages\pymongo\pool.py in _raise_connection_failure(address, error, msg_prefix)
    281         raise NetworkTimeout(msg)
    282     else:
--> 283         raise AutoReconnect(msg)
    284 
    285 

AutoReconnect: localhost:27017: [WinError 10054] An existing connection was forcibly closed by the remote host


Dror M
  • 63
  • 8
  • Questions seeking debugging help (**"why isn't this code working?"**) must include the desired behavior, a *specific problem or error and the shortest code necessary* to reproduce it **in the question itself**. Questions without a **clear problem statement** are not useful to other readers. See: [How to create a Minimal, Complete, and Verifiable example.](http://stackoverflow.com/help/mcve) – Neil Lunn Oct 26 '19 at 12:52
  • You're chasing red herrings here. Nothing to do with the server and nothing to do with the library. It's **your code** which is the thing at fault here. Read the link and provide the **minimal** code in order to reproduce the problem. Or, in the process of doing that, instead realize what you did wrong. Which again is kind of the point of providing the reproducible example. – Neil Lunn Oct 26 '19 at 12:55

0 Answers0