1

I'm trying to use Airflow to schedule web scraping tasks, dumping the results to MongoDB on my local machine. I am using the puckel/docker-airflow image, modifying it to include MongoDB as an additional service.

I have tried various solutions posted on here, including:

  1. Use links/networks;
  2. Use the mongo container name with MongoClient;
  3. Set the mongod.conf bind_ip to 0.0.0.0 and bind the mongo container's published port to the host IP 172.17.0.1;

but I still face the same problem.

I'm doing something wrong but I'm not entirely sure what it is.

Here is the modified docker-compose file:

version: '3'
services:
  postgres:
    image: postgres:9.6
    environment:
      - POSTGRES_USER=airflow
      - POSTGRES_PASSWORD=airflow
      - POSTGRES_DB=airflow
    # link to common network
    networks:
      - app_tier

  # Custom mongo db
  mongo:
    image: mongo:3.6.3
    restart: always
    volumes:
      # Persist mongod's data on the host
      - /data/db:/data/db
    ports:
      # Publish on all host interfaces. Binding to 172.17.0.1 (the docker0
      # bridge IP) made mongo unreachable via localhost on the host.
      - "27017:27017"
    networks:
      - app_tier

  webserver:
    image: puckel/docker-airflow:1.10.2
    restart: always
    depends_on:
      - postgres
      # webserver connects to mongo at mongodb://mongo:27017 — start mongo first
      - mongo
    environment:
      - LOAD_EX=n
      - EXECUTOR=Local
    volumes:
      - ./dags:/usr/local/airflow/dags
      # Uncomment to include custom plugins
      # - ./plugins:/usr/local/airflow/plugins
      # Custom python package
      - ./requirements.txt:/requirements.txt
      # FIFA file path
      - ~/FIFA:/FIFA
      # NOTE: removed the /data/db:/data/db mount — the webserver reaches
      # mongo over the network; sharing a live mongod data directory
      # between two containers risks data corruption.
    # link to common network
    networks:
      - app_tier
    ports:
      - "8080:8080"
    command: webserver
    healthcheck:
      test: ["CMD-SHELL", "[ -f /usr/local/airflow/airflow-webserver.pid ]"]
      interval: 30s
      timeout: 30s
      retries: 3

networks:
  app_tier:
    driver: bridge

I am using mongodb://mongo:27017 to connect to MongoDB.

In my logs, I get the following error:

pymongo.errors.ServerSelectionTimeoutError: localhost:27017: [Errno 111] Connection refused

Any idea what I am doing wrong?

TIA!

NOTE: I have looked at the answers in this section:

From inside of a Docker container, how do I connect to the localhost of the machine?

but I am having difficulty implementing it in a docker-compose file.

Running the individual containers is challenging, as the entrypoint.sh script for the puckel/docker-airflow image depends on postgres running (and I don't know how to make it run the same way on my local machine). Even so, running each service individually is a bit tedious. I tried running a personal python image and successfully dumped the results from the container into my local machine, but I don't know how to do the same with the puckel/docker-airflow image, hence I am stuck.

Is there a solution for this but with docker-compose?

EDIT: It seems like docker can read from my local machine, but cannot write to it. If I have mongod running on my local machine, I get logs indicating that a connection has been made to my docker container, and that data was sent to it:

2019-06-04T15:51:34.299-0400 I NETWORK  [listener] connection accepted from 172.23.0.3:48768 #8 (8 connections now open)
2019-06-04T15:51:34.299-0400 I NETWORK  [conn8] received client metadata from 172.23.0.3:48768 conn: { driver: { name: "PyMongo", version: "3.8.0" }, os: { type: "Linux", na    me: "Linux", architecture: "x86_64", version: "4.15.0-48-generic" }, platform: "CPython 3.6.8.final.0" }
2019-06-04T15:51:34.550-0400 I COMMAND  [conn8] command agents_proxies.user_agents command: getMore { getMore: 20847821675, collection: "user_agents", lsid: { id: UUID("69b1    fd25-36f8-49a4-8a14-bafc83483abb") }, $db: "agents_proxies", $readPreference: { mode: "primary" } } originatingCommand: { find: "user_agents", filter: { $and: [ { $or: [ { O    S: "Windows" }, { OS: "Mac OS X" }, { OS: "macOS" }, { OS: "Linux" } ] }, { $or: [ { hardware_type: "Computer" }, { hardware_type: "Windows" }, { hardware_type: "Linux" }, {     hardware_type: "Mac" } ] }, { $or: [ { popularity: "Very common" }, { popularity: "Common" } ] } ] }, projection: { _id: 0, user_agent: 1 }, lsid: { id: UUID("69b1fd25-36f8    -49a4-8a14-bafc83483abb") }, $db: "agents_proxies", $readPreference: { mode: "primaryPreferred" } } planSummary: COLLSCAN cursorid:20847821675 keysExamined:0 docsExamined:10    9441 cursorExhausted:1 numYields:855 nreturned:1188 reslen:163454 locks:{ Global: { acquireCount: { r: 1712 } }, Database: { acquireCount: { r: 856 } }, Collection: { acquir    eCount: { r: 856 } } } protocol:op_msg 248ms

However, when my python script attempts to store the data I receive the connection refused message from pymongo. I'm starting to think this has something to do with the airflow Dockerfile or the entrypoint.sh script.

Mohamed
  • 51
  • 3
  • 9
  • **EDIT**: So it turns out the reason I was getting the connection refused message was because I had altered my spider's settings. Turns out if a MONGO_URI is not stated in scrapy's custom_settings option, it will default to mongodb://localhost:27017, even if the project settings file defines a unique uri. That explains why I could connect to MongoDB, and receive information, but not store any data inside. This also explains why it would run with no problems on my local machine. – Mohamed Jun 05 '19 at 20:23

0 Answers