
I created a very simple ETL solution that basically reads CSV files from a directory called dataset, processes them with PySpark, and loads them into a MySQL database. Everything is orchestrated by Airflow, which runs in Docker.

The problem is that when I trigger my DAG, it fails with an error saying "JAVA_HOME is not set".

Does anyone know how I can configure JAVA_HOME inside the Docker containers?
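From what I have read, one way to do this would be to pass the variable through the environment block of the compose file, so the containers that run PySpark can see it. Below is a minimal sketch of that idea for the webserver service, assuming the Airflow image already ships a JDK (the path is only a guess and depends on the image; if no JDK is installed at all, it would have to be added to the image itself first):

    webserver:
        image: neylsoncrepalde/airflow-docker:2.0.0-pymongo
        environment:
            - LOAD_EX=n
            - EXECUTOR=Celery
            # hypothetical path: JAVA_HOME must point at the JDK inside this
            # container, not at a path on the host machine
            - JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64

I imagine the same variable would also have to be set on the scheduler and worker services, since that is where the task that calls PySpark actually runs. Would something like this work, or is there a better way?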

Below is my docker-compose.yml:

version: '2.2'
services:

redis:
    image: 'redis:5.0.5'
    # command: redis-server --requirepass redispass

postgres:
    image: postgres:9.6
    environment:
        - POSTGRES_USER=airflow
        - POSTGRES_PASSWORD=airflow
        - POSTGRES_DB=airflow
    ports:
        - "5432:5432"
    # Uncomment these lines to persist data on the local filesystem.
    #     - PGDATA=/var/lib/postgresql/data/pgdata
    # volumes:
    #     - ./pgdata:/var/lib/postgresql/data/pgdata

webserver:
    image: neylsoncrepalde/airflow-docker:2.0.0-pymongo
    restart: always
    depends_on:
        - postgres
        - redis
    environment:
        - LOAD_EX=n
        - FERNET_KEY=46BKJoQYlPPOexq0OhDZnIlNepKFf87WFwLbfzqDDho=
        - EXECUTOR=Celery
        # - POSTGRES_USER=airflow
        # - POSTGRES_PASSWORD=airflow
        # - POSTGRES_DB=airflow
        # - REDIS_PASSWORD=redispass
    volumes:
        - ./dags:/usr/local/airflow/dags
        - ./data:/usr/local/airflow/data
        - ../../MYSQL:/usr/local/airflow/myslq
        # Uncomment to include custom plugins
        # ./plugins:/usr/local/airflow/plugins
    ports:
        - "8080:8080"
    command: webserver
    healthcheck:
        test: ["CMD-SHELL", "[ -f /usr/local/airflow/airflow-webserver.pid ]"]
        interval: 30s
        timeout: 30s
        retries: 3

flower:
    image: neylsoncrepalde/airflow-docker:2.0.0-pymongo
    restart: always
    depends_on:
        - redis
    environment:
        - EXECUTOR=Celery
        # - REDIS_PASSWORD=redispass
    ports:
        - "5555:5555"
    command: celery flower

scheduler:
    image: neylsoncrepalde/airflow-docker:2.0.0-pymongo
    restart: always
    depends_on:
        - webserver
    volumes:
        - ./dags:/usr/local/airflow/dags
        - ./data:/usr/local/airflow/data
        - ../../MYSQL:/usr/local/airflow/myslq
        # Uncomment to include custom plugins
        # - ./plugins:/usr/local/airflow/plugins
    environment:
        - LOAD_EX=n
        - FERNET_KEY=46BKJoQYlPPOexq0OhDZnIlNepKFf87WFwLbfzqDDho=
        - EXECUTOR=Celery
        # - POSTGRES_USER=airflow
        # - POSTGRES_PASSWORD=airflow
        # - POSTGRES_DB=airflow
        # - REDIS_PASSWORD=redispass
    command: scheduler

worker:
    image: neylsoncrepalde/airflow-docker:2.0.0-pymongo
    restart: always
    depends_on:
        - scheduler
    volumes:
        - ./dags:/usr/local/airflow/dags
        - ./data:/usr/local/airflow/data
        - ../../MYSQL:/usr/local/airflow/myslq
        # Uncomment to include custom plugins
        # ./plugins:/usr/local/airflow/plugins
    environment:
        - FERNET_KEY=46BKJoQYlPPOexq0OhDZnIlNepKFf87WFwLbfzqDDho=
        - EXECUTOR=Celery
        - SPARK=spark://spark:7077
        # - POSTGRES_USER=airflow
        # - POSTGRES_PASSWORD=airflow
        # - POSTGRES_DB=airflow
        # - REDIS_PASSWORD=redispass
    command: celery worker

mysql:
    image: mysql
    environment:
        - MYSQL_ALLOW_EMPTY_PASSWORD=1
    restart: always

spark:
    image: bitnami/spark:3.1.2
    user: root # Run container as root container: https://docs.bitnami.com/tutorials/work-with-non-root-containers/
    hostname: spark
    networks:
        - default_net
    environment:
        - SPARK_MODE=master
        - SPARK_RPC_AUTHENTICATION_ENABLED=no
        - SPARK_RPC_ENCRYPTION_ENABLED=no
        - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
        - SPARK_SSL_ENABLED=no
    volumes:
        - ../spark/app:/usr/local/airflow/myslq # Spark scripts folder (Must be the same path in airflow and Spark Cluster)
        - ../spark/resources:/usr/local/airflow/myslq #Resources folder (Must be the same path in airflow and Spark Cluster)
    ports:
        - "8181:8080"
        - "7077:7077"

spark-worker-1:
    image: bitnami/spark:3.1.2
    user: root
    networks:
        - default_net
    environment:
        - SPARK_MODE=worker
        - SPARK_MASTER_URL=spark://spark:7077
        - SPARK_WORKER_MEMORY=1G
        - SPARK_WORKER_CORES=1
        - SPARK_RPC_AUTHENTICATION_ENABLED=no
        - SPARK_RPC_ENCRYPTION_ENABLED=no
        - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
        - SPARK_SSL_ENABLED=no
    volumes:
        - ../spark/app:/usr/local/airflow/myslq # Spark scripts folder (Must be the same path in airflow and Spark Cluster)
        - ../spark/resources:/usr/local/airflow/myslq #Resources folder (Must be the same path in airflow and Spark Cluster)

spark-worker-2:
    image: bitnami/spark:3.1.2
    user: root
    networks:
        - default_net
    environment:
        - SPARK_MODE=worker
        - SPARK_MASTER_URL=spark://spark:7077
        - SPARK_WORKER_MEMORY=1G
        - SPARK_WORKER_CORES=1
        - SPARK_RPC_AUTHENTICATION_ENABLED=no
        - SPARK_RPC_ENCRYPTION_ENABLED=no
        - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
        - SPARK_SSL_ENABLED=no
    volumes:
        - ../spark/app:/usr/local/airflow/myslq # Spark scripts folder (Must be the same path in airflow and Spark Cluster)
        - ../spark/resources:/usr/local/airflow/myslq #Resources folder (Must be the same path in airflow and Spark Cluster)

spark-worker-3:
    image: bitnami/spark:3.1.2
    user: root
    networks:
        - default_net
    environment:
        - SPARK_MODE=worker
        - SPARK_MASTER_URL=spark://spark:7077
        - SPARK_WORKER_MEMORY=1G
        - SPARK_WORKER_CORES=1
        - SPARK_RPC_AUTHENTICATION_ENABLED=no
        - SPARK_RPC_ENCRYPTION_ENABLED=no
        - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
        - SPARK_SSL_ENABLED=no
    volumes:
        - ../spark/app:/usr/local/airflow/myslq # Spark scripts folder (Must be the same path in airflow and Spark Cluster)
        - ../spark/resources:/usr/local/airflow/myslq #Resources folder (Must be the same path in airflow and Spark Cluster)

#Jupyter notebook
jupyter-spark:
    image: jupyter/pyspark-notebook:spark-3.1.2
    networks:
        - default_net
    ports:
      - "8888:8888"
      - "4040-4080:4040-4080"
    volumes:
      - ../spark/app:/usr/local/airflow/myslq # Spark scripts folder (Must be the same path in airflow and Spark Cluster)
      - ../spark/resources:/usr/local/airflow/myslq #Resources folder (Must be the same path in airflow and Spark Cluster)

networks:
    default_net:

  • Try to use user `1001` instead of `root` in your spark-workers. [reference](https://github.com/bitnami/bitnami-docker-spark/blob/master/3/debian-10/Dockerfile#L31) – SaleemKhair Oct 02 '21 at 01:16
  • Thank you for your help. Unfortunately this solution did not work. Keeps giving the same error – Maycon Batestin Oct 03 '21 at 14:13

0 Answers