I built a very simple ETL pipeline: it reads CSV files from a directory called dataset, processes them with PySpark, and loads them into a MySQL database. Everything is orchestrated by Airflow, which runs in Docker.
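For context, the PySpark step boils down to roughly the following (a simplified sketch; the real transformations, table name, and credentials are different, and the MySQL JDBC driver has to be on the Spark classpath):

from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("csv_to_mysql")
    .getOrCreate()
)

# Read every CSV in the dataset directory
df = spark.read.csv("dataset/", header=True, inferSchema=True)

# (transformations happen here)

# Write the result to MySQL over JDBC; URL, table, and credentials are placeholders
(
    df.write.format("jdbc")
    .option("url", "jdbc:mysql://mysql:3306/etl")
    .option("dbtable", "staging_table")
    .option("user", "root")
    .option("password", "")
    .mode("append")
    .save()
)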
The problem is that when I trigger my DAG, it fails with the error "JAVA_HOME is not set".
Does anyone know how I can set JAVA_HOME inside the Docker containers?
Below is my docker-compose.yml:
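My guess is that I need to add something like the snippet below to the worker service in the compose file, but I don't know whether this is the right approach, and the JDK path is a pure guess (I haven't confirmed where, or whether, Java is installed in the image):

    worker:
        environment:
            # Guessed JDK location inside neylsoncrepalde/airflow-docker; not verified
            - JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64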
version: '2.2'
services:
    redis:
        image: 'redis:5.0.5'
        # command: redis-server --requirepass redispass

    postgres:
        image: postgres:9.6
        environment:
            - POSTGRES_USER=airflow
            - POSTGRES_PASSWORD=airflow
            - POSTGRES_DB=airflow
        ports:
            - "5432:5432"
        # Uncomment these lines to persist data on the local filesystem.
        # - PGDATA=/var/lib/postgresql/data/pgdata
        # volumes:
        #     - ./pgdata:/var/lib/postgresql/data/pgdata

    webserver:
        image: neylsoncrepalde/airflow-docker:2.0.0-pymongo
        restart: always
        depends_on:
            - postgres
            - redis
        environment:
            - LOAD_EX=n
            - FERNET_KEY=46BKJoQYlPPOexq0OhDZnIlNepKFf87WFwLbfzqDDho=
            - EXECUTOR=Celery
            # - POSTGRES_USER=airflow
            # - POSTGRES_PASSWORD=airflow
            # - POSTGRES_DB=airflow
            # - REDIS_PASSWORD=redispass
        volumes:
            - ./dags:/usr/local/airflow/dags
            - ./data:/usr/local/airflow/data
            - ../../MYSQL:/usr/local/airflow/myslq
            # Uncomment to include custom plugins
            # - ./plugins:/usr/local/airflow/plugins
        ports:
            - "8080:8080"
        command: webserver
        healthcheck:
            test: ["CMD-SHELL", "[ -f /usr/local/airflow/airflow-webserver.pid ]"]
            interval: 30s
            timeout: 30s
            retries: 3

    flower:
        image: neylsoncrepalde/airflow-docker:2.0.0-pymongo
        restart: always
        depends_on:
            - redis
        environment:
            - EXECUTOR=Celery
            # - REDIS_PASSWORD=redispass
        ports:
            - "5555:5555"
        command: celery flower

    scheduler:
        image: neylsoncrepalde/airflow-docker:2.0.0-pymongo
        restart: always
        depends_on:
            - webserver
        volumes:
            - ./dags:/usr/local/airflow/dags
            - ./data:/usr/local/airflow/data
            - ../../MYSQL:/usr/local/airflow/myslq
            # Uncomment to include custom plugins
            # - ./plugins:/usr/local/airflow/plugins
        environment:
            - LOAD_EX=n
            - FERNET_KEY=46BKJoQYlPPOexq0OhDZnIlNepKFf87WFwLbfzqDDho=
            - EXECUTOR=Celery
            # - POSTGRES_USER=airflow
            # - POSTGRES_PASSWORD=airflow
            # - POSTGRES_DB=airflow
            # - REDIS_PASSWORD=redispass
        command: scheduler

    worker:
        image: neylsoncrepalde/airflow-docker:2.0.0-pymongo
        restart: always
        depends_on:
            - scheduler
        volumes:
            - ./dags:/usr/local/airflow/dags
            - ./data:/usr/local/airflow/data
            - ../../MYSQL:/usr/local/airflow/myslq
            # Uncomment to include custom plugins
            # - ./plugins:/usr/local/airflow/plugins
        environment:
            - FERNET_KEY=46BKJoQYlPPOexq0OhDZnIlNepKFf87WFwLbfzqDDho=
            - EXECUTOR=Celery
            - SPARK=spark://spark:7077
            # - POSTGRES_USER=airflow
            # - POSTGRES_PASSWORD=airflow
            # - POSTGRES_DB=airflow
            # - REDIS_PASSWORD=redispass
        command: celery worker

    mysql:
        image: mysql
        environment:
            - MYSQL_ALLOW_EMPTY_PASSWORD=1
        restart: always

    spark:
        image: bitnami/spark:3.1.2
        user: root  # Run container as root: https://docs.bitnami.com/tutorials/work-with-non-root-containers/
        hostname: spark
        networks:
            - default_net
        environment:
            - SPARK_MODE=master
            - SPARK_RPC_AUTHENTICATION_ENABLED=no
            - SPARK_RPC_ENCRYPTION_ENABLED=no
            - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
            - SPARK_SSL_ENABLED=no
        volumes:
            - ../spark/app:/usr/local/airflow/myslq        # Spark scripts folder (must be the same path in Airflow and the Spark cluster)
            - ../spark/resources:/usr/local/airflow/myslq  # Resources folder (must be the same path in Airflow and the Spark cluster)
        ports:
            - "8181:8080"
            - "7077:7077"

    spark-worker-1:
        image: bitnami/spark:3.1.2
        user: root
        networks:
            - default_net
        environment:
            - SPARK_MODE=worker
            - SPARK_MASTER_URL=spark://spark:7077
            - SPARK_WORKER_MEMORY=1G
            - SPARK_WORKER_CORES=1
            - SPARK_RPC_AUTHENTICATION_ENABLED=no
            - SPARK_RPC_ENCRYPTION_ENABLED=no
            - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
            - SPARK_SSL_ENABLED=no
        volumes:
            - ../spark/app:/usr/local/airflow/myslq        # Spark scripts folder (must be the same path in Airflow and the Spark cluster)
            - ../spark/resources:/usr/local/airflow/myslq  # Resources folder (must be the same path in Airflow and the Spark cluster)

    spark-worker-2:
        image: bitnami/spark:3.1.2
        user: root
        networks:
            - default_net
        environment:
            - SPARK_MODE=worker
            - SPARK_MASTER_URL=spark://spark:7077
            - SPARK_WORKER_MEMORY=1G
            - SPARK_WORKER_CORES=1
            - SPARK_RPC_AUTHENTICATION_ENABLED=no
            - SPARK_RPC_ENCRYPTION_ENABLED=no
            - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
            - SPARK_SSL_ENABLED=no
        volumes:
            - ../spark/app:/usr/local/airflow/myslq        # Spark scripts folder (must be the same path in Airflow and the Spark cluster)
            - ../spark/resources:/usr/local/airflow/myslq  # Resources folder (must be the same path in Airflow and the Spark cluster)

    spark-worker-3:
        image: bitnami/spark:3.1.2
        user: root
        networks:
            - default_net
        environment:
            - SPARK_MODE=worker
            - SPARK_MASTER_URL=spark://spark:7077
            - SPARK_WORKER_MEMORY=1G
            - SPARK_WORKER_CORES=1
            - SPARK_RPC_AUTHENTICATION_ENABLED=no
            - SPARK_RPC_ENCRYPTION_ENABLED=no
            - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
            - SPARK_SSL_ENABLED=no
        volumes:
            - ../spark/app:/usr/local/airflow/myslq        # Spark scripts folder (must be the same path in Airflow and the Spark cluster)
            - ../spark/resources:/usr/local/airflow/myslq  # Resources folder (must be the same path in Airflow and the Spark cluster)

    # Jupyter notebook
    jupyter-spark:
        image: jupyter/pyspark-notebook:spark-3.1.2
        networks:
            - default_net
        ports:
            - "8888:8888"
            - "4040-4080:4040-4080"
        volumes:
            - ../spark/app:/usr/local/airflow/myslq        # Spark scripts folder (must be the same path in Airflow and the Spark cluster)
            - ../spark/resources:/usr/local/airflow/myslq  # Resources folder (must be the same path in Airflow and the Spark cluster)

networks:
    default_net: