I built the jupyter/pyspark-notebook Docker image, installed geomesa_pyspark, and tried to run the following example commands from the official guide:
import geomesa_pyspark

conf = geomesa_pyspark.configure(
    jars=['/usr/local/spark/jars/geomesa-accumulo-spark-runtime_2.11-2.0.0.jar'],
    packages=['geomesa_pyspark', 'pytz'],
    spark_home='/usr/local/spark/').\
    setAppName('MyTestApp')

conf.get('spark.master')

from pyspark.sql import SparkSession

spark = (SparkSession
    .builder
    .config(conf=conf)
    .enableHiveSupport()
    .getOrCreate())
The same problem persists every time:
Exception Traceback (most recent call last)
<ipython-input-4-eca73e557583> in <module>
22 from pyspark.sql import SparkSession
23
---> 24 spark = ( SparkSession
25 .builder
26 .config(conf=conf)
/usr/local/spark/python/pyspark/sql/session.py in getOrCreate(self)
226 sparkConf.set(key, value)
227 # This SparkContext may be an existing one.
--> 228 sc = SparkContext.getOrCreate(sparkConf)
229 # Do not update `SparkConf` for existing `SparkContext`, as it's shared
230 # by all sessions.
/usr/local/spark/python/pyspark/context.py in getOrCreate(cls, conf)
382 with SparkContext._lock:
383 if SparkContext._active_spark_context is None:
--> 384 SparkContext(conf=conf or SparkConf())
385 return SparkContext._active_spark_context
386
/usr/local/spark/python/pyspark/context.py in __init__(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf, gateway, jsc, profiler_cls)
142 " is not allowed as it is a security risk.")
143
--> 144 SparkContext._ensure_initialized(self, gateway=gateway, conf=conf)
145 try:
146 self._do_init(master, appName, sparkHome, pyFiles, environment, batchSize, serializer,
/usr/local/spark/python/pyspark/context.py in _ensure_initialized(cls, instance, gateway, conf)
329 with SparkContext._lock:
330 if not SparkContext._gateway:
--> 331 SparkContext._gateway = gateway or launch_gateway(conf)
332 SparkContext._jvm = SparkContext._gateway.jvm
333
/usr/local/spark/python/pyspark/java_gateway.py in launch_gateway(conf, popen_kwargs)
106
107 if not os.path.isfile(conn_info_file):
--> 108 raise Exception("Java gateway process exited before sending its port number")
109
110 with open(conn_info_file, "rb") as info:
Exception: Java gateway process exited before sending its port number
In the container logs on Portainer, however, I see this exception:
Exception in thread "main" org.apache.spark.SparkException: When running with master 'yarn' either HADOOP_CONF_DIR or YARN_CONF_DIR must be set in the environment.
at org.apache.spark.deploy.SparkSubmitArguments.error(SparkSubmitArguments.scala:631)
at org.apache.spark.deploy.SparkSubmitArguments.validateSubmitArguments(SparkSubmitArguments.scala:271)
at org.apache.spark.deploy.SparkSubmitArguments.validateArguments(SparkSubmitArguments.scala:234)
at org.apache.spark.deploy.SparkSubmitArguments.<init>(SparkSubmitArguments.scala:119)
at org.apache.spark.deploy.SparkSubmit$$anon$2$$anon$3.<init>(SparkSubmit.scala:1013)
at org.apache.spark.deploy.SparkSubmit$$anon$2.parseArguments(SparkSubmit.scala:1013)
at org.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:85)
at org.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1030)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1039)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
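So spark-submit apparently ends up with master 'yarn', presumably inherited from spark-defaults.conf or an environment variable. If the master itself is the problem, I assume it could be overridden explicitly (setMaster is standard pyspark SparkConf API), though that alone would not explain the behavior described below:

import geomesa_pyspark
from pyspark.sql import SparkSession

conf = geomesa_pyspark.configure(
    jars=['/usr/local/spark/jars/geomesa-accumulo-spark-runtime_2.11-2.0.0.jar'],
    packages=['geomesa_pyspark', 'pytz'],
    spark_home='/usr/local/spark/').\
    setAppName('MyTestApp')

# Force local mode instead of whatever master (here apparently 'yarn')
# is inherited from spark-defaults.conf or the environment.
conf.setMaster('local[*]')

spark = SparkSession.builder.config(conf=conf).getOrCreate()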
I think JAVA_HOME is set correctly (JAVA_HOME="/usr/lib/jvm/java-1.11.0-openjdk-amd64"). From the log I'd guess that HADOOP_CONF_DIR (or YARN_CONF_DIR) also needs to be set, but although the Docker image specifications list an installed Hadoop version, I can't find Hadoop anywhere inside the running container. One odd thing: if I run the same commands from the pyspark shell instead of from the notebook, everything works without errors.
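To rule out environment differences between the two, this is how I'd compare what the notebook kernel sees against the pyspark shell (os.environ is just the Python standard library, nothing Spark-specific):

import os

# Print the variables relevant to launching the JVM gateway, as seen
# by this kernel; run the same loop in the pyspark shell and compare.
for var in ('JAVA_HOME', 'SPARK_HOME', 'HADOOP_CONF_DIR',
            'YARN_CONF_DIR', 'PYSPARK_SUBMIT_ARGS'):
    print(var, '=', os.environ.get(var))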