I'm new to Spark/PySpark and I'm trying to use it for Decision Trees. I got a full Decision Tree example working based on a tutorial, but after a reboot (and a minor OS X update), it no longer works.
I've narrowed the issue down to a minimal example:
from pyspark.sql import SparkSession
print(SparkSession.builder.appName('test'))
spark = SparkSession.builder.appName('test').getOrCreate()
df = spark.read.csv('data/test.csv', header=True)
At first it couldn't find Spark at all, but after some work on my .bash_profile, I think that issue is resolved (I've included an environment check after the traceback below). The first two lines run without errors and produce the following output:
<pyspark.sql.session.SparkSession.Builder object at 0x108a89470>
But the third line now throws a NotADirectoryError:
---------------------------------------------------------------------------
NotADirectoryError Traceback (most recent call last)
<ipython-input-9-db2674edf6f2> in <module>
1 from pyspark.sql import SparkSession
2 print(SparkSession.builder.appName('test'))
----> 3 spark = SparkSession.builder.appName('test').getOrCreate()
4 df = spark.read.csv('data/test.csv', header=True)
/anaconda3/envs/decisiontrees/lib/python3.5/site-packages/pyspark/sql/session.py in getOrCreate(self)
171 for key, value in self._options.items():
172 sparkConf.set(key, value)
--> 173 sc = SparkContext.getOrCreate(sparkConf)
174 # This SparkContext may be an existing one.
175 for key, value in self._options.items():
/anaconda3/envs/decisiontrees/lib/python3.5/site-packages/pyspark/context.py in getOrCreate(cls, conf)
365 with SparkContext._lock:
366 if SparkContext._active_spark_context is None:
--> 367 SparkContext(conf=conf or SparkConf())
368 return SparkContext._active_spark_context
369
/anaconda3/envs/decisiontrees/lib/python3.5/site-packages/pyspark/context.py in __init__(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf, gateway, jsc, profiler_cls)
131 " note this option will be removed in Spark 3.0")
132
--> 133 SparkContext._ensure_initialized(self, gateway=gateway, conf=conf)
134 try:
135 self._do_init(master, appName, sparkHome, pyFiles, environment, batchSize, serializer,
/anaconda3/envs/decisiontrees/lib/python3.5/site-packages/pyspark/context.py in _ensure_initialized(cls, instance, gateway, conf)
314 with SparkContext._lock:
315 if not SparkContext._gateway:
--> 316 SparkContext._gateway = gateway or launch_gateway(conf)
317 SparkContext._jvm = SparkContext._gateway.jvm
318
/anaconda3/envs/decisiontrees/lib/python3.5/site-packages/pyspark/java_gateway.py in launch_gateway(conf)
44 :return: a JVM gateway
45 """
---> 46 return _launch_gateway(conf)
47
48
/anaconda3/envs/decisiontrees/lib/python3.5/site-packages/pyspark/java_gateway.py in _launch_gateway(conf, insecure)
96 def preexec_func():
97 signal.signal(signal.SIGINT, signal.SIG_IGN)
---> 98 proc = Popen(command, stdin=PIPE, preexec_fn=preexec_func, env=env)
99 else:
100 # preexec_fn not supported on Windows
/anaconda3/envs/decisiontrees/lib/python3.5/subprocess.py in __init__(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds)
674 c2pread, c2pwrite,
675 errread, errwrite,
--> 676 restore_signals, start_new_session)
677 except:
678 # Cleanup if the child failed starting.
/anaconda3/envs/decisiontrees/lib/python3.5/subprocess.py in _execute_child(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, restore_signals, start_new_session)
1287 else:
1288 err_msg += ': ' + repr(orig_executable)
-> 1289 raise child_exception_type(errno_num, err_msg)
1290 raise child_exception_type(err_msg)
1291
NotADirectoryError: [Errno 20] Not a directory
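From the traceback, the failure happens inside Popen while PySpark launches its JVM gateway, i.e. before my CSV is ever touched. As far as I understand, that launch executes a script under SPARK_HOME, so the variables I set in .bash_profile seem like the obvious suspects. A sanity check along these lines (just a sketch; SPARK_HOME and JAVA_HOME are the variables I assume matter here) is:
import os

# Both of these should point at real directories; a stale path after the
# OS update would be consistent with "[Errno 20] Not a directory".
for var in ('SPARK_HOME', 'JAVA_HOME'):
    value = os.environ.get(var)
    print(var, '=', value, '| isdir:', os.path.isdir(value) if value else None)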
The documentation doesn't have any information on why getOrCreate() might throw an error.
The minimal example is equivalent to this answer. I'm completely stumped as to why this isn't working.
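If it helps to narrow things down, the traceback suggests the same error should be reproducible without SparkSession at all, just by launching the gateway directly (launch_gateway is an internal PySpark function, so this is only a diagnostic sketch, not something I'd use in real code):
from pyspark.java_gateway import launch_gateway

# This is the call that fails in my traceback; if it raises the same
# NotADirectoryError on its own, the problem is in my environment, not my code.
gateway = launch_gateway(conf=None)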
Edit (2 weeks later): For any poor souls running into the same issue: I abandoned PySpark and am now using RPath instead. If I come back to this and figure it out, I will update.