I had the same issue yesterday: I wanted to write Spark jobs in separate files and run them against a single (singleton) SparkSession, so I did this:
main.py
from pyspark.sql import SparkSession
from job1 import spark_job
import os

if __name__ == "__main__":
    print("trying to start spark")
    spark = SparkSession.builder.master('local[*]').appName("PythonPi").getOrCreate()
    sc = spark.sparkContext
    # to be able to use paths.py in the job files
    sc.addPyFile(os.getcwd() + '/spark_jobs/paths.py')
    spark_job(spark, sc)
    spark.stop()
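A side note on addPyFile: it also accepts a .zip, so once spark_jobs/ holds more than a couple of helpers you can ship the whole folder in one call instead of one file at a time. A minimal sketch (the archive name and layout are just an example, assuming main.py is launched from the project root):

import os
import shutil

# zip up the contents of spark_jobs/ and ship the archive to the executors
archive = shutil.make_archive("spark_jobs", "zip",
                              root_dir=os.path.join(os.getcwd(), "spark_jobs"))
sc.addPyFile(archive)  # addPyFile accepts .py and .zip dependencies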
paths.py
import os

def get_path(file=__file__):
    # resolve a file name against the directory the application was launched from
    return os.path.join(os.getcwd(), file)
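get_path simply joins a name onto the directory the application was started from (and os.path.join passes an already-absolute path through unchanged), so it behaves roughly like this; the paths below are only illustrative:

# e.g. when main.py was started from /home/me/project
get_path("spark_jobs/job1.py")  # -> /home/me/project/spark_jobs/job1.py
get_path("/opt/app/job1.py")    # -> /opt/app/job1.py (absolute paths pass through)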
job1.py
from paths import get_path

data = [('James', '', 'Smith', '1991-04-01', 'M', 3000),
        ('Michael', 'Rose', '', '2000-05-19', 'M', 4000),
        ('Robert', '', 'Williams', '1978-09-05', 'M', 4000),
        ('Maria', 'Anne', 'Jones', '1967-12-01', 'F', 4000),
        ('Jen', 'Mary', 'Brown', '1980-02-17', 'F', -1)]
columns = ["firstname", "middlename", "lastname", "dob", "gender", "salary"]

def spark_job(spark, sc):
    # ship this job file as well so it is importable on the executors
    sc.addPyFile(get_path(__file__))
    df = spark.createDataFrame(data=data, schema=columns)
    df.createOrReplaceTempView("PERSON_DATA")
    groupDF = spark.sql("SELECT gender, count(*) FROM PERSON_DATA GROUP BY gender")
    groupDF.show()
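For reference, the same aggregation can be written with the DataFrame API instead of a temp view; with the sample rows above either version should report a count of 3 for M and 2 for F. A sketch reusing the df built inside spark_job:

# equivalent to the SQL above: one row per gender with its row count
df.groupBy("gender").count().show()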
This way I was able to call Spark jobs from different files and to add .py files to the SparkContext.
Hope it helps someone.
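Adding more jobs is just a matter of repeating the pattern; job2.py and another_spark_job below are made-up names to show the shape:

# job2.py (hypothetical second job reusing the same singleton session)
def another_spark_job(spark, sc):
    spark.range(10).show()

# main.py then runs the jobs one after another against the same SparkSession:
#   from job2 import another_spark_job
#   spark_job(spark, sc)
#   another_spark_job(spark, sc)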