I am trying to load data into a DataFrame using PySpark. The files are in Parquet format. I am using the following code:
from pyspark import SparkContext
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.functions import lit
import pandas as pd
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext
sqlContext = SQLContext(sc)

# one partition path per day in December 2019
daterange = pd.date_range('2019-12-01', '2019-12-31')

# start from an empty DataFrame and union each day's data into it
df = sqlContext.createDataFrame(sc.emptyRDD())
for process_date in daterange:
    try:
        name = 's3://location/process_date={}/'.format(process_date.strftime("%Y-%m-%d"))
        print(name)
        x = spark.read.parquet(name)
        x = x.withColumn('process_date', lit(process_date.strftime("%Y-%m-%d")))
        x.show()
        df = df.union(x)
    except:
        print("File doesn't exist for " + process_date.strftime("%Y-%m-%d"))
But when I run this code, the resulting df is empty, and even though data exists for some of the dates, the exception message is printed for every date in the range. Can anyone point out what I am doing wrong?
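In case it narrows things down, here is a sketch of the same loop with the bare except: replaced by one that prints the actual error (it reuses the spark, df, daterange, and path pattern from above), so it should be visible whether the read or the df.union(x) call is what fails:

for process_date in daterange:
    name = 's3://location/process_date={}/'.format(process_date.strftime("%Y-%m-%d"))
    try:
        x = spark.read.parquet(name)
        x = x.withColumn('process_date', lit(process_date.strftime("%Y-%m-%d")))
        df = df.union(x)
    except Exception as e:
        # print the real exception instead of assuming a missing file;
        # a column/schema mismatch raised by df.union(x) would also land here
        print("Failed for {}: {}".format(process_date.strftime("%Y-%m-%d"), e))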