You can use last() with ignorenulls=True over a Window ordered by Month to fill the null gaps (one forward pass, plus a reversed pass for leading nulls):
import datetime  # was missing in the original snippet: datetime.date is used below

from pyspark.sql import Row, Window, functions as F

df = sql_context.createDataFrame([
    Row(Month=datetime.date(2021, 1, 1), Rating=None),
    Row(Month=datetime.date(2021, 2, 1), Rating=None),
    Row(Month=datetime.date(2021, 3, 1), Rating=0.6),
    Row(Month=datetime.date(2021, 4, 1), Rating=1.2),
    Row(Month=datetime.date(2021, 5, 1), Rating=1.),
    Row(Month=datetime.date(2021, 6, 1), Rating=None),
    Row(Month=datetime.date(2021, 7, 1), Rating=None),
])

(
    df
    # Forward fill: an ordered Window's default frame is
    # (unboundedPreceding, currentRow), so last(..., ignorenulls=True)
    # picks the most recent non-null Rating at or before this row.
    # coalesce keeps an existing non-null Rating untouched.
    .withColumn('Rating',
                F.coalesce(F.col('Rating'),
                           F.last('Rating', ignorenulls=True)
                            .over(Window.orderBy('Month'))))
    # Backward fill: only needed for the leading rows, which have no
    # earlier non-null value; scanning in descending Month order fills
    # them from the first non-null Rating that follows.
    .withColumn('Rating',
                F.coalesce(F.col('Rating'),
                           F.last('Rating', ignorenulls=True)
                            .over(Window.orderBy(F.desc('Month')))))
    .sort('Month')  # Only required for output formatting
    .show()
)
# Output
+----------+------+
| Month|Rating|
+----------+------+
|2021-01-01| 0.6|
|2021-02-01| 0.6|
|2021-03-01| 0.6|
|2021-04-01| 1.2|
|2021-05-01| 1.0|
|2021-06-01| 1.0|
|2021-07-01| 1.0|
+----------+------+