I think you can do it like this:
- Use the window function to get the
start_date
of each order_id
- Filter any rows which have a difference between the sales date and the start date by more than 12 months (This can be more precise per day)
- Group by
id
and sum the sales
Note: I have changed the input so it can filter the last record as out of range
from pyspark.sql.window import Window
from pyspark.sql.functions import first, col, months_between, lit, sum, max
df = spark.createDataFrame([
(1, 10, datetime.strptime("2020-04-30",'%Y-%m-%d')),\
(1, 6, datetime.strptime("2020-10-31",'%Y-%m-%d')),\
(1, 9, datetime.strptime("2020-09-30",'%Y-%m-%d')),\
(1, 2, datetime.strptime("2021-04-30",'%Y-%m-%d')),\
(2, 7, datetime.strptime("2020-07-31",'%Y-%m-%d')),\
(2, 8, datetime.strptime("2020-08-31",'%Y-%m-%d')),\
(2, 3, datetime.strptime("2021-06-30",'%Y-%m-%d')),\
(2, 2, datetime.strptime("2021-08-30",'%Y-%m-%d'))\
], ['id', 'sales', 'sales_date'])
win = Window.partitionBy('id').orderBy('sales_date')
df = df.withColumn('order_start_date', first(col('sales_date')).over(win)) \
.filter(months_between(col('sales_date'), col('order_start_date')) <= 12)
df.show()
+---+-----+-------------------+-------------------+
| id|sales| sales_date| order_start_date|
+---+-----+-------------------+-------------------+
| 1| 10|2020-04-30 00:00:00|2020-04-30 00:00:00|
| 1| 9|2020-09-30 00:00:00|2020-04-30 00:00:00|
| 1| 6|2020-10-31 00:00:00|2020-04-30 00:00:00|
| 1| 2|2021-04-30 00:00:00|2020-04-30 00:00:00|
| 2| 7|2020-07-31 00:00:00|2020-07-31 00:00:00|
| 2| 8|2020-08-31 00:00:00|2020-07-31 00:00:00|
| 2| 3|2021-06-30 00:00:00|2020-07-31 00:00:00|
+---+-----+-------------------+-------------------+
# 2021-08-30 was removed
df = df.groupBy('id').agg(\
sum('sales').alias('total_sales'), \
max('sales_date').alias('latest_sales_date')\
)
df.show()
+---+-----------+-------------------+
| id|total_sales| latest_sales_date|
+---+-----------+-------------------+
| 1| 27|2021-04-30 00:00:00|
| 2| 18|2021-06-30 00:00:00|
+---+-----------+-------------------+