Here is a small trick for building reusable column expressions. It's neat because it avoids UDFs entirely, and the functional interface keeps the code composable and readable.
from typing import Tuple

from pyspark.sql import Column
from pyspark.sql.functions import size, split
from pyspark.sql.session import SparkSession
from pyspark.sql.types import StringType, StructField, StructType

spark = SparkSession.builder.getOrCreate()
data = [
    ('filename', 's3:/hello/no.csv'),
    ('filename', 's3:/hello/why.csv'),
]
schema = StructType([
    StructField('name', StringType(), True),
    StructField('path', StringType(), True),
])
df = spark.createDataFrame(data, schema=schema)
def expression_last_item_of_array(split_column: str, split_delimiter: str) -> Tuple[Column, Column]:
    """
    Given a column name and a delimiter, return a pair of expressions:
    one selecting the last item of the split array, and one giving
    the array's size.

    Args:
        split_column: str
        split_delimiter: str

    Returns:
        Tuple[pyspark.sql.Column, pyspark.sql.Column]
    """
    # Note: split() treats the delimiter as a regular expression.
    expression = split(split_column, split_delimiter)
    n = size(expression)
    # getItem accepts a Column, so the index can be computed per row.
    last = expression.getItem(n - 1)
    return last, n
last, n = expression_last_item_of_array('path', '/')
df.show()
df.select(last.alias('last_element'), n.alias('n_items')).show()
df.select(last.alias('last_element')).show()
Output:
+--------+-----------------+
| name| path|
+--------+-----------------+
|filename| s3:/hello/no.csv|
|filename|s3:/hello/why.csv|
+--------+-----------------+
+------------+-------+
|last_element|n_items|
+------------+-------+
| no.csv| 3|
| why.csv| 3|
+------------+-------+
+------------+
|last_element|
+------------+
| no.csv|
| why.csv|
+------------+
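If you only need the last element and are on Spark 2.4 or later, here is a minimal alternative sketch: element_at accepts a negative index, so it can pick the last array item directly without computing the array size (this reuses the df defined above).

from pyspark.sql.functions import element_at, split

# element_at uses 1-based indexing; -1 selects the last
# element of the array (available since Spark 2.4).
df.select(
    element_at(split('path', '/'), -1).alias('last_element')
).show()

This produces the same last_element column as the third table above, with one fewer intermediate expression; the size-based version is still handy when you also want n_items.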