0

A bit new to pyspark programming:

Having a dataframe df in my databricks notebook:

|-- result: struct (nullable = true)
 |    |-- approver: struct (nullable = true)
 |    |    |-- display_value: string (nullable = true)
 |    |    |-- link: string (nullable = true)
 |    |-- comments: string (nullable = true)
 |    |-- document_id: struct (nullable = true)
 |    |    |-- display_value: string (nullable = true)
 |    |    |-- link: string (nullable = true)
 |    |-- group.assignment_group: struct (nullable = true)
 |    |    |-- display_value: string (nullable = true)
 |    |    |-- link: string (nullable = true)
 |    |-- source_table: string (nullable = true)
 |    |-- state: string (nullable = true)
 |    |-- sys_created_on: string (nullable = true)
 |    |-- sys_updated_on: string (nullable = true)

How to rename group.assignment_group to group_assignment_group because the dot is preventing the code to recognize the column when i try to flatten the file?

I am stuck on the following code and not sure how to expand the str_schema to account for the schema i have?

str_schema = "result<struct<approver:struct<display_value:string,link:string>>"

df.select(col("array_field").cast(str_schema)).printSchema()
Piccinin1992
  • 237
  • 3
  • 12

2 Answers2

0

Renaming the nested column which has "." in it:

from pyspark.sql.functions import struct

df \
.select("result.*") \
.withColumnRenamed(f"group.assignment_group", "group_assignment_group") \
.withColumn("result", struct("group_assignment_group")) \
.drop("group_assignment_group") \
.printSchema()

Output:

root
 |-- result: struct (nullable = false)
 |    |-- group_assignment_group: struct (nullable = true)
 |    |    |-- display: string (nullable = true)
 |    |    |-- value: string (nullable = true)
arudsekaberne
  • 830
  • 4
  • 11
0

You need to flatten it out, rename it and then re-nest it if needed. Here's an example of how you can do it:

# ...your code here

df.printSchema()


# Flatten the DataFrame with renamed column
df_flat = df.selectExpr("result.approver",
                        "result.comments",
                        "result.document_id",
                        "result.`group.assignment_group` as group_assignment_group",
                        "result.source_table",
                        "result.state",
                        "result.sys_created_on",
                        "result.sys_updated_on")

df_flat.printSchema()

# Create the nested DataFrame
df_nested = df_flat.select(
    F.struct(
        F.col("approver"),
        F.col("comments"),
        F.col("document_id"),
        F.col("group_assignment_group"),
        F.col("source_table"),
        F.col("state"),
        F.col("sys_created_on"),
        F.col("sys_updated_on")
    ).alias("result")
)

df_nested.printSchema()

Result:

root
 |-- result: struct (nullable = true)
 |    |-- approver: struct (nullable = true)
 |    |    |-- display_value: string (nullable = true)
 |    |    |-- link: string (nullable = true)
 |    |-- comments: string (nullable = true)
 |    |-- document_id: struct (nullable = true)
 |    |    |-- display_value: string (nullable = true)
 |    |    |-- link: string (nullable = true)
 |    |-- group.assignment_group: struct (nullable = true)
 |    |    |-- display_value: string (nullable = true)
 |    |    |-- link: string (nullable = true)
 |    |-- source_table: string (nullable = true)
 |    |-- state: string (nullable = true)
 |    |-- sys_created_on: string (nullable = true)
 |    |-- sys_updated_on: string (nullable = true)

root
 |-- approver: struct (nullable = true)
 |    |-- display_value: string (nullable = true)
 |    |-- link: string (nullable = true)
 |-- comments: string (nullable = true)
 |-- document_id: struct (nullable = true)
 |    |-- display_value: string (nullable = true)
 |    |-- link: string (nullable = true)
 |-- group_assignment_group: struct (nullable = true)
 |    |-- display_value: string (nullable = true)
 |    |-- link: string (nullable = true)
 |-- source_table: string (nullable = true)
 |-- state: string (nullable = true)
 |-- sys_created_on: string (nullable = true)
 |-- sys_updated_on: string (nullable = true)

root
 |-- result: struct (nullable = false)
 |    |-- approver: struct (nullable = true)
 |    |    |-- display_value: string (nullable = true)
 |    |    |-- link: string (nullable = true)
 |    |-- comments: string (nullable = true)
 |    |-- document_id: struct (nullable = true)
 |    |    |-- display_value: string (nullable = true)
 |    |    |-- link: string (nullable = true)
 |    |-- group_assignment_group: struct (nullable = true)
 |    |    |-- display_value: string (nullable = true)
 |    |    |-- link: string (nullable = true)
 |    |-- source_table: string (nullable = true)
 |    |-- state: string (nullable = true)
 |    |-- sys_created_on: string (nullable = true)
 |    |-- sys_updated_on: string (nullable = true)
Taras Drapalyuk
  • 473
  • 3
  • 6