The simplest way I found to do this is:
// Two-column frame in which both columns are named "a".
val data = Seq((1, 2), (3, 4)).toDF("a", "a")

// toDF takes the complete list of new names in column order,
// so the second "a" is renamed without ever being referenced by name.
val deduped = data.toDF("a", "a_2")

deduped.show()
+---+---+
|  a|a_2|
+---+---+
|  1|  2|
|  3|  4|
+---+---+
For a more general solution:
// Sample frame with repeated names: "a" occurs twice and "b" three times.
val data = Seq(
  (1, 2, 3, 4, 5, 6, 7, 8),
  (9, 0, 1, 2, 3, 4, 5, 6)
).toDF("a", "b", "c", "a", "d", "b", "e", "b")

data.show()
+---+---+---+---+---+---+---+---+
|  a|  b|  c|  a|  d|  b|  e|  b|
+---+---+---+---+---+---+---+---+
|  1|  2|  3|  4|  5|  6|  7|  8|
|  9|  0|  1|  2|  3|  4|  5|  6|
+---+---+---+---+---+---+---+---+
import scala.annotation.tailrec
/**
 * Returns `df` with its columns renamed so that every column name is unique.
 *
 * Columns are processed left to right. The first occurrence of a name is kept
 * unchanged; the n-th occurrence (n >= 2) becomes `<name>_<n>`, e.g.
 * `a, b, a, b, b` becomes `a, b, a_2, b_2, b_3`.
 *
 * If a generated name is already used by another column (for example the
 * input `a, a, a_2`), the numeric suffix is increased until an unused name is
 * found, so the result never contains duplicates.
 *
 * Names are compared with exact string equality.
 * NOTE(review): Spark may resolve column names case-insensitively depending on
 * configuration -- confirm whether names differing only in case should also
 * be treated as duplicates.
 *
 * @param df the DataFrame whose column names may contain duplicates
 * @return a DataFrame with the same columns in the same order, renamed by position
 */
def dedupeColumnNames(df: DataFrame): DataFrame = {
  // Finds the first "<name>_<n>", counting n upwards, that is not in `taken`.
  @tailrec
  def firstFree(name: String, n: Int, taken: Set[String]): String = {
    val candidate = s"${name}_$n"
    if (taken.contains(candidate)) firstFree(name, n + 1, taken) else candidate
  }

  val columns = df.columns

  // Accumulator: (names emitted so far, occurrences seen per original name, names in use).
  // `taken` starts with every original name so that a generated name can never
  // clash with a column that appears later and keeps its original name.
  val initial = (Vector.empty[String], Map.empty[String, Int], columns.toSet)

  val (renamed, _, _) = columns.foldLeft(initial) {
    case ((emitted, seen, taken), name) =>
      val occurrence = seen.getOrElse(name, 0) + 1
      val unique =
        if (occurrence == 1) name
        else firstFree(name, occurrence, taken)
      (emitted :+ unique, seen.updated(name, occurrence), taken + unique)
  }

  // toDF assigns the new names by position, so the ambiguous originals are never looked up.
  df.toDF(renamed: _*)
}
// Run the renaming through Dataset.transform and print the resulting frame.
data.transform(dedupeColumnNames).show()
+---+---+---+---+---+---+---+---+
|  a|  b|  c|a_2|  d|b_2|  e|b_3|
+---+---+---+---+---+---+---+---+
|  1|  2|  3|  4|  5|  6|  7|  8|
|  9|  0|  1|  2|  3|  4|  5|  6|
+---+---+---+---+---+---+---+---+