
I am trying to apply a sum function to each cell of a column of a DataFrame in Spark. Each cell contains a list of integers which I would like to add up. However, the error I am getting is:

<console>:357: error: value sum is not a member of org.apache.spark.sql.ColumnName

for the example script below.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

val spark = SparkSession.builder().getOrCreate()  

val df = spark.createDataFrame(Seq(
  (0, List(1,2,3)),
  (1, List(2,2,3)),
  (2, List(3,2,3)))).toDF("Id", "col_1")

val test = df.withColumn( "col_2", $"col_1".sum )

test.show()

1 Answer


The sum in org.apache.spark.sql.functions is an aggregate that sums a column across rows; neither it nor Column offers a method for summing the elements of an array within a single row, which is why $"col_1".sum fails to compile. You can define a UDF instead:

scala> def sumFunc(a: Seq[Int]): Int = a.sum
sumFunc: (a: Seq[Int])Int

scala> val sumUdf = udf(sumFunc(_: Seq[Int]))
sumUdf: org.apache.spark.sql.expressions.UserDefinedFunction = UserDefinedFunction(<function1>,IntegerType,Some(List(ArrayType(IntegerType,false))))

scala> val test = df.withColumn( "col_2", sumUdf($"col_1") )
test: org.apache.spark.sql.DataFrame = [Id: int, col_1: array<int> ... 1 more field]

scala> test.collect
res0: Array[org.apache.spark.sql.Row] = Array([0,WrappedArray(1, 2, 3),6], [1,WrappedArray(2, 2, 3),7], [2,WrappedArray(3, 2, 3),8])
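Alternatively, if you are on Spark 2.4 or later, you can avoid the UDF entirely with the built-in aggregate higher-order function in SQL expressions. A minimal sketch, assuming the same df as in the question (the name test2 is just illustrative):

import org.apache.spark.sql.functions.expr

// aggregate(array, start, merge) folds the array per row:
// start at 0 and add each element x to the accumulator acc.
val test2 = df.withColumn("col_2", expr("aggregate(col_1, 0, (acc, x) -> acc + x)"))

test2.show()

Built-in functions like this are generally preferable to UDFs where available, since Catalyst can optimize them and they avoid serialization overhead.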