-1

Please see this example; I am trying to achieve this using Spark SQL / Spark Scala, but have not found any direct solution. Please let me know if it is not possible with Spark SQL / Spark Scala; in that case I can write a Java/Python program that works from a file exported from the as-is data.

[image omitted: example of the desired as-is → to-be output]

thebluephantom
  • 16,458
  • 8
  • 40
  • 83
Don Sam
  • 525
  • 5
  • 20

2 Answers

0
  1. github: https://github.com/mvasyliv/LearningSpark/blob/master/src/main/scala/spark/GroupListValueToColumn.scala

    1. source code { package spark

    import org.apache.spark.sql.SparkSession
    import org.apache.spark.sql.functions._

    /** Groups the address types of each customer into a single column:
      * once with `collect_list` (keeps duplicates) and once with
      * `collect_set` (removes duplicates).
      */
    object GroupListValueToColumn {

      // Input record: one address type per customer row.
      // NOTE: defined at object scope, not inside main — case classes declared
      // in a local scope are a known pitfall for Spark's implicit encoders.
      final case class Customer(cust_id: Int, addresstype: String)

      // Explicit main instead of `extends App` to avoid the App trait's
      // delayed-initialization pitfalls.
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder()
          .master("local")
          .appName("Mapper")
          .getOrCreate()

        try {
          import spark.implicits._

          val source = Seq(
            Customer(300312008, "credit_card"),
            Customer(300312008, "to"),
            Customer(300312008, "from"),
            Customer(300312009, "to"),
            Customer(300312009, "from"),
            Customer(300312010, "to"),
            Customer(300312010, "credit_card"),
            Customer(300312010, "from")
          ).toDF()

          // collect_list keeps duplicates within each group.
          // +---------+-------------------------+
          // |cust_id  |collect_list(addresstype)|
          // +---------+-------------------------+
          // |300312010|[to, credit_card, from]  |
          // |300312008|[credit_card, to, from]  |
          // |300312009|[to, from]               |
          // +---------+-------------------------+
          val res = source.groupBy("cust_id").agg(collect_list("addresstype"))
          res.show(false)

          // collect_set removes duplicates; element order is unspecified.
          // +---------+------------------------+
          // |cust_id  |collect_set(addresstype)|
          // +---------+------------------------+
          // |300312010|[from, to, credit_card] |
          // |300312008|[from, to, credit_card] |
          // |300312009|[from, to]              |
          // +---------+------------------------+
          val res1 = source.groupBy("cust_id").agg(collect_set("addresstype"))
          res1.show(false)
        } finally {
          // Release the local Spark context even if an action above fails.
          spark.stop()
        }
      }
    }

mvasyliv
  • 1,214
  • 6
  • 10
0

Since answers are being given as opposed to good googling:

import org.apache.spark.sql.functions._
import spark.implicits._

// Sample key/value pairs: two rows for key 1, one row for key 2.
val pairs = Seq(
  (1, "a"),
  (1, "c"),
  (2, "e")
).toDF("k", "v")

// Collapse each key's values into a single array column.
val grouped = pairs.groupBy("k").agg(collect_list("v"))
grouped.show
thebluephantom
  • 16,458
  • 8
  • 40
  • 83