0

My DataFrame, df, has columns that each hold 2-dimensional (x, y) data. Combining these columns into the 'features' column with VectorAssembler flattens all of the pairs. Is there a way to keep the columns in their original form, i.e. as [[x1,y1],[x2,y2],[x3,y3]], instead of what I am getting, [x1,y1,x2,y2,x3,y3]?

import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.feature.VectorAssembler

// Names of the point columns; each column holds one (x, y) pair per row.
val pointCols = Array("f1", "f2", "f3")

// Two sample rows, three 2-element sequences of doubles per row.
val df = Seq(
  (Seq(1.0, 2.0), Seq(3.0, 4.0), Seq(5.0, 6.0)),
  (Seq(7.0, 8.0), Seq(9.0, 10.0), Seq(11.0, 12.0))
).toDF("f1", "f2", "f3")

// Ref https://stackoverflow.com/a/41091839/4106464
// UDF that turns a Seq[Double] into a dense ml Vector.
val seqAsVector = udf { (xs: Seq[Double]) => Vectors.dense(xs.toArray) }

// Same columns as df, with every sequence replaced by its vector form (names unchanged).
val df_final = df.select(pointCols.map(name => seqAsVector(col(name)).as(name)): _*)

// Assemble the three vector columns into a single "features" column.
val assembler = new VectorAssembler().setInputCols(pointCols).setOutputCol("features")
val df_out = assembler.transform(df_final)

// Print the raw input, then the assembled result without truncating wide cells.
df.show()
df_out.show(false)

// df
//+----------+-----------+------------+
//|        f1|         f2|          f3|
//+----------+-----------+------------+
//|[1.0, 2.0]| [3.0, 4.0]|  [5.0, 6.0]|
//|[7.0, 8.0]|[9.0, 10.0]|[11.0, 12.0]|
//+----------+-----------+------------+

// df_out with VectorAssembler
//+---------+----------+-----------+----------------------------+
//|f1       |f2        |f3         |features                    |
//+---------+----------+-----------+----------------------------+
//|[1.0,2.0]|[3.0,4.0] |[5.0,6.0]  |[1.0,2.0,3.0,4.0,5.0,6.0]   |
//|[7.0,8.0]|[9.0,10.0]|[11.0,12.0]|[7.0,8.0,9.0,10.0,11.0,12.0]|
//+---------+----------+-----------+----------------------------+

//Desired features column:

//+---------+----------+-----------+----------------------------------+
//|f1       |f2        |f3         |features                          |
//+---------+----------+-----------+----------------------------------+
//|[1.0,2.0]|[3.0,4.0] |[5.0,6.0]  |[[1.0,2.0],[3.0,4.0],[5.0,6.0]]   |
//|[7.0,8.0]|[9.0,10.0]|[11.0,12.0]|[[7.0,8.0],[9.0,10.0],[11.0,12.0]]|
//+---------+----------+-----------+----------------------------------+
durranaik
  • 21
  • 2

0 Answers0