I am exploring Spark GraphX to flatten a hierarchical tree from leaf to root. My dataset is stored in a tabular format with child and parent columns. There are many disjoint trees like this.
Expected result
2,7,8
5,7,8
3,4,8
I have gone through the GraphX wiki and am still unable to figure out the approach. Any suggestions or a pseudocode approach to solve this would be appreciated.
Sample code:
import org.apache.spark.graphx.{Edge, Graph, VertexId}
import org.apache.spark.rdd.RDD
/**
 * Sample launcher that assembles a small GraphX property graph from
 * hard-coded vertex and edge data.
 *
 * The `spark` session used below is not defined in this object; it is
 * presumably supplied by `SparkSessionWrapper` (TODO confirm against that trait).
 */
object MSTGraphLauncher extends SparkSessionWrapper {
  import spark.implicits._

  /**
   * Builds the sample vertex and edge RDDs and combines them into a graph.
   *
   * The resulting graph is only constructed here; nothing in this method
   * reads it afterwards.
   *
   * @param args command-line arguments; not read by this method
   */
  def main(args: Array[String]): Unit = {
    // Each vertex is an id paired with a single String attribute.
    val users: RDD[(VertexId, String)] =
      spark.sparkContext.parallelize(
        Seq(
          (1L, "rxin"),
          (2L, "rxin"),
          (3L, "rxin"),
          (4L, "rxin"),
          (5L, "jgonzal"),
          (6L, "franklin"),
          (7L, "istoica"),
          (8L, "istoica")
        )
      )

    // Directed edges carrying a String label. Every id referenced here also
    // appears in `users` above.
    // NOTE(review): edges appear to point from child to parent
    // (e.g. 2 -> 7 -> 8) — confirm against the intended data model.
    val relationships: RDD[Edge[String]] =
      spark.sparkContext.parallelize(
        Seq(
          Edge(2L, 7L, "advisor"),
          Edge(5L, 7L, "advisor"),
          Edge(7L, 8L, "advisor"),
          Edge(3L, 4L, "colleague"),
          Edge(4L, 8L, "collab"),
          Edge(1L, 6L, "colleague")
        )
      )

    val graph = Graph(users, relationships)
  }
}