2

I have a file which contains some Order_IDs and their Externsion_ID if exists. A new order can be fresh order, or an extension of existing Order_ID or an extension of an existing extension.

My problem is to add a new column named Parent_ID which marks the root of the Order_ID.

Please find the expected output as below :

enter image description here


A reproducible input is attached below.

df1 = structure(list(Order_ID = c("SL158", "SL159", "SL160", "SL162", 
                                  "SL164", "SL165", "SL168", "SL169", "SL170", "SL171", "SL172", 
                                  "SL176", "SL177", "SL178", "SL179", "SL180", "SL183", "SL184", 
                                  "SL189", "SL190", "SL191", "SL192", "SL193", "SL195", "SL196", 
                                  "SL199", "SL200", "SL201", "SL207", "SL208", "SL209", "SL218", 
                                  "SL219", "SL223", "SL224", "SL225", "SL226", "SL227", "SL229", 
                                  "SL232", "SL233", "SL234", "SL235", "SL239", "SL240", "SL241", 
                                  "SL242", "SL243", "SL251", "SL252", "SL257", "SL258", "SL260", 
                                  "SL261", "SL262", "SL266", "SL267", "SL268", "SL269", "SL277", 
                                  "SL278", "SL279", "SL280", "SL281", "SL287", "SL288", "SL289", 
                                  "SL300", "SL301", "SL302", "SL303", "SL304", "SL305", "SL315", 
                                  "SL316", "SL322", "SL323", "SL327", "SL328", "SL333", "SL334", 
                                  "SL335", "SL336", "SL337", "SL340", "SL341", "SL342", "SL343", 
                                  "SL344", "SL345", "SL350", "SL351", "SL352", "SL353", "SL354", 
                                  "SL355", "SL363", "SL364", "SL365", "SL366", "SL367", "SL368", 
                                  "SL369", "SL370", "SL376", "SL377", "SL378", "SL379", "SL380", 
                                  "SL381", "SL382", "SL383", "SL384", "SL385", "SL1217", "SL1452", 
                                  "SL4316", "SL4317", "SL4348", "SL4381", "SL4681", "SL4738", "SL5319", 
                                  "SL5520", "SL5703", "SL6132", "SL6244", "SL6855", "SL6997", "SLB1253161", 
                                  "SLB2970530", "SLB27287329", "SLB36502009", "SLB81913180", "SLB82838226", 
                                  "SLB90244936", "SLB99701642", "SL11995", "SLH5317239", "SLH22149557", 
                                  "SLH44727392", "SLH45803004", "SLH57801072", "SLH74470000", "SLH79063451", 
                                  "SL1134", "SL1011", "SL3686", "SL3691", "SL3695", "SL3716", "SL3718", 
                                  "SL3720", "SL3721", "SL3727", "SL5242", "SL5245", "SL5246", "SL5254", 
                                  "SL5255", "SL10126", "SL10134", "SL10143", "SL11333", "SL11338", 
                                  "SL11365", "SL11377", "SL11384", "SL10004", "SL10046", "SL10058", 
                                  "SL10070", "SL10092", "SL11335", "SL11364", "SL11366"), 
                     Extension_Of = c(NA, 
                                      NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
                                      NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
                                      NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
                                      NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
                                      NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
                                      NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
                                      NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
                                      NA, "SL1134", "SL1011", "SL3691", "SL3718", "SL3727", "SL3695", 
                                      "SL3720", "SL3716", "SL3721", "SL5242", "SL5246", "SL5245", "SL5254", 
                                      "SL5255", "SL3686", "SL11365", "SL11384", "SL11377", "SL10134", 
                                      "SL11333", "SL10143", "SL11338", "SL10126", "SL10046", "SL10070", 
                                      "SL11364", "SL11335", "SL10004", "SL10058", "SL11366", "SL10092", 
                                      NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
                                      NA, NA, NA, NA, NA, NA, NA, NA, "SL384", NA, NA, "SL171", NA, 
                                      NA, NA)),
                row.names = c(NA, -176L),
                class = c("tbl_df", "tbl", "data.frame"))

head(df1)
#  Order_ID Extension_Of
#1    SL158         <NA>
#2    SL159         <NA>
#3    SL160         <NA>
#4    SL162         <NA>
#5    SL164         <NA>
#6    SL165         <NA>
zx8754
  • 52,746
  • 12
  • 114
  • 209
sunitprasad1
  • 768
  • 2
  • 12
  • 28

1 Answers1

1

Here is a solution based on igraph:

library(igraph) # 1.2.1

v <- data.frame(name = unique(unlist(df1)), stringsAsFactors = FALSE)
v <- v[!is.na(v$name), ]

g <- graph_from_data_frame(df1[!is.na(df1$Extension_Of), 2:1], vertices = v)

df1$Parent_ID <- sapply(df1$Order_ID, function(oid){
    n <- ego(g, order = nrow(df1), oid, mode = 'in')[[1]]
    nin <- lapply(n, function(x){ego(g, order = nrow(df1), x, mode = 'in')[[1]]})
    root <- n[lengths(nin) == 1]$name
})

df1[df1$Parent_ID == 'SL384', ]
#     Order_ID Extension_Of Parent_ID
# 113    SL384         <NA>     SL384
# 138  SL11995      SL10046     SL384
# 170  SL10046        SL384     SL384

This answer is inspired by this answer and this function.

The rationale: Each line without NA in df1 can be treated as an edge in a graph. if B is extension of A, we have an edge A -> B. If C is extension of B, we get B->C. Then the problem can be rephrased as: for each node (Order_ID), find its root node. For C, its root node is A since (A->B->C).

In the code above, for Order_ID, ego finds all the nodes that are directly or indirectly upstream of it (including itself). Among those upstream nodes, we can determine the root node as the one without other upstream nodes.

mt1022
  • 16,834
  • 5
  • 48
  • 71