I have applied the following code (which is based on this post) to my sample data to generate three different lists which I am trying to merge into a single data frame.
# Every <person> element that carries an `id` attribute.
idNodes <- getNodeSet(plans, "//person[@id]")

# Per person: the `id` attribute, kept as a length-1 named character vector.
ids <- lapply(idNodes, function(person) xmlAttrs(person)["id"])

# Per person: a list with the attribute vector of each <act> found anywhere
# under the plan marked selected='yes'.
attribact <- lapply(idNodes, function(person) {
  xpathApply(person, path = "./plan[@selected='yes']//act", xmlAttrs)
})

# Same as above, for the <leg> elements of the selected plan.
attribleg <- lapply(idNodes, function(person) {
  xpathApply(person, path = "./plan[@selected='yes']//leg", xmlAttrs)
})
To generate the data frame, I have tried to use x <- do.call(rbind.data.frame, mapply(cbind, ids, attribact, attribleg))
but it is giving me the following error:
Error in (function (..., deparse.level = 1, make.row.names = TRUE) : numbers of columns of arguments do not match In addition: There were 50 or more warnings (use warnings() to see the first 50)
I also want to point out that the above do.call command works on small samples of data (with warnings) but not on large samples.
desired output
id type link x y start_time end_time mode dep_time trav_time arr_time
10000061 home 21258 334867.243653 3126570.70778 03:00:00 15:07:00 ride 15:07:00 00:03:28 15:10:28
10000061 shop 13904 332634.86999 3127078.96383 15:12:00 16:21:00 car 16:21:00 00:09:02 16:30:02
10000061 shop 14129 331666.364904 3129306.48785 16:25:00 17:37:00 ride 17:37:00 00:10:33 17:47:33
10000061 home 21258 334867.243653 3126570.70778 17:45:00 26:59:00 NA NA NA NA
10000302 home 21256 334598.361546 3126269.05167 03:00:00 07:56:00 car 07:56:00 00:03:31 07:59:31
10000302 work 14057 335957.065395 3128105.16619 08:04:00 10:28:00 car 10:28:00 00:06:47 10:34:47
10000302 social 21191 333032.807855 3128759.66141 10:33:00 11:52:00 car 11:52:00 00:07:50 11:59:50
10000302 home 21256 334598.361546 3126269.05167 11:59:00 12:11:00 car 12:11:00 00:04:49 12:15:49
10000302 social 13906 332302.159169 3127536.46778 12:17:00 13:30:00 car 13:30:00 00:05:30 13:35:30
10000302 home 21256 334598.361546 3126269.05167 13:36:00 26:59:00 NA NA NA NA
sample data
> dput(head(ids,2))
list(structure("10000061", .Names = "id"), structure("10000302", .Names = "id"))
> dput(head(attribact,2))
list(list(structure(c("home", "21258", "334867.243653", "3126570.70778", "03:00:00", "15:07:00"), .Names = c("type", "link", "x", "y", "start_time", "end_time")), structure(c("shop", "13904", "332634.86999", "3127078.96383", "15:12:00", "16:21:00"), .Names = c("type", "link", "x", "y", "start_time", "end_time")), structure(c("shop", "14129", "331666.364904", "3129306.48785", "16:25:00", "17:37:00"), .Names = c("type", "link", "x", "y", "start_time", "end_time")), structure(c("home", "21258", "334867.243653", "3126570.70778", "17:45:00", "26:59:00"), .Names = c("type", "link", "x", "y", "start_time", "end_time"))), list(structure(c("home", "21256", "334598.361546", "3126269.05167", "03:00:00", "07:56:00"), .Names = c("type", "link", "x", "y", "start_time", "end_time")), structure(c("work", "14057", "335957.065395", "3128105.16619", "08:04:00", "10:28:00"), .Names = c("type", "link", "x", "y", "start_time", "end_time")), structure(c("social", "21191", "333032.807855", "3128759.66141", "10:33:00", "11:52:00"), .Names = c("type", "link", "x", "y", "start_time", "end_time")), structure(c("home", "21256", "334598.361546", "3126269.05167", "11:59:00", "12:11:00"), .Names = c("type", "link", "x", "y", "start_time", "end_time")), structure(c("social", "13906", "332302.159169", "3127536.46778", "12:17:00", "13:30:00"), .Names = c("type", "link", "x", "y", "start_time", "end_time")), structure(c("home", "21256", "334598.361546", "3126269.05167", "13:36:00", "26:59:00"), .Names = c("type", "link", "x", "y", "start_time", "end_time"))))
> dput(head(attribleg,2))
list(list(structure(c("ride", "15:07:00", "00:03:28", "15:10:28"), .Names = c("mode", "dep_time", "trav_time", "arr_time")), structure(c("car", "16:21:00", "00:09:02", "16:30:02"), .Names = c("mode", "dep_time", "trav_time", "arr_time")), structure(c("ride", "17:37:00", "00:10:33", "17:47:33"), .Names = c("mode", "dep_time", "trav_time", "arr_time"))), list(structure(c("car", "07:56:00", "00:03:31", "07:59:31"), .Names = c("mode", "dep_time", "trav_time", "arr_time")), structure(c("car", "10:28:00", "00:06:47", "10:34:47"), .Names = c("mode", "dep_time", "trav_time", "arr_time")), structure(c("car", "11:52:00", "00:07:50", "11:59:50"), .Names = c("mode", "dep_time", "trav_time", "arr_time")), structure(c("car", "12:11:00", "00:04:49", "12:15:49"), .Names = c("mode", "dep_time", "trav_time", "arr_time")), structure(c("car", "13:30:00", "00:05:30", "13:35:30"), .Names = c("mode", "dep_time", "trav_time", "arr_time"))))
UPDATE:
I have tried the following solution. But, it is very slow for my purposes (in spite of pre-allocation). Any suggestions that increase efficiency are greatly appreciated.
library(data.table)

# Build `df` with one row per <act> of every person. The k-th act of a person
# is paired with that person's k-th <leg>; acts without a matching leg get NA
# in the leg columns, and legs beyond the number of acts are dropped.
#
# This vectorised construction replaces a row-by-row `:=` loop over a table
# preallocated to 10 * length(ids) rows. Every column is built in one pass and
# the table is created once, so the result has exactly one row per act (no
# leftover placeholder rows, no fixed upper bound on acts per person) and
# empty inputs are handled.
#
# Inputs (defined earlier): `ids`, `attribact`, `attribleg` -- parallel lists
# with one element per person. Attributes are taken by position, in the order
# given by `act_cols` / `leg_cols` below.

act_cols <- c("type", "link", "x", "y", "start_time", "end_time")
leg_cols <- c("mode", "dep_time", "trav_time", "arr_time")

# Stack a flat list of attribute vectors into a character matrix with one row
# per list element, keeping the first `n_col` attributes by position. Elements
# with fewer than `n_col` attributes are padded with NA.
stack_attrs <- function(attrs, n_col) {
  picked <- vapply(
    attrs,
    function(a) unname(as.character(a)[seq_len(n_col)]),
    character(n_col),
    USE.NAMES = FALSE
  )
  # `picked` holds one element per column; refill it row-wise. Going through
  # matrix() also yields a well-formed 0 x n_col matrix for empty input.
  matrix(picked, ncol = n_col, byrow = TRUE)
}

stopifnot(
  length(attribact) == length(ids),
  length(attribleg) == length(ids)
)

n_act <- lengths(attribact)
n_leg <- lengths(attribleg)

act_mat <- stack_attrs(unlist(attribact, recursive = FALSE), length(act_cols))
leg_mat <- stack_attrs(unlist(attribleg, recursive = FALSE), length(leg_cols))

# For every act row: which person it belongs to and its position within that
# person's acts.
person <- rep(seq_along(ids), times = n_act)
act_pos <- sequence(n_act)

# Row of `leg_mat` holding the leg at the same position for the same person
# (legs of earlier persons are skipped via the cumulative offset); NA when the
# person has fewer legs than acts.
leg_row <- (cumsum(n_leg) - n_leg)[person] + act_pos
leg_row[act_pos > n_leg[person]] <- NA
leg_for_act <- leg_mat[leg_row, , drop = FALSE]

id_chr <- vapply(ids, function(v) v[[1L]], character(1), USE.NAMES = FALSE)

# id, link, x and y are numeric columns, as in the original table layout; the
# conversion is done explicitly instead of relying on coercion inside `:=`.
df <- data.table(
  id = as.numeric(id_chr)[person],
  type = act_mat[, 1L],
  link = as.numeric(act_mat[, 2L]),
  x = as.numeric(act_mat[, 3L]),
  y = as.numeric(act_mat[, 4L]),
  start_time = act_mat[, 5L],
  end_time = act_mat[, 6L],
  mode = leg_for_act[, 1L],
  dep_time = leg_for_act[, 2L],
  trav_time = leg_for_act[, 3L],
  arr_time = leg_for_act[, 4L]
)