2

I have applied the following code (which is based on this post) to my sample data to generate three different lists which I am trying to merge into a single data frame.

idNodes <- getNodeSet(plans, "//person[@id]") ids <- lapply(idNodes, function(x) xmlAttrs(x)['id']) attribact <- lapply(idNodes, xpathApply, path = "./plan[@selected='yes']//act", xmlAttrs) attribleg <- lapply(idNodes, xpathApply, path = "./plan[@selected='yes']//leg", xmlAttrs)

To generate the data frame, I have tried to use x <- do.call(rbind.data.frame, mapply(cbind, ids, attribact, attribleg)) but it is giving me the followingt error:

Error in (function (..., deparse.level = 1, make.row.names = TRUE) : numbers of columns of arguments do not match In addition: There were 50 or more warnings (use warnings() to see the first 50)

I also want to point out that the above do.call command works on small samples of data (with warnings) but not on large samples.

desired output

id        type   link   x              y              start_time end_time   mode  dep_time   trav_time arr_time
10000061  home   21258  334867.243653  3126570.70778  03:00:00   15:07:00   ride  15:07:00   00:03:28  15:10:28 
10000061  shop   13904  332634.86999   3127078.96383  15:12:00   16:21:00   car   16:21:00   00:09:02  16:30:02 
10000061  shop   14129  331666.364904  3129306.48785  16:25:00   17:37:00   ride  17:37:00   00:10:33  17:47:33 
10000061  home   21258  334867.243653  3126570.70778  17:45:00   26:59:00   NA    NA         NA        NA
10000302  home   21256  334598.361546  3126269.05167  03:00:00   07:56:00   car   07:56:00   00:03:31  07:59:31 
10000302  work   14057  335957.065395  3128105.16619  08:04:00   10:28:00   car   10:28:00   00:06:47  10:34:47 
10000302  social 21191  333032.807855  3128759.66141  10:33:00   11:52:00   car   11:52:00   00:07:50  11:59:50 
10000302  home   21256  334598.361546  3126269.05167  11:59:00   12:11:00   car   12:11:00   00:04:49  12:15:49 
10000302  social 13906  332302.159169  3127536.46778  12:17:00   13:30:00   car   13:30:00   00:05:30  13:35:30 
10000302  home   21256  334598.361546  3126269.05167  13:36:00   26:59:00   NA    NA         NA        NA

sample data

> dput(head(ids,2))
list(structure("10000061", .Names = "id"), structure("10000302", .Names = "id"))

> dput(head(attribact,2))
list(list(structure(c("home", "21258", "334867.243653", "3126570.70778", "03:00:00", "15:07:00"), .Names = c("type", "link", "x", "y", "start_time", "end_time")), structure(c("shop", "13904", "332634.86999", "3127078.96383", "15:12:00", "16:21:00"), .Names = c("type", "link", "x", "y", "start_time", "end_time")), structure(c("shop", "14129", "331666.364904", "3129306.48785", "16:25:00", "17:37:00"), .Names = c("type", "link", "x", "y", "start_time", "end_time")), structure(c("home", "21258", "334867.243653", "3126570.70778", "17:45:00", "26:59:00"), .Names = c("type", "link", "x", "y", "start_time", "end_time"))), list(structure(c("home", "21256", "334598.361546", "3126269.05167", "03:00:00", "07:56:00"), .Names = c("type", "link", "x", "y", "start_time", "end_time")), structure(c("work", "14057", "335957.065395", "3128105.16619", "08:04:00", "10:28:00"), .Names = c("type", "link", "x", "y", "start_time", "end_time")), structure(c("social", "21191", "333032.807855", "3128759.66141", "10:33:00", "11:52:00"), .Names = c("type", "link", "x", "y", "start_time", "end_time")), structure(c("home", "21256", "334598.361546", "3126269.05167", "11:59:00", "12:11:00"), .Names = c("type", "link", "x", "y", "start_time", "end_time")), structure(c("social", "13906", "332302.159169", "3127536.46778", "12:17:00", "13:30:00"), .Names = c("type", "link", "x", "y", "start_time", "end_time")), structure(c("home", "21256", "334598.361546", "3126269.05167", "13:36:00", "26:59:00"), .Names = c("type", "link", "x", "y", "start_time", "end_time"))))

> dput(head(attribleg,2))
list(list(structure(c("ride", "15:07:00", "00:03:28", "15:10:28"), .Names = c("mode", "dep_time", "trav_time", "arr_time")), structure(c("car", "16:21:00", "00:09:02", "16:30:02"), .Names = c("mode", "dep_time", "trav_time", "arr_time")), structure(c("ride", "17:37:00", "00:10:33", "17:47:33"), .Names = c("mode", "dep_time", "trav_time", "arr_time"))), list(structure(c("car", "07:56:00", "00:03:31", "07:59:31"), .Names = c("mode", "dep_time", "trav_time", "arr_time")), structure(c("car", "10:28:00", "00:06:47", "10:34:47"), .Names = c("mode", "dep_time", "trav_time", "arr_time")), structure(c("car", "11:52:00", "00:07:50", "11:59:50"), .Names = c("mode", "dep_time", "trav_time", "arr_time")), structure(c("car", "12:11:00", "00:04:49", "12:15:49"), .Names = c("mode", "dep_time", "trav_time", "arr_time")), structure(c("car", "13:30:00", "00:05:30", "13:35:30"), .Names = c("mode", "dep_time", "trav_time", "arr_time"))))

UPDATE:

I have tried the following solution. But, it is very slow for my purposes (in spite of pre-allocation). Any suggestions that increase efficiency are greatly appreciated.

library(data.table)
df <- data.table(id=rep(0,10*length(ids)), type=rep("c",10*length(ids)), link=rep(0,10*length(ids)), x=rep(0,10*length(ids)), y=rep(0,10*length(ids)), start_time=rep("c",10*length(ids)), end_time=rep("c",10*length(ids)), mode=rep("c",10*length(ids)), dep_time=rep("c",10*length(ids)), trav_time=rep("c",10*length(ids)), arr_time=rep("c",10*length(ids)))
m <- 1
for (i in 1:length(ids))
{
  for(k in 1: length(attribact[[i]]))
  {
    df[m,id := ids[[i]]]
    df[m,type := attribact[[i]][[k]][[1]]]
    df[m,link := attribact[[i]][[k]][[2]]]
    df[m,x := attribact[[i]][[k]][[3]]]
    df[m,y := attribact[[i]][[k]][[4]]]
    df[m,start_time := attribact[[i]][[k]][[5]]]
    df[m,end_time := attribact[[i]][[k]][[6]]]
    df[m,mode := ifelse(length(attribleg[[i]])>=k, attribleg[[i]][[k]][[1]], NA)]
    df[m,dep_time := ifelse(length(attribleg[[i]])>=k, attribleg[[i]][[k]][[2]], NA)]
    df[m,trav_time := ifelse(length(attribleg[[i]])>=k, attribleg[[i]][[k]][[3]], NA)]
    df[m,arr_time := ifelse(length(attribleg[[i]])>=k, attribleg[[i]][[k]][[4]], NA)]
    m <- m+1
  }
}
Community
  • 1
  • 1
dataanalyst
  • 316
  • 3
  • 12

2 Answers2

1

Instead of three separate lists, I would get act and leg tags together using /* and add ids as the list names.

a <- lapply(idNodes, xpathApply, path = "./plan[@selected='yes']/*", xmlAttrs)
names(a) <- sapply(idNodes, xmlGetAttr, "id")
# combine using ldply
library(plyr)
x1 <- lapply(a, ldply, "rbind")
x <- ldply( x1, "rbind", .id="id")

Now you just need to format the data.frame and move leg attributes up 1 row (if leg is always the next sibling of act?).

n <- which(is.na(x$type) )
x[n-1, 8:11] <- x[n,8:11]
x <- subset(x,!is.na(type))
rownames(x) <- NULL
x   
         id   type  link             x             y start_time end_time mode dep_time trav_time arr_time
1  10000061   home 21258 334867.243653 3126570.70778   03:00:00 15:07:00 ride 15:07:00  00:03:27 15:10:27
2  10000061   shop 13904  332634.86999 3127078.96383   15:12:00 16:21:00  car 16:21:00  00:09:44 16:30:44
3  10000061   shop 14129 331666.364904 3129306.48785   16:25:00 17:37:00 ride 17:37:00  00:09:46 17:46:46
4  10000061   home 21258 334867.243653 3126570.70778   17:45:00 26:59:00 <NA>     <NA>      <NA>     <NA>
5  10000302   home 21256 334598.361546 3126269.05167   03:00:00 07:56:00  car 07:56:00  00:03:00 07:59:00
6  10000302   work 14057 335957.065395 3128105.16619   08:04:00 10:28:00  car 10:28:00  00:08:20 10:36:20
7  10000302 social 21191 333032.807855 3128759.66141   10:33:00 11:52:00  car 11:52:00  00:08:33 12:00:33
8  10000302   home 21256 334598.361546 3126269.05167   11:59:00 12:11:00  car 12:11:00  00:06:35 12:17:35
9  10000302 social 13906 332302.159169 3127536.46778   12:17:00 13:30:00  car 13:30:00  00:05:30 13:35:30
10 10000302   home 21256 334598.361546 3126269.05167   13:36:00 26:59:00 <NA>     <NA>      <NA>     <NA>

Another option is to skip idNodes and maybe just format the xmlAttrsToDataFrame output below.

x <- XML:::xmlAttrsToDataFrame(plans["//person[@id]|//plan[@selected='yes']/*"])
Chris S.
  • 2,185
  • 1
  • 14
  • 14
  • Thanks Chris. Your second option is much more helpful (for the specific sample in the question). In fact, it performs orders of magnitude (about 9.5 times better than the first option). Here is a code I came up with based on your second option. `x <- XML:::xmlAttrsToDataFrame(plans["//person[@id]|//plan[@selected='yes']/*"]) z1 <- data.frame(x$id, lapply(x[,2:7], function (z) shift(z, 1, type='lead')), lapply(x[,8:11], function (z) shift(z, 2, type='lead'))) z1 <- z1[rowSums(is.na(z1)) != ncol(z1),] z1 <- cbind(na.locf(as.numeric(as.character(z1$x.id))), z1[,2:ncol(z1)])` – dataanalyst Mar 17 '16 at 12:17
  • But, surprisingly, the second code performs extremely slow as the size of the data increases. On a sample with approximately 114272 persons, `x <-XML:::xmlAttrsToDataFrame(plans["//person[@id]|//plan[@selected='yes']/*"])` is extremely slow (it has been running for more than 2 hours now). The entire code under the first version takes less than 2 hours. Do you have any thoughts on why this is the case? – dataanalyst Mar 17 '16 at 14:02
  • xmlToDataFrame and related are convenience functions so they are not optimized for speed and work best with smaller files. Maybe google xmlToDataFrame slow for some suggestions on faster XML parsing methods? – Chris S. Mar 17 '16 at 16:56
  • Thanks for the answer Chris. – dataanalyst Mar 17 '16 at 18:39
1

This could be an option considering the three lists as 'a', 'b' and 'c'

First assign the ids in list 'a' as names of list 'b' and 'c' and then rbind each elements in the list 'b' and 'c' as shown below

names(b) = unlist(a) 
names(c) = unlist(a)

list1 = lapply(b, function(x) do.call(rbind, x)) # rbind list elements
list2 = lapply(c, function(x) do.call(rbind, x))

Next cbind the both list1 and list2 elements considering the length of list elements in list1 and finally put the new list elements together using rbind

out = do.call(rbind, 
      lapply(names(list1), 
        function(x){ 
          cbind(id = x, 
             data.frame(list1[[x]]), 
             data.frame(list2[[x]])[1:nrow(list1[[x]]),])
      }))


#> out
#         id   type  link             x             y start_time end_time mode
#1   10000061   home 21258 334867.243653 3126570.70778   03:00:00 15:07:00 ride
#2   10000061   shop 13904  332634.86999 3127078.96383   15:12:00 16:21:00  car
#3   10000061   shop 14129 331666.364904 3129306.48785   16:25:00 17:37:00 ride
#NA  10000061   home 21258 334867.243653 3126570.70778   17:45:00 26:59:00 <NA>
#11  10000302   home 21256 334598.361546 3126269.05167   03:00:00 07:56:00  car
#21  10000302   work 14057 335957.065395 3128105.16619   08:04:00 10:28:00  car
#31  10000302 social 21191 333032.807855 3128759.66141   10:33:00 11:52:00  car
#4   10000302   home 21256 334598.361546 3126269.05167   11:59:00 12:11:00  car
#5   10000302 social 13906 332302.159169 3127536.46778   12:17:00 13:30:00  car
#NA1 10000302   home 21256 334598.361546 3126269.05167   13:36:00 26:59:00 <NA>
#    dep_time trav_time arr_time
#1   15:07:00  00:03:28 15:10:28
#2   16:21:00  00:09:02 16:30:02
#3   17:37:00  00:10:33 17:47:33
#NA      <NA>      <NA>     <NA>
#11  07:56:00  00:03:31 07:59:31
#21  10:28:00  00:06:47 10:34:47
#31  11:52:00  00:07:50 11:59:50
#4   12:11:00  00:04:49 12:15:49
#5   13:30:00  00:05:30 13:35:30
#NA1     <NA>      <NA>     <NA>
Veerendra Gadekar
  • 4,452
  • 19
  • 24