0

I have an XML file from which I extracted the paths to the items I want to pull out of the XML.

If the provided URL does not work, one may find the XML here: Download XML

Like this:

# read_xml(), xml_find_all() and xml_path() come from xml2 (not XML);
# dplyr is attached for the %>% pipe used below.
library(xml2)
library(dplyr)

# Parse the product feed straight from its URL.
dat <- read_xml("http://affi.voetbalshop.nl/google_create_unique.php")
#dat_list <- dat %>% xml_find_all("//channel//item") %>% as_list()
# Select every <item> node below <channel>.
dat_nodePaths <- dat %>% xml_find_all("//channel//item")
#dat_nodes <- dat %>% xml_find_all("//channel//item")
# XPath of each selected node, one string per <item>.
dat_paths <- xml_path(dat_nodePaths)

Now I want to extract each path one by one and add them to a data frame. If I apply it to only one of the paths, I get a data frame with only one row, of course, but it works.

# Fetch the node behind the i-th path and convert it to a nested list.
l <- as_list(xml_find_first(dat, dat_paths[i]))
# Flatten the nested list into a named vector and turn it into a one-row
# data frame (transpose so that each field becomes a column).
df <- as.data.frame(t(unlist(l)), stringsAsFactors = FALSE)

I made a previous attempt with xml_find_all(), but that gives me less control over what to do with each item in my XML.

My current try is this:

# Attempt to build one data frame row per item path and stack the rows.
# NOTE: a `for` loop evaluates to NULL and the data frame created on each
# iteration is never stored, so rbind() only receives NULL and `df` is NULL.
df <- rbind(for(i in 1:length(dat_paths)) {
  # Look up the node for the i-th path and convert it to a nested list.
  l <- xml_find_first(dat, dat_paths[i]) %>% as_list()
  # Flatten the list into a one-row data frame; the result is discarded.
  as.data.frame(t(unlist(l)), stringsAsFactors = F)
  #df <- rbind(as.data.frame(t(unlist(l))))
})

I tried this with rbind both inside and outside of the for loop. With rbind inside the loop I only get the last object (obviously); with rbind outside the loop I only get NULL.

What can I do to obtain a nicely formatted data frame from my XML items?

Sander Van der Zeeuw
  • 1,092
  • 1
  • 13
  • 35

3 Answers3

1

Thanks to @mropa answer here: R list to data frame

I was able to build my dataframe:

#' Build a data frame from the <item> nodes of an XML feed
#'
#' Reads the XML document at `url`, selects every node matching
#' `//channel//item`, flattens each item into a single row and stacks the
#' rows. Columns missing from an item are filled with `NA` by `plyr::ldply()`.
#'
#' @param url Location of the XML document; coerced with `as.character()`
#'   and passed to `xml2::read_xml()`.
#' @return A data frame with one row per matched item.
create_df_from_xml <- function(url) {
  # Namespace-qualified calls avoid attaching packages as a side effect.
  doc <- xml2::read_xml(as.character(url))
  items <- xml2::as_list(xml2::xml_find_all(doc, "//channel//item"))

  # One 1-row matrix per item. lapply() always returns a list, whereas
  # sapply() would collapse the rows into a single matrix whenever all items
  # happen to have the same number of fields, which breaks ldply() below.
  rows <- lapply(items, function(item) rbind(unlist(item)))

  plyr::ldply(rows, data.frame)
}

Hope this helps other people in the future!

Community
  • 1
  • 1
Sander Van der Zeeuw
  • 1,092
  • 1
  • 13
  • 35
1
library("httr")
library("XML")

# Download the feed to a temporary file, sending a browser-like user agent.
URL <- "http://affi.voetbalshop.nl/google_create_unique.php"
temp <- tempfile(fileext = ".html")
GET(url = URL, user_agent("Mozilla/5.0"), write_disk(temp))

# Parse the downloaded file and select every <item> node.
xpexpr <- "/rss/channel/item"
doc <- xmlParse(temp)
lNodes <- getNodeSet(doc, xpexpr)

# For each item, collect the text of every child field (and of its
# sub-fields). Iterate over the matched nodes themselves instead of indexing
# the children of <channel> by position, so no item is skipped regardless of
# how many non-item children <channel> has.
a1 <- lapply(lNodes, function(item) {
  xmlApply(item, function(field) xmlSApply(field, xmlValue))
})

# Regroup by field name: one matrix per field, gathered across all items.
# simplify = FALSE guarantees a named list even if all fields share a shape.
b1 <- sapply(
  names(a1[[1]]),
  function(x) t(sapply(a1, function(y) y[[x]])),
  simplify = FALSE
)

names(b1)
# [1] "shipping"                "id"                      "title"
# [4] "description"             "product_type"            "google_product_category"
# [7] "link"                    "image_link"              "condition"
# [10] "availability"            "price"                   "sale_price"
# [13] "brand"                   "color"                   "age_group"
# [16] "mpn"                     "item_group_id"           "gtin"
# [19] "custom_label_0"          "custom_label_1"          "custom_label_2"

head(b1[['shipping']])
#      country service    price
# [1,] "NL"    "Standard" "0.00 EUR"
# [2,] "NL"    "Standard" "0.00 EUR"
# [3,] "NL"    "Standard" "0.00 EUR"
# [4,] "NL"    "Standard" "0.00 EUR"
# [5,] "NL"    "Standard" "0.00 EUR"
# [6,] "NL"    "Standard" "0.00 EUR"
Sathish
  • 12,453
  • 3
  • 41
  • 59
-3

Try this

# read_xml() comes from xml2; the xml*/xpath* functions come from XML.
library(xml2)
library(XML)

# Fetch and parse the feed with xml2.
dat <- read_xml("http://affi.voetbalshop.nl/google_create_unique.php")

# The XML package cannot consume an xml2 document directly, so serialise it
# to text and re-parse it into an internal XML document.
doc <- xmlParse(as.character(dat), asText = TRUE)

rootNode <- xmlRoot(doc)

# Concatenated text content of each <item> node (a character vector,
# not a data frame).
xpathSApply(rootNode, "//channel//item", xmlValue)
pyll
  • 1,688
  • 1
  • 26
  • 44
  • Um… `xpathSApply` does not return a data frame as the OP asked and this code is not formatted at all. – hrbrmstr Feb 27 '17 at 11:36