0

I have code that parses a single XML file containing the data feed of a football match. However, I have data for more than 300 games, and I want to apply this code to all of those feeds, because doing it by hand would take a long time. I'm new to data science, and although I have seen other posts about parsing multiple XML files, I don't really know how to change the code so that it suits this data structure.

library(XML)
library(plyr)
library(gdata)
library(reshape)

# Interactively choose the XML feed to parse; f24 holds the selected file path
# and is read later by the play-by-play parsing step.
f24 <- file.choose() #XML FILE TO BE PARSED

# Collect the attributes of every <field> node in a parsed XML document into a
# data frame with one row per node and one column per attribute name.
#
# Arguments:
#   XML.parsed  parsed XML document, passed straight to xpathSApply()
#   field       element name to look for anywhere in the document; only nodes
#               carrying at least one attribute are matched (the "[@*]" filter)
#
# Returns a data frame of character columns. When nodes carry different
# attribute sets, the columns are the union of all attribute names and missing
# attributes are NA (via rbind.fill()). If no node matches, an empty data
# frame is returned.
grabAll <- function(XML.parsed, field) {
  nodes <- xpathSApply(XML.parsed, paste0("//", field, "[@*]"))

  # lapply() instead of sapply(): sapply() changes its return shape with the
  # input (a plain vector when every node has exactly one attribute, a matrix
  # labelled with the FIRST node's names whenever the attribute counts merely
  # happen to be equal), which produced transposed or mislabelled results.
  attrs <- lapply(nodes, xmlAttrs)
  if (length(attrs) == 0L) {
    return(data.frame())
  }

  first_names <- names(attrs[[1L]])
  uniform <- all(vapply(
    attrs,
    function(a) identical(names(a), first_names),
    logical(1)
  ))

  if (uniform) {
    # Every node has the same attributes in the same order: stack them
    # row-wise into a character matrix and convert once.
    as.data.frame(do.call(rbind, attrs), stringsAsFactors = FALSE)
  } else {
    # Ragged attribute sets: build one single-row data frame per node and let
    # rbind.fill() align columns by name, filling the gaps with NA.
    do.call(
      rbind.fill,
      lapply(attrs, function(a) data.frame(t(a), stringsAsFactors = FALSE))
    )
  }
}

# Play-by-Play Parsing for the single feed whose path is stored in f24.
# NOTE(review): to run this over many feeds, these steps can be wrapped in a
# function that takes the file path and applied with lapply() over a vector of
# paths; they are kept at top level here so every intermediate object remains
# available under its existing name.
pbpParse <- xmlInternalTreeParse(f24)

# One row per <Event> node, one column per Event attribute.
eventInfo <- grabAll(pbpParse, "Event")

# Number of <Q> children under each <Event>, in document order.
# vapply() pins the result to an integer vector even when there are no events.
eventParse <- xpathSApply(pbpParse, "//Event")
NInfo <- vapply(
  eventParse,
  function(x) sum(names(xmlChildren(x)) == "Q"),
  integer(1)
)

# One row per <Q> node, one column per Q attribute.
QInfo <- grabAll(pbpParse, "Q")

# Repeat the first two Event columns once per child <Q> so they line up
# row-for-row with QInfo before the two are bound side by side.
EventsExpanded <- as.data.frame(
  lapply(eventInfo[, 1:2], function(x) rep(x, NInfo)),
  stringsAsFactors = FALSE
)
QInfo <- cbind(EventsExpanded, QInfo)

# Column 1 is the first Event column and column 3 the first Q column
# (presumably both are named "id" in the feed - verify); rename them so the
# two can be told apart.
names(QInfo)[c(1, 3)] <- c("Eid", "Qid")

# Qualifiers without a value are given 1 so they still appear after the cast.
QInfo$value <- ifelse(is.na(QInfo$value), 1, QInfo$value)

# Reshape to one row per Eid and one column per qualifier_id.
Qual <- cast(QInfo, Eid ~ qualifier_id)

# FINAL DATA FOR ONE GAME
# Left join: every event is kept, including those without qualifiers.
events <- merge(
  eventInfo, Qual,
  by.x = "id", by.y = "Eid",
  all.x = TRUE, suffixes = c("", "Q")
)

Example of the data feed

hrbrmstr
  • 77,368
  • 11
  • 139
  • 205
L.England
  • 153
  • 1
  • 1
  • 6

0 Answers0