I have code to parse a single XML file that contains the data feed for one football match. However, I have over 300 games' worth of data, and applying this code to each feed manually would take a long time. I'm new to data science, and although I have seen other posts about parsing multiple XML files, I don't really know how to go about changing the code so that it suits this data structure.
library(XML)      # xmlInternalTreeParse, xpathSApply, xmlAttrs, xmlChildren
library(plyr)     # rbind.fill: row-binds data frames with differing columns
library(gdata)    # NOTE(review): not obviously used below — confirm before removing
library(reshape)  # cast: long -> wide reshaping of the qualifier table
f24 <- file.choose() #XML FILE TO BE PARSED (interactive single-file pick; see multi-file loop at the end)
# Extract every attribute of every <field> node that carries at least one
# attribute, returning a data frame with one row per node and one column per
# attribute name.
#
# XML.parsed: an internal document, as returned by xmlInternalTreeParse().
# field:      element name to collect, e.g. "Event" or "Q".
#
# Nodes need not share the same attribute set: rows are aligned by attribute
# name via plyr::rbind.fill(), with NA filled in where a node lacks one.
grabAll <- function(XML.parsed, field) {
  parse.field <- xpathSApply(XML.parsed, paste0("//", field, "[@*]"))
  # Build one single-row data frame per node, then stack them. The original
  # branched on sapply()'s type-unstable return (matrix vs list depending on
  # whether all nodes share the same attributes); lapply() keeps the shape
  # uniform so no branch is needed, and an empty node set yields NULL
  # instead of an error.
  rows <- lapply(parse.field, function(node) {
    as.data.frame(t(xmlAttrs(node)), stringsAsFactors = FALSE)
  })
  do.call(rbind.fill, rows)
}
#Play-by-Play Parsing
# Parse one f24 feed into a wide data frame: one row per Event, one column per
# Event attribute, plus one column per qualifier_id (from the Event's <Q>
# children) holding that qualifier's value, or 1 for valueless "flag"
# qualifiers. Wrapping the pipeline in a function lets it be applied to all
# 300+ game files (see the loop sketched below).
parseGame <- function(xml.file) {
  pbpParse <- xmlInternalTreeParse(xml.file)
  # Release the C-level DOM when done, even on error — avoids accumulating
  # native memory when looping over hundreds of feeds.
  on.exit(free(pbpParse), add = TRUE)

  eventInfo <- grabAll(pbpParse, "Event")
  eventParse <- xpathSApply(pbpParse, "//Event")
  # Number of <Q> children per Event: used to repeat each event's key columns
  # once per qualifier so every qualifier row is tagged with its parent event.
  NInfo <- vapply(eventParse,
                  function(x) sum(names(xmlChildren(x)) == "Q"),
                  numeric(1))
  QInfo <- grabAll(pbpParse, "Q")
  # assumes the first two eventInfo columns are the event id/event_id keys —
  # TODO confirm against the feed spec
  EventsExpanded <- as.data.frame(lapply(eventInfo[, 1:2],
                                         function(x) rep(x, NInfo)),
                                  stringsAsFactors = FALSE)
  QInfo <- cbind(EventsExpanded, QInfo)
  names(QInfo)[c(1, 3)] <- c("Eid", "Qid")
  # Valueless qualifiers act as boolean flags; code their presence as 1.
  QInfo$value <- ifelse(is.na(QInfo$value), 1, QInfo$value)
  # Long -> wide: one column per qualifier_id, keyed by event.
  Qual <- cast(QInfo, Eid ~ qualifier_id)
  # Keep every event even when it has no qualifiers (all.x = TRUE).
  merge(eventInfo, Qual, by.x = "id", by.y = "Eid",
        all.x = TRUE, suffixes = c("", "Q"))
}

#FINAL DATA FOR ONE GAME
events <- parseGame(f24)

#ALL GAMES: point this at the folder holding the feeds and stack the per-game
#tables (rbind.fill aligns games whose qualifier columns differ):
# xml.files <- list.files("path/to/feeds", pattern = "\\.xml$", full.names = TRUE)
# all.events <- do.call(rbind.fill, lapply(xml.files, parseGame))