I am trying to parse content out of XML files (more than 200,000 files, about 800 MB in total) with the XML package in R and save it to text files for further processing. However, my laptop has only 4 GB of RAM and the R session keeps crashing while doing this. My code is below; I have already tried ldply(), rm(), and gc() after rm(), yet the memory problem persists. Can somebody point out my problem? Thank you very much! (A variant using free() that I am considering is sketched after the code.)
library(XML)
library(plyr)

# read the file names
file_list = list.files()

parseXml = function(filename) {
  data = xmlTreeParse(filename, useInternalNodes = T)
  for (i in 1:length(xpathApply(data, "//mesh_term", xmlValue))) {
    tmp = data.frame("nct_id" = character(), "mesh_term" = character(),
                     stringsAsFactors = F)
    # skip trials without a mesh_term
    if (length(xpathApply(data, "//mesh_term", xmlValue)) > 0) {
      tmp[1, 1] = xpathApply(data, "//nct_id", xmlValue)[[1]]
      tmp[1, 2] = xpathApply(data, "//mesh_term", xmlValue)[[i]]
    }
  }
  return(tmp)
  rm(tmp)
  gc()
}
# chop file_list into 1000 chunks and iterate over them;
# I assumed this would save some memory, but it does not help
n = 1000
for (i in 1:n) {
  trialMesh = ldply(file_list[(length(file_list)/n * (i-1) + 1):(length(file_list)/n * i)],
                    parseXml)
  write.table(trialMesh, paste0("mypath/trialMesh_", i, ".txt"), sep = "|",
              eol = "\n", quote = F, row.names = F, col.names = T)
  rm(trialMesh)
  gc()
}
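
For reference, this is the per-file variant I am now considering. My understanding is that with useInternalNodes = T the parsed document lives in C-level memory that gc() does not reclaim on its own, so it has to be released explicitly with free() from the XML package. The name parseXml2 is just a placeholder, and I have not yet confirmed that this avoids the crashes:

# same extraction as parseXml(), but releases the internal document explicitly
parseXml2 = function(filename) {
  doc  = xmlParse(filename)                          # equivalent to xmlTreeParse(..., useInternalNodes = T)
  mesh = xpathSApply(doc, "//mesh_term", xmlValue)   # all mesh terms in one pass
  id   = xpathSApply(doc, "//nct_id", xmlValue)[1]
  free(doc)                                          # release the C-level document memory
  if (length(mesh) == 0)                             # trials without mesh_term: empty frame
    return(data.frame(nct_id = character(), mesh_term = character(),
                      stringsAsFactors = F))
  data.frame(nct_id = id, mesh_term = mesh, stringsAsFactors = F)
}

It would be called from the same ldply() loop above, just with parseXml replaced by parseXml2.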