I'm trying to scrape a site using the RCurl and XML packages. After about an hour of execution, Task Manager shows the rsession process using 12 GB. Even after calling

rm(list = ls())
gc()

rsession still holds all of that memory. What is the problem?
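A quick sanity check for anything left behind (note that object.size() and gc() only see R's own heap, not memory allocated by C code, so they can't explain the 12 GB):

sapply(ls(), function(x) object.size(get(x)))  # sizes of anything left in the workspace
gc(verbose = TRUE)                             # R-level heap accounting only

Here is the full code: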
library(XML)
library(RCurl)
rm(list = ls())
if (file.exists('cookies.txt')) {
  file.remove('cookies.txt')
}
# saveWords() appends, so clear the previous output first
if (file.exists('words.csv')) {
  file.remove('words.csv')
}
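# fetch a URL through the shared curl handle, dump it to a temp file,
# and parse that file into an internal XML document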
doGet <- function(url, encode = 'windows-1251') {
  html <- tryCatch(
    {
      getURL(url, curl = curl, .encoding = encode)
    },
    warning = function(w) {
      print(paste('warning: ', url, w))
    },
    error = function(e) {
      print(paste('error: ', url, e))
    }
  )
  write(x = html, file = '~tmp.html', append = F)
  htmlTreeParse(file = '~tmp.html', useInternalNodes = T, encoding = encode)
}
makeURL <- function(url) {
  paste(url_base, url, sep = "")
}
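# take the text of an <li> node, keep the part after ':',
# strip spaces and dashes, and lowercase it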
parse.morph <- function(n) {
  val <- xmlValue(n, encoding = 'UTF-8')
  res <- tolower(gsub(" |-", "", strsplit(val, ':')[[1]][[2]]))
  rm(val)
  res
}
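# split the four morpheme strings (prefix, base, suffix, ending) on ';' into vectors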
morphToList <- function(morphs) {
  print(paste(morphs, collapse = ''))
  res <- list()
  res$prefix = unlist(strsplit(morphs[1], split = ';'))
  res$base   = unlist(strsplit(morphs[2], split = ';'))
  res$suffix = unlist(strsplit(morphs[3], split = ';'))
  res$ending = unlist(strsplit(morphs[4], split = ';'))
  res
}
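# positions of the characters of str that match val (currently unused)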
indexOf <- function(val, str) {
  grep(val, strsplit(str, "")[[1]])
}
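# collect the morpheme strings from a single word page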
parse.word <- function(page) {
  xpathSApply(page, "//div[@class='word-article']/ul/li", parse.morph)
}
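# append one word's morphemes to the result data frame (note: this masks base::append)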
append <- function(df, m) {
  tmp <- data.frame(p1 = m$prefix[3], p2 = m$prefix[2], p3 = m$prefix[1],
                    b1 = m$base[1], b2 = m$base[2],
                    s1 = m$suffix[1], s2 = m$suffix[2], s3 = m$suffix[3], s4 = m$suffix[4],
                    e1 = m$ending[1], e2 = m$ending[2], e3 = m$ending[3])
  rbind(df, tmp)
}
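# fetch each word page linked from an index page and stack the parsed morphemes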
parsePage <- function(page) {
  words.url <- xpathSApply(page, "//tr[contains(@class, 'row')]/td/a", xmlGetAttr, 'href')
  df <- data.frame()
  for (word.url in words.url) {
    page <- doGet(makeURL(word.url))
    word.morphs <- parse.word(page)
    df <- append(df, morphToList(word.morphs))
  }
  return(df)
}
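# append the parsed words to the CSV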
saveWords <- function(df, fileName) {
  write.table(file = fileName, x = df, append = T, row.names = F, col.names = F, quote = T, sep = ',')
}
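# main script: one shared curl handle, then letter index -> letter pages -> word pages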
url_base <- 'http://slovonline.ru'
url_addr <- makeURL('/slovar_sostav')
agent <- 'Mozilla/5.0'
curl <- getCurlHandle()
curlSetOpt(curl = curl, cookiejar = 'cookies.txt', useragent = agent, followlocation = T)
index <- doGet(url_addr)
lrs.url <- xpathSApply(index, "//div[@class = 'nletters all']/a", xmlGetAttr, 'href')
for (letter in lrs.url[1:2]) {
  page <- doGet(makeURL(letter))
  # the letter page itself, plus its pagination links
  pages.url <- c(letter, xpathSApply(page, "//div[@class = 'npages']/a", xmlGetAttr, 'href'))
  for (page.url in pages.url) {
    page <- doGet(makeURL(page.url))
    table <- parsePage(page)
    saveWords(df = table, fileName = 'words.csv')
  }
}
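My guess is that the documents returned by htmlTreeParse(useInternalNodes = T) are allocated at the C level, where gc() doesn't reclaim them, so every parsed page stays in memory. If that's right, do I need to release each document explicitly once I'm done with it? A sketch of what I mean, using free() from the XML package (not sure this is the right call, and the per-word documents inside parsePage() would need the same treatment):

for (page.url in pages.url) {
  page <- doGet(makeURL(page.url))
  table <- parsePage(page)
  saveWords(df = table, fileName = 'words.csv')
  free(page)  # release the C-level document?
}

Or is something else, like the curl handle or the cookie jar, holding on to the memory?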