
I'm trying to parse a site using the RCurl and XML packages. After about an hour of execution, Task Manager shows the rsession process using 12 GB. After calling

rm(list = ls())
gc()

all that memory is still held by rsession. What is the problem? Here is the code:

  library(XML)
  library(RCurl)
  rm(list = ls())
  if (file.exists('cookies.txt')){
        file.remove('cookies.txt')
  }
  if (file.exists('words.txt')){
        file.remove('words.txt')
  }

  # fetch a page through the shared curl handle and parse it into an
  # internal (C-level) XML document
  doGet <- function(url, encode = 'windows-1251') {
        html <- tryCatch(
              {
                    getURL(url, curl=curl, .encoding = encode)
              }, warning = function(w) {
                    print(paste('warning: ', url, w))
              }, error = function(e) {
                    print(paste('error: ', url, e))
              }, finally = {}
        )
        write(x = html, file = '~tmp.html', append = F)
        htmlTreeParse(file = '~tmp.html', useInternalNodes = T, encoding = encode)
  }

  makeURL <- function(url) {
        paste(url_base, url, sep = "")
  }

  parse.morph <- function(n){
        val <- xmlValue(n, encoding = 'UTF-8')
        res <- tolower(gsub(" |-", "", strsplit(val, ':')[[1]][[2]]))
        rm(val)
        res
  }

  morphToList <- function(morphs) {
        print(paste(morphs, collapse=''))
        res <- list()
        res$prefix = unlist(strsplit(morphs[1], split = ';'))
        res$base =   unlist(strsplit(morphs[2], split = ';'))
        res$suffix = unlist(strsplit(morphs[3], split = ';'))
        res$ending = unlist(strsplit(morphs[4], split = ';'))
        res
  }

  indexOf <- function(val, str) {
        grep(val, strsplit(str, "")[[1]])
  }

  parse.word <- function(page) {
        xpathSApply(page, "//div[@class='word-article']/ul/li", parse.morph) 
  }

  # note: masks base::append; grows the result data frame row by row with rbind
  append <- function(df, m) {
        tmp <- data.frame(p1 =  m$prefix[3], p2 =  m$prefix[2], p3 =  m$prefix[1], 
                          b1 = m$base[1],   b2 = m$base[2],  
                          s1 = m$suffix[1], s2 = m$suffix[2], s3 = m$suffix[3], s4 = m$suffix[4], 
                          e1 = m$ending[1], e2 = m$ending[2], e3 = m$ending[3])
        rbind(df, tmp)
  }

  # fetch and parse every word page linked from an index page
  parsePage <- function(page) {
        words.url <- xpathSApply(page, "//tr[contains(@class, 'row')]/td/a", xmlGetAttr, 'href') 
        df <- data.frame(p1 = c(), p2 = c(), p3 = c(), b1 = c(), b2 = c(),  s1 = c(), s2 = c(), s3 = c(), s4 = c(), e1 = c(), e2 = c(), e3 = c())
        for(word.url in words.url) {
              page <- doGet(makeURL(word.url))
              word.morphs <- parse.word(page)
              df <- append(df, morphToList(word.morphs))
        }
        return(df)
  }

  saveWords <- function(df, fileName) {
        write.table(file = fileName, x = df, append = T, row.names = F, col.names = F, quote = T, sep = ',')
  }

  url_base <- 'http://slovonline.ru'
  url_addr <- makeURL('/slovar_sostav')
  agent<-"Mozilla/5.0"

  curl<-getCurlHandle()
  curlSetOpt(curl = curl, cookiejar='cookies.txt', useragent='Mozilla/5.0', followlocation=T)


  index <- doGet(url_addr)
  lrs.url <- xpathSApply(index, "//div[@class = 'nletters all']/a", xmlGetAttr, 'href') 

  for (letter in lrs.url[1:2]) {
        page <- doGet(makeURL(letter))
        table <- parsePage(page)
        pages.url <- c(letter, xpathSApply(page, "//div[@class = 'npages']/a", xmlGetAttr, 'href'))
        saveWords(df = table, fileName = 'words.csv')
        for (page.url in pages.url) {
              page <- doGet(makeURL(page.url))
              table <- parsePage(page)
              saveWords(df = table, fileName = 'words.csv')
        }
  }
1 Answer


The XML package is known to have memory management issues, as a StackOverflow search reveals (examples here, here and here). Duncan Temple Lang, the author and maintainer of the package, went as far as writing a paper about memory usage issues.
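For context, documents parsed with useInternalNodes = TRUE are held by libxml2 rather than by R itself, and the package provides free() for releasing them explicitly. A minimal illustration only (the HTML string is a made-up placeholder, and I have not verified that this alone fixes the growth you see):

  library(XML)

  # parse into an internal (libxml2) document, pull out plain R values,
  # then release the document explicitly
  doc <- htmlTreeParse('<html><body><a href="/a">a</a></body></html>',
                       asText = TRUE, useInternalNodes = TRUE)
  hrefs <- xpathSApply(doc, '//a', xmlGetAttr, 'href')

  free(doc)   # release the libxml2 memory behind the document
  rm(doc)
  gc()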

You might try Hadley Wickham's xml2 package, which promises better memory management than XML. Personally, I have not verified this claim.
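If you decide to try it, the fetch-and-query code from your script maps onto xml2 roughly like this. This is only a sketch reusing the URL and XPath expressions from the question; I have not run it against that site.

  library(xml2)

  # read_html() keeps the document in memory managed by xml2;
  # it is released when the R object is garbage collected
  page <- read_html('http://slovonline.ru/slovar_sostav', encoding = 'windows-1251')

  # same XPath expressions as before, different accessors
  letter.links <- xml_find_all(page, "//div[@class = 'nletters all']/a")
  letter.hrefs <- xml_attr(letter.links, 'href')

  # on a word page, the morpheme list would be read the same way
  morph.items <- xml_find_all(page, "//div[@class='word-article']/ul/li")
  morph.text  <- xml_text(morph.items)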

The last time I had to scrape a large amount of web data (about 20k pages), I decided to rewrite the whole thing in Python. xml2 did not exist back then.

Another approach I tried was to launch the R script from a shell loop, so that the rsession process was stopped and restarted before it could exhaust memory. It worked reasonably well, although it was rather clumsy.

If you are interested, here is an overview of the algorithm (a rough sketch of the R side follows the list):

  • In the shell:
    • check whether a file with a special name exists
    • if it does not exist, run R on the script file
    • repeat
  • In the R script:
    • take a sample of addresses from the "yet-to-process" pool
    • if the sample is empty (the "yet-to-process" pool is exhausted), create the file with the special name (signalling the shell to stop) and finish
    • for each address: process it and remove it from the "yet-to-process" pool
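Here is the promised sketch of the R side, under these assumptions: the "yet-to-process" pool lives in a plain text file, progress is tracked in a second file, and the file names (pool.txt, done.txt, STOP) as well as process_address() are placeholders I made up for illustration. The shell side is just a loop that reruns the script until the stop file appears.

  # shell side (for reference):
  #   while [ ! -f STOP ]; do Rscript scrape_batch.R; done

  # R side: process one small batch per session, then exit,
  # so memory held by the parser is released with the process
  pool <- if (file.exists('pool.txt')) readLines('pool.txt') else character(0)
  done <- if (file.exists('done.txt')) readLines('done.txt') else character(0)
  todo <- setdiff(pool, done)

  if (length(todo) == 0) {
        file.create('STOP')            # signal the shell loop to stop
        quit(save = 'no')
  }

  batch <- head(todo, 100)             # small sample, so each session stays short-lived
  for (addr in batch) {
        # process_address() would hold the doGet()/parse.word() logic
        # from the question; omitted here
        # process_address(addr)
        write(addr, file = 'done.txt', append = TRUE)
  }
  # the session ends here; the shell starts a fresh rsession for the next batch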