1

I'm using the raster package on an R server to process a large set (30000 files) of data files (10MB each).

For now, processing consists of parsing the data and subsequently rasterizing it via the rasterize function. The data is very sparse (only along roads) but has a high resolution and large extent. I've seen temporary files of 30GB for a raster created from one of the input files.

Because of the amount of files I'm using a foreach() %dopar% approach to processing the files, giving one file to each thread. I've set the raster options as follows:

# Configure the raster package's memory behaviour in a single call:
# cap the data volume a single raster operation will plan to keep in
# RAM, set the per-chunk processing size, and force intermediate
# results to disk instead of memory.
rasterOptions(
  maxmemory = 15000000000,
  chunksize = 14000000000,
  todisk    = TRUE
)

This should come out to 15GB/thread * 32 threads = 480GB of RAM used at maximum for the rasters. Add some overhead and I would expect somewhere between 10GB to 20GB of the 512GB RAM to remain. However, that is not the case and I can't seem to figure out why. R gobbles up RAM until only 100MB to 2GB remain and only then seems to release previously allocated memory, only to be fed straight back into R for the next raster. I checked the RAM usage repeatedly over several hours to observe this.

I'm using SpatialPolygonDataFrames as input for rasterize, and suspected they might take up a lot of RAM as well. But when I checked their size, they were rather small, at about 100MB. Playing around with maxmemory, chunksize and only 16 threads also didn't seem to have any effect. I also looked at the rasterize source code to see if I find an explanation there, but that didn't get me far:

# Quoted from the raster package source: the S4 method that dispatches
# rasterize() when called with a SpatialPoints* object and a Raster
# template. It is only a thin wrapper that forwards every argument,
# unchanged, to the unexported internal worker .pointsToRaster().
# (Unexported internals are not visible on the search path; inspect it
# with raster:::.pointsToRaster or getAnywhere(".pointsToRaster").)
setMethod('rasterize', signature(x='SpatialPoints', y='Raster'), 
    function(x, y, field, fun='last', background=NA, mask=FALSE, update=FALSE, updateValue='all', filename="", na.rm=TRUE, ...){ 
    
        # All real work (chunked iteration, temp files, memory use)
        # happens inside this internal function, not in the wrapper.
        .pointsToRaster(x, y, field=field, fun=fun, background=background, mask=mask, update=update, updateValue=updateValue, filename=filename, na.rm=na.rm, ...)
    
    }
)

I have no clue where to find .pointsToRaster

Does anyone have an explanation for this behaviour or ideas for things to check? Did I simply overlook something? I'd like to not use the entire RAM so that other users can still work on the server. From what I understand my code should regulate how much RAM is used.

Here's the code I use:

library('iterators')
library('parallel')
library('foreach')
library('doParallel')

#init parallelisation
# FORK workers share the master's memory image at fork time (Unix only);
# worker stdout/stderr is redirected to the "parseProcess" file.
nCores = 32
cCluster = makeCluster(nCores, type = "FORK", outFile = "parseProcess")
registerDoParallel(cCluster)


# One input file per iteration, iterations distributed across the 32
# workers. NOTE(review): rasterOptions(maxmemory=) is a planning hint
# for a *single* raster operation, not a hard cap on the R process;
# nothing here bounds total per-worker RSS, and R/the OS release freed
# pages lazily — consistent with the observed fill-then-release
# pattern. An OS-level limit (rlimit/ulimit or cgroups) is the
# reliable way to cap per-worker memory.
# fileList, parseFile, WGS84CRS and the rNameMax1..rNameMax10 names are
# defined outside this excerpt (inherited by the forked workers).
foreach(j = 1:length(fileList)) %dopar%{
    #load all libraries for every thread
    library('sp')
    library('raster')
    library('spatial')
    library('gstat')
    library('rgdal')
    library('dismo')
    library('deldir')
    library('rgeos')
    library('sjmisc')
    #set rasteroptions per thread
    # These options are worker-local (each forked process keeps its own
    # copy); todisk=TRUE forces chunked disk-backed processing.
    rasterOptions(maxmemory = 15000000000)
    rasterOptions(chunksize = 14000000000)
    rasterOptions(todisk = TRUE)
    # Give every worker its own raster temp directory, keyed by the
    # iteration index, so workers never delete each other's temp files.
    tmpFolder = paste0("[PATH TO STORAGE]/rtmp",j)
    dir.create(tmpFolder)
    rasterOptions(tmpdir = tmpFolder)
  
    #generate names for raster files
    fileName = basename(fileList[j])
    print(paste("Processing:", fileName))
    rNameMax0 = sub(pattern = ".bin", replacement = "_scan0_max.tif", fileName)
    #repeat this for all 11 scans
  
    rasterStorage = "[PATH TO OTHER STORAGE]" #path to raster folder 
      
    scanList = parseFile(fileList[j]) #any memory allocated in this functions should be released on function return

    #create template raster
    # Template carries extent/resolution only; rasterize() writes cell
    # values into a copy, tmp itself stays empty.
    bounds = as.vector(t(bbox(scanList$scan0)))
    resolution = c(0.0000566, 0.0000359)
    tmp = raster(xmn = bounds[1], xmx = bounds[2], ymn = bounds[3], ymx = bounds[4], res = resolution)
    
    #create rasters from data
        
    # Promote the scan0 data frame to a SpatialPointsDataFrame in place,
    # then rasterize straight to a GeoTIFF on disk (filename given).
    coordinates(scanList$scan0) = ~Long+Lat
    proj4string(scanList$scan0) = WGS84CRS
    rScanMax0 = rasterize(scanList$scan0, tmp, fun = 'max', filename = paste0(rasterStorage, rNameMax0))
    # rm() drops the R-side handle; the written .tif stays on disk.
    rm('rScanMax0')
        
    #repeat for scans 1 to 4
        
    # Mid-loop cleanup: drop raster temp files older than 0.2 h, then
    # recreate this worker's temp dir and re-point the option at it.
    removeTmpFiles(h = 0.2)
    unlink(tmpFolder, recursive = TRUE, force = TRUE)
    dir.create(tmpFolder)
    rasterOptions(tmpdir = tmpFolder)
        
    coordinates(scanList$scan5) = ~Long+Lat
    proj4string(scanList$scan5) = WGS84CRS
    rScanMax5 = rasterize(scanList$scan5, tmp, fun = 'max', filename = paste0(rasterStorage, rNameMax5))
    rm('rScanMax5')

    #repeat for scans 6 to 10
    
    # Final cleanup for this iteration's temp storage.
    removeTmpFiles(h = 0.2)
    unlink(tmpFolder, recursive = TRUE)
}

stopCluster(cCluster)

Here's the (gutted) code of the parseFile function:

# Parse one binary input file into eleven per-scan data frames.
# fileName: path to a .bin input file.
# Returns: a named list with elements "scan0" .. "scan10", each a
#   data frame whose first columns are Lat/Long (remaining column
#   names were elided from this gutted excerpt).
# NOTE(review): the body below is intentionally gutted for the
# question — literal "..." marks elided code, so this does not parse
# as-is.
parseFile = function(fileName){

# NOTE(review): no on.exit(close(con)) — the connection leaks if any
# readBin below errors before the close() near the end.
con = file(fileName, "rb")
    intSize = 4
    fileEndian = "little"
    
    #create data frames for each scan
    # Each scan starts as a 0-row frame; rows are presumably appended
    # inside the read loop (elided) — if so, that grows the frames
    # incrementally, which is O(n^2) in copies. TODO confirm against
    # the full code.
    scan0 = data.frame(matrix(ncol = n1, nrow = 0))
    colnames(scan0) = c("Lat", "Long", ...)
    scan1 = data.frame(matrix(ncol = n2, nrow = 0))
    colnames(scan1) = c("Lat", "Long", ...)
    scan2 = data.frame(matrix(ncol = n3, nrow = 0))
    colnames(scan2) = c("Lat", "Long", ...)
    scan3 = data.frame(matrix(ncol = n4, nrow = 0))
    colnames(scan3) = c("Lat", "Long", ...)
    scan4 = data.frame(matrix(ncol = n5, nrow = 0))
    colnames(scan4) = c("Lat", "Long", ...)
    scan5 = data.frame(matrix(ncol = n6, nrow = 0))
    colnames(scan5) = c("Lat", "Long", ...)
    scan6 = data.frame(matrix(ncol = n7, nrow = 0))
    colnames(scan6) = c("Lat", "Long", ...)
    scan7 = data.frame(matrix(ncol = n8, nrow = 0))
    colnames(scan7) = c("Lat", "Long", ...)
    scan8 = data.frame(matrix(ncol = n9, nrow = 0))
    colnames(scan8) = c("Lat", "Long", ...)
    scan9 = data.frame(matrix(ncol = n10, nrow = 0))
    colnames(scan9) = c("Lat", "Long", ...)
    scan10 = data.frame(matrix(ncol = n11, nrow = 0))
    colnames(scan10) = c("Lat", "Long", ...)
    
    # Skip the 36-byte file header.
    header = readBin(con, raw(), n = 36)
    
    i = 1
    
    # NOTE(review): i starts at 1 and only increments, so while(i) is
    # always truthy — the loop can exit only through the break below.
    while(i){
        # Each block: 3-int little-endian header, payload (elided),
        # 8-byte trailer.
        blockHeader = readBin(con, integer(), n = 3, size = intSize, endian = fileEndian) 
        if(...){ #check whether file ended
            break
        }
        i = i + 1
        
        #sort data to correct scan, assign GPS tag 

        blockTrailer = readBin(con, raw(), n = 8)
    }
    #clean up
    close(con)
    
    #return parsed data
    returnList = list("scan0" = scan0, "scan1" = scan1, "scan2" = scan2, "scan3" = scan3, "scan4" = scan4, 
            "scan5" = scan5, "scan6" = scan6, "scan7" = scan7, "scan8" = scan8, "scan9" = scan9, "scan10" = scan10)
    return(returnList)
}

I'm also looking at the solutions posted here as another approach, but I'd still like to know why my code doesn't work as I expect it to.

Konrad Rudolph
  • 530,221
  • 131
  • 937
  • 1,214
N. Plu
  • 11
  • 3
  • can you provide a smaller example? E.g. one raster object with the same extent and resolution as your data and the same dimensions and an example vector (all created with code)? – Robert Hijmans Jan 04 '21 at 15:37
  • Can you elaborate a bit? I'm not sure what exactly you are looking for. The rasters themselves are fine, the code does what it's supposed to. The entire process is just using more memory than I would expect after a few iterations. – N. Plu Jan 06 '21 at 08:34
  • The short answer is that it is expected with R questions that you provide a *minimal, self contained, reproducible example*. You show a lot of code, much of it seemingly irrelevant to the problem at hand. Of course there is more complexity given the clustering, but to solve that you need to break down your code into smaller pieces to be able to assess what is going on. – Robert Hijmans Jan 06 '21 at 12:39
  • Naturally, I tried smaller sections of the code to try to narrow down the issue, but I wasn't able to trace the changed RAM usage back to precisely my code. Nevertheless, I will try to come up with a self contained example. Might take some time though to be sure it's reproducible. – N. Plu Jan 07 '21 at 13:45
  • I ended up using `rlimit` to restrict the memory usage and it works really well, so I won't spend more time on this, but thank you for your time anyways. – N. Plu Jan 12 '21 at 08:14

0 Answers