I'm using the raster package on an R server to process a large set of data files (30,000 files, about 10MB each). For now, processing consists of parsing the data and then rasterizing it via the rasterize
function.
The data is very sparse (it lies only along roads) but has a high resolution and a large extent; I've seen temporary files of 30GB for a raster created from a single input file.
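For a rough sense of scale (the 1 x 1 degree bounding box here is just an illustration; my real extents vary per file):
#illustrative only: hypothetical 1 x 1 degree bounding box at my resolution
ncells = (1 / 0.0000566) * (1 / 0.0000359) #~4.9e8 cells
ncells * 8 / 1e9                           #~3.9 GB at 8 bytes per cell (R doubles in memory)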
Because of the number of files, I'm using a foreach() %dopar%
approach, giving one file to each worker. I've set the raster options as follows:
rasterOptions(maxmemory = 15000000000)
rasterOptions(chunksize = 14000000000)
rasterOptions(todisk = TRUE)
This should come out to 15GB/worker * 32 workers = 480GB of RAM used at maximum for the rasters. Add some overhead, and I would expect somewhere between 10GB and 20GB of the 512GB of RAM to remain free. However, that is not the case, and I can't figure out why. R gobbles up RAM until only 100MB to 2GB remain, and only then does it seem to release previously allocated memory, which is immediately fed back into R for the next raster. I checked the RAM usage repeatedly over several hours to observe this.
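A minimal sketch of how per-worker usage can be logged from inside the loop (logMem is a hypothetical helper; gc() only sees R's own allocations, OS tools like top show the full picture):
#hypothetical helper: print this worker's memory use as R sees it
logMem = function(tag){
  usedMB = sum(gc(verbose = FALSE)[, 2]) #sum of the Ncells/Vcells "used (Mb)" column
  cat(sprintf("[PID %d] %s: %.0f MB used by R\n", Sys.getpid(), tag, usedMB))
}
#e.g. logMem("after rasterize scan0") inside the foreach body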
I'm using SpatialPointsDataFrames as input for rasterize and suspected they might take up a lot of RAM as well. But when I checked their size, they were rather small, at about 100MB. Playing around with maxmemory, chunksize, and running only 16 workers also didn't seem to have any effect.
I also looked at the rasterize source code to see if I could find an explanation there, but that didn't get me far:
setMethod('rasterize', signature(x='SpatialPoints', y='Raster'),
    function(x, y, field, fun='last', background=NA, mask=FALSE, update=FALSE, updateValue='all', filename="", na.rm=TRUE, ...){
        .pointsToRaster(x, y, field=field, fun=fun, background=background, mask=mask, update=update, updateValue=updateValue, filename=filename, na.rm=na.rm, ...)
    }
)
I have no clue where to find .pointsToRaster.
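I assume it's an unexported internal of the raster package; if so, either of these should at least print its source:
getAnywhere(".pointsToRaster") #searches all loaded namespaces
raster:::.pointsToRaster       #':::' accesses unexported objects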
Does anyone have an explanation for this behaviour, or ideas for things to check? Did I simply overlook something? I'd like to avoid using the entire RAM so that other users can still work on the server. From what I understand, my code should regulate how much RAM is used.
Here's the code I use:
library('iterators')
library('parallel')
library('foreach')
library('doParallel')
#init parallelisation
nCores = 32
cCluster = makeCluster(nCores, type = "FORK", outfile = "parseProcess") #'outfile' must be lower case
registerDoParallel(cCluster)
foreach(j = seq_along(fileList)) %dopar% {
#load all libraries in every worker
library('sp')
library('raster')
library('spatial')
library('gstat')
library('rgdal')
library('dismo')
library('deldir')
library('rgeos')
library('sjmisc')
#set raster options per worker
rasterOptions(maxmemory = 15000000000)
rasterOptions(chunksize = 14000000000)
rasterOptions(todisk = TRUE)
tmpFolder = paste0("[PATH TO STORAGE]/rtmp",j)
dir.create(tmpFolder)
rasterOptions(tmpdir = tmpFolder)
#generate names for raster files
fileName = basename(fileList[j])
print(paste("Processing:", fileName))
rNameMax0 = sub(pattern = ".bin", replacement = "_scan0_max.tif", x = fileName, fixed = TRUE) #fixed = TRUE: don't treat '.' as a regex wildcard
#repeat this for all 11 scans
rasterStorage = "[PATH TO OTHER STORAGE]" #path to raster folder
scanList = parseFile(fileList[j]) #any memory allocated in this function should be released when it returns
#create template raster
bounds = as.vector(t(bbox(scanList$scan0)))
resolution = c(0.0000566, 0.0000359)
tmp = raster(xmn = bounds[1], xmx = bounds[2], ymn = bounds[3], ymx = bounds[4], res = resolution)
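#at this resolution, even a 1 x 1 degree extent is already ~5e8 cells, so ncell(tmp) gets huge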
#create rasters from data
coordinates(scanList$scan0) = ~Long+Lat
proj4string(scanList$scan0) = WGS84CRS
rScanMax0 = rasterize(scanList$scan0, tmp, fun = 'max', filename = paste0(rasterStorage, rNameMax0))
rm('rScanMax0')
#repeat for scans 1 to 4
removeTmpFiles(h = 0.2)
unlink(tmpFolder, recursive = TRUE, force = TRUE)
dir.create(tmpFolder)
rasterOptions(tmpdir = tmpFolder)
coordinates(scanList$scan5) = ~Long+Lat
proj4string(scanList$scan5) = WGS84CRS
rScanMax5 = rasterize(scanList$scan5, tmp, fun = 'max', filename = paste0(rasterStorage, rNameMax5))
rm('rScanMax5')
#repeat for scans 6 to 10
removeTmpFiles(h = 0.2)
unlink(tmpFolder, recursive = TRUE, force = TRUE)
}
stopCluster(cCluster)
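One change I'm considering (a sketch, not what I currently run) is wrapping each iteration in tryCatch so the temporary folder is cleaned up even when rasterize fails:
#sketch: guarantee per-iteration temp cleanup, even on error
foreach(j = seq_along(fileList)) %dopar% {
  tmpFolder = paste0("[PATH TO STORAGE]/rtmp", j)
  dir.create(tmpFolder)
  rasterOptions(tmpdir = tmpFolder)
  tryCatch({
    #...parsing and rasterize() calls as above...
  }, finally = {
    removeTmpFiles(h = 0)
    unlink(tmpFolder, recursive = TRUE, force = TRUE)
  })
}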
Here's the (gutted) code of the parseFile function:
parseFile = function(fileName){
con = file(fileName, "rb")
intSize = 4
fileEndian = "little"
#create data frames for each scan
scan0 = data.frame(matrix(ncol = n1, nrow = 0))
colnames(scan0) = c("Lat", "Long", ...)
scan1 = data.frame(matrix(ncol = n2, nrow = 0))
colnames(scan1) = c("Lat", "Long", ...)
scan2 = data.frame(matrix(ncol = n3, nrow = 0))
colnames(scan2) = c("Lat", "Long", ...)
scan3 = data.frame(matrix(ncol = n4, nrow = 0))
colnames(scan3) = c("Lat", "Long", ...)
scan4 = data.frame(matrix(ncol = n5, nrow = 0))
colnames(scan4) = c("Lat", "Long", ...)
scan5 = data.frame(matrix(ncol = n6, nrow = 0))
colnames(scan5) = c("Lat", "Long", ...)
scan6 = data.frame(matrix(ncol = n7, nrow = 0))
colnames(scan6) = c("Lat", "Long", ...)
scan7 = data.frame(matrix(ncol = n8, nrow = 0))
colnames(scan7) = c("Lat", "Long", ...)
scan8 = data.frame(matrix(ncol = n9, nrow = 0))
colnames(scan8) = c("Lat", "Long", ...)
scan9 = data.frame(matrix(ncol = n10, nrow = 0))
colnames(scan9) = c("Lat", "Long", ...)
scan10 = data.frame(matrix(ncol = n11, nrow = 0))
colnames(scan10) = c("Lat", "Long", ...)
header = readBin(con, raw(), n = 36)
i = 1
while(i){
blockHeader = readBin(con, integer(), n = 3, size = intSize, endian = fileEndian)
if(...){ #check whether file ended
break
}
i = i + 1
#sort data to correct scan, assign GPS tag
blockTrailer = readBin(con, raw(), n = 8)
}
#clean up
close(con)
#return parsed data
returnList = list("scan0" = scan0, "scan1" = scan1, "scan2" = scan2, "scan3" = scan3, "scan4" = scan4,
"scan5" = scan5, "scan6" = scan6, "scan7" = scan7, "scan8" = scan8, "scan9" = scan9, "scan10" = scan10)
return(returnList)
}
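One suspicion about parseFile itself: if the elided block grows the scan data frames row by row (e.g. rbind inside the loop), every append copies the whole frame, which could explain some of the memory churn. A sketch of the alternative I'd try, collecting blocks in a list and binding once (parseBlock stands in for the elided parsing):
#sketch: accumulate parsed blocks in a list, bind once at the end
rows = list()
repeat{
  blockHeader = readBin(con, integer(), n = 3, size = intSize, endian = fileEndian)
  if(length(blockHeader) < 3){ break } #file ended
  rows[[length(rows) + 1]] = parseBlock(blockHeader, con) #hypothetical parser
  blockTrailer = readBin(con, raw(), n = 8)
}
scan0 = do.call(rbind, rows) #one allocation instead of one per block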
I'm also looking at the solutions posted here as another approach, but I'd still like to know why my code doesn't work as I expect it to.