I am trying to fix a bottleneck caused by a nested for loop in a function. I have tried three versions already and can't seem to fix it. Any help, especially a data.table or Rcpp solution, is highly appreciated. The example below uses a raster of 100 cells, but I have several rasters with more than 1,000,000 cells, so speed is of the essence.
Example
Consider the following raster:
library(raster)
r <- raster(nrows=10,ncols=10,xmn=-10,ymn=-10,xmx=10,ymx=10)
r[] <- rep(1, ncell(r))
extent(r) <- c(-10, 10, -10, 10)
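Just to show the scale of this example (ncell and res are from the raster package):
ncell(r)  # 100 cells in this toy raster; my real rasters have more than 1,000,000
res(r)    # 2 2, i.e. each cell is 2 x 2 degrees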
This is a small raster with only a hundred cells. I wrote the following function to study animal movement: it takes the raster and the maximum distance an animal can move in a time slice. The bottleneck is in a nested loop that I haven't been able to solve.
What I get in return is a data frame with the following variables (a sketch of the expected structure follows the list):
from: the ID of the raster cell the animal moves from
to: the ID of the raster cell the animal can move to
dist: the distance from cell from to cell to
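Schematically, the structure I expect back is (just the column prototype, no values):
data.frame(from = integer(0),  # ID of the origin cell
           to   = integer(0),  # ID of the destination cell
           dist = numeric(0))  # distance between the two cells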
Load the necessary libraries:
library(gdistance)
library(dplyr)
library(tidyr)
DistConect1 <- function(Raster, Distance){
  # First we make a transition layer with the function transition from gdistance
  h16 <- transition(Raster, transitionFunction = function(x){1}, 16, symm = FALSE)
  # Then geocorrect for projection
  h16 <- geoCorrection(h16, scl = FALSE)
  # Since transition layers work with XY rather than IDs, get a matrix of XY coordinates
  B <- xyFromCell(Raster, cell = 1:ncell(Raster))
  # This nested loop is where the bottleneck is
  # Start a list
  connections <- list()
  # For each cell in B
  for (i in 1:nrow(B)){
    arcs <- list()
    # Create a temporary raster with the distance from cell i to all other cells
    temp <- accCost(h16, B[i, ])
    # Set cells farther from the origin than the maximum dispersal distance to NA
    temp[values(temp) > Distance] <- NA
    # Keep only the cell IDs that are not NA (to shorten the next loop)
    index <- c(1:ncell(temp))[!is.na(values(temp))]
    for (j in index){
      # For each cell pair i, j generate a vector (i, j, distance)
      arcs[[j]] <- c(i, j, temp[j])
    }
    # Gather all vectors for origin i into a matrix
    connections[[i]] <- do.call("rbind", arcs)
    # Name the columns
    colnames(connections[[i]]) <- c("from", "to", "dist")
    # This is just to see where I am in the function
    print(paste(i, "of", nrow(B)))
  }
  # Put everything together as a large data frame
  connections <- do.call("rbind", connections)
  connections <- as.data.frame(connections)
  # Return connections
  return(connections)
}
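For reference, what I mean by getting rid of the inner j loop is something along these lines (a sketch of the idea I attempt later in DistConect4; it assumes values() returns the accumulated costs in cell order):
# Inside the i loop: build all rows for origin cell i at once instead of looping over j
vals  <- values(temp)                    # accumulated costs from cell i (NA where > Distance)
index <- which(!is.na(vals))             # reachable destination cells
connections[[i]] <- cbind(from = i, to = index, dist = vals[index])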
Tried to fix it with lapply
But I only got rid of one of the loops:
DistConect2 <- function(Raster, Distance){
  # First we make a transition layer with the function transition from gdistance
  h16 <- transition(Raster, transitionFunction = function(x){1}, 16, symm = FALSE)
  # Then geocorrect for projection
  h16 <- geoCorrection(h16, scl = FALSE)
  # Since transition layers work with XY rather than IDs, get a matrix of XY coordinates
  B <- xyFromCell(Raster, cell = 1:ncell(Raster))
  # The work for one origin cell; the inner loop is still the bottleneck
  all.cells <- function(i){
    arcs <- list()
    temp <- accCost(h16, B[i, ])
    temp[values(temp) > Distance] <- NA
    index <- c(1:ncell(temp))[!is.na(values(temp))]
    # all.index <- function(j){
    for (j in index) {
      arcs[[j]] <- c(i, j, temp[j])
    }
    # arcs <- lapply(index, all.index)
    connections <- do.call("rbind", arcs)
    colnames(connections) <- c("from", "to", "dist")
    return(connections)
  }
  # One element per origin cell
  connections <- lapply(1:nrow(B), all.cells)
  # Put everything together as a large data frame
  connections <- do.call("rbind", connections)
  connections <- as.data.frame(connections)
  # Return connections
  return(connections)
}
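The commented-out lines inside all.cells show what I also tried for the inner loop; written out, that variant would be (same logic as the j loop, with i, temp and index taken from the enclosing function):
all.index <- function(j) c(i, j, temp[j])      # one row per reachable cell j
arcs <- lapply(index, all.index)
connections <- do.call("rbind", arcs)
colnames(connections) <- c("from", "to", "dist")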
But testing with the microbenchmark package shows essentially no difference between the two:
microbenchmark::microbenchmark(DistConect1(r, Distance = 1000000), DistConect2(r, Distance = 1000000), times = 4)
Result of microbenchmark
                              expr       min       lq     mean   median       uq      max neval cld
  DistConect1(r, Distance = 1e+06) 10.283309 10.40662 10.55879 10.58380 10.71097 10.78428     4   a
  DistConect2(r, Distance = 1e+06)  9.892371 10.07783 10.35453 10.41144 10.63124 10.70288     4   a
Parallelization
I also tried parallelization but it actually takes longer:
DistConect2b <- function(Raster, Distance, cpus = NULL){
  h16 <- transition(Raster, transitionFunction = function(x) {1}, 16, symm = FALSE)
  h16 <- geoCorrection(h16, scl = FALSE)
  B <- xyFromCell(Raster, cell = 1:ncell(Raster))
  all.cells <- function(i){
    arcs <- list()
    temp <- accCost(h16, B[i, ])
    temp[values(temp) > Distance] <- NA
    index <- c(1:ncell(temp))[!is.na(values(temp))]
    for (j in index) {
      arcs[[j]] <- c(i, j, temp[j])
    }
    connections <- do.call("rbind", arcs)
    colnames(connections) <- c("from", "to", "dist")
    return(connections)
  }
  # Distribute the per-cell work over a snowfall cluster
  require(snowfall)
  sfInit(parallel = TRUE, cpus = cpus)
  sfLibrary(gdistance)
  sep.connections <- sfClusterApplyLB(1:nrow(B), all.cells)
  sfStop(nostop = FALSE)
  # Put everything together as a large data frame
  connections <- do.call("rbind", sep.connections)
  connections <- as.data.frame(connections)
  return(connections)
}
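For comparison, this is roughly the same thing written with the base parallel package instead of snowfall (a sketch only; DistConect2c is my own name for it, and it is not included in the timings below):
DistConect2c <- function(Raster, Distance, cpus = 2){
  h16 <- transition(Raster, transitionFunction = function(x) {1}, 16, symm = FALSE)
  h16 <- geoCorrection(h16, scl = FALSE)
  B <- xyFromCell(Raster, cell = 1:ncell(Raster))
  all.cells <- function(i){
    temp  <- values(accCost(h16, B[i, ]))   # accumulated costs from cell i
    index <- which(temp <= Distance)        # reachable destination cells
    cbind(from = i, to = index, dist = temp[index])
  }
  cl <- parallel::makeCluster(cpus)
  on.exit(parallel::stopCluster(cl))
  # Workers need gdistance attached and the shared objects exported
  parallel::clusterEvalQ(cl, library(gdistance))
  parallel::clusterExport(cl, c("h16", "B", "Distance"), envir = environment())
  connections <- parallel::parLapply(cl, 1:nrow(B), all.cells)
  as.data.frame(do.call("rbind", connections))
}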
Results of microbenchmark
microbenchmark::microbenchmark(DistConect1(r, Distance = 1000000), DistConect2(r, Distance = 1000000), DistConect2b(r, Distance = 1000000, cpus = 2), times = 4)
                                          expr       min        lq     mean   median       uq
              DistConect1(r, Distance = 1e+06) 10.145234 10.216611 10.35301 10.36512 10.48942
              DistConect2(r, Distance = 1e+06)  9.963549  9.974315 10.01547 10.01173 10.05662
   DistConect2b(r, Distance = 1e+06, cpus = 2) 11.311966 11.486705 12.02240 11.81034 12.55809
More trials after the answer
After the great answer I got, I tried to go a step further and used lapply to replace the remaining for loop:
DistConect4 <- function(Raster, Distance){
  # First we make a transition layer with the function transition from gdistance
  h16 <- transition(Raster, transitionFunction = function(x){1}, 16, symm = FALSE)
  # Then geocorrect for projection
  h16 <- geoCorrection(h16, scl = FALSE)
  # Since transition layers work with XY rather than IDs, get a matrix of XY coordinates
  B <- xyFromCell(Raster, cell = 1:ncell(Raster))
  # All rows for one origin cell, without the inner loop
  all.cells <- function(i){
    temp <- accCost2(h16, B[i, ])
    index <- which(temp < Distance)
    connections <- cbind(i, index, temp[index])
    return(connections)
  }
  connections <- lapply(1:nrow(B), all.cells)
  # Put everything together as a large data frame
  connections <- as.data.frame(do.call("rbind", connections))
  colnames(connections) <- c("from", "to", "dist")
  # Return connections
  return(connections)
}
This uses the accCost2 function defined below (it relies on the Matrix and igraph packages in addition to gdistance):
accCost2 <- function(x, fromCoords) {
  # Essentially the internals of gdistance::accCost, but returns a numeric vector
  # of accumulated costs (in cell order) instead of building an output raster
  fromCells <- cellFromXY(x, fromCoords)
  tr <- transitionMatrix(x)
  tr <- rBind(tr, rep(0, nrow(tr)))
  tr <- cBind(tr, rep(0, nrow(tr)))
  startNode <- nrow(tr)
  adjP <- cbind(rep(startNode, times = length(fromCells)), fromCells)
  tr[adjP] <- Inf
  adjacencyGraph <- graph.adjacency(tr, mode = "directed", weighted = TRUE)
  E(adjacencyGraph)$weight <- 1/E(adjacencyGraph)$weight
  return(shortest.paths(adjacencyGraph, v = startNode, mode = "out")[-startNode])
}
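As a quick sanity check of my own (assuming h16 and B exist in the workspace, e.g. from running the first lines of the functions above), accCost2 should reproduce accCost for a single origin cell:
a1 <- as.vector(values(accCost(h16, B[1, ])))  # accumulated costs as a raster, then a vector
a2 <- accCost2(h16, B[1, ])                    # accumulated costs directly as a vector
all.equal(a1, a2)                              # I expect TRUE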
But when I ran the following comparison (DistConect3 is the function from the answer I received)
timing <- microbenchmark::microbenchmark(DistConect1(r, Distance = 1000000), DistConect2(r, Distance = 1000000), DistConect2b(r, Distance = 1000000, cpus = 4), DistConect3(r, Distance = 1000000), DistConect4(r, Distance = 1000000), times = 20)
print(timing, unit = "relative")
DistConect4 turned out no faster than DistConect3:
                                          expr       min       lq      mean    median        uq
              DistConect1(r, Distance = 1e+06) 12.400299 12.43078 12.407909 12.452043 12.502665
              DistConect2(r, Distance = 1e+06) 12.238812 12.23194 12.168468 12.191067 12.155786
   DistConect2b(r, Distance = 1e+06, cpus = 4) 13.994594 14.01760 13.909674 13.978060 13.947062
              DistConect3(r, Distance = 1e+06)  1.000000  1.00000  1.000000  1.000000  1.000000
              DistConect4(r, Distance = 1e+06)  0.997329  1.00141  1.019697  1.002112  1.005626
I thought lapply was always faster than a for loop. Any idea why this didn't make the process any faster?
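In case it helps to diagnose where the time actually goes, this is how I would profile one run (a sketch with base Rprof/summaryRprof):
Rprof("distconect4.out")                        # start sampling the call stack
invisible(DistConect4(r, Distance = 1000000))
Rprof(NULL)                                     # stop profiling
head(summaryRprof("distconect4.out")$by.total)  # which calls dominate the run time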