Fixing bottleneck in function returning dataframe of distances between raster cells in R

Question

I am trying to fix a bottleneck of a nested for loop in a function. I tried three functions already, and can't seem to fix it. Any help specially if it is a data.table or rcpp solution is highly appreciated. This is an example with a raster of 100 cells, but I have several of more than 1,000,000 cells so speed is of the essence.

Example

Consider the following raster:

library(raster)


r   <- raster(nrows=10,ncols=10,xmn=-10,ymn=-10,xmx=10,ymx=10)
r[] <- rep(1, ncell(r))

extent(r) <- c(-10, 10, -10, 10)

This is a small raster, with only a hundred cells. I made the following function to study animal movement, where I use the Raster and a maximum distance animals can move to in a time slice. There is a bottleneck in a nested loop that I haven't been able to solve.

What I get in return is a dataframe with the following variables:

from The ID of the cell in the raster from which the animal can move from

to The ID of the cell in the raster from which the animal can move to

Dist The distance from cell from to cell to

load necessary libraries

library(gdistance)
library(dplyr)
library(tidyr)

DistConect1 <- function(Raster, Distance){
      #First we make a transition layer with the function transition from gdistance
      h16  <- transition(Raster, transitionFunction=function(x){1},16,symm=FALSE)
      #Then geocorrect for projection
      h16   <- geoCorrection(h16, scl=FALSE)
      #Since transition layers work with XY rather than IDs, get a matrix of XY coordinates
      B <- xyFromCell(Raster, cell = 1:ncell(Raster))
      #This nested loop is where the Bottle neck is
      #Start a list
      connections <- list()
      #For each pair of cells in B
      for (i in 1:nrow(B)){
        arcs <- list()
        #Create a temporal raster for each row with the distance from cell xy to all other cells
        temp <- accCost(h16,B[i,])
        #Make all cells with distance to origin larger than maximum dispersal distance equal NA
        temp[values(temp) > Distance] = NA
        #Create a vector with only the ID raster values of cells that are not NA (To reduce the next loop)
        index <- c(1:ncell(temp))[!is.na(values(temp))]
        for (j in index){
          #For each cell pair i,j generate a vector i,j, distance
          arcs[[j]] <- c(i, j, temp[j])
        }
        #Gather all vectors in a data frame
        connections[[i]] <- do.call("rbind", arcs)
        #name columns
        colnames(connections[[i]]) <- c("from", "to", "dist")
        #This is just to see where I am in the function
        print(paste(i, "of", nrow(B)))
      }
      #Get everything together as a large data frame
      connections <- do.call("rbind", connections)
      connections <- as.data.frame(connections)
      #return connections 
      return(connections)
      }

Tried to fix it with a lapply

But I got rid of only one of the loops

DistConect2 <- function(Raster, Distance){
  #First we make a transition layer with the function transition from gdistance
  h16  <- transition(Raster, transitionFunction=function(x){1},16,symm=FALSE)
  #Then geocorrect for projection
  h16   <- geoCorrection(h16, scl=FALSE)
  #Since transition layers work with XY rather than IDs, get a matrix of XY coordinates
  B <- xyFromCell(Raster, cell = 1:ncell(Raster))
  #This nested loop is where the Bottle neck is

  all.cells <- function(i){
    arcs <- list()
    temp <- accCost(h16, B[i, ])
    temp[values(temp) > Distance] = NA
    index <- c(1:ncell(temp))[!is.na(values(temp))]
    # all.index <- function(j){
    for (j in index) {
      arcs[[j]] <- c(i, j, temp[j])
    }
    # arcs <- lapply(index, all.index)
    connections <- do.call("rbind", arcs)
    # connections <- do.call("rbind", arcs)
    colnames(connections) <- c("from", "to", "dist")
    return(connections)
   }

  connections <- lapply(1:nrow(B), all.cells)
  #For each pair of cells in B
  #Get everything together as a large data frame
  connections <- do.call("rbind", connections)
  connections <- as.data.frame(connections)
  #return connections 
  return(connections)
}

but using microbenchmarck package to test differences shows no difference at all:

microbenchmark::microbenchmark(DistConect1(r, Distance = 1000000), DistConect2(r, Distance = 1000000), times = 4)

Result of microbenchmark

DistConect1(r, Distance = 1e+06) 10.283309 10.40662 10.55879 10.58380 10.71097 10.78428     4
DistConect2(r, Distance = 1e+06)  9.892371 10.07783 10.35453 10.41144 10.63124 10.70288     4

cld a a

Parallelization

I also tried parallelization but it actually takes longer:

DistConect2b <- function (Raster, Distance, cpus = NULL) 
{
  h16 <- transition(Raster, transitionFunction = function(x) {1}, 16, symm = FALSE)
  h16 <- geoCorrection(h16, scl = FALSE)
  B <- xyFromCell(Raster, cell = 1:ncell(Raster))

  all.cells <- function(i){
    arcs <- list()
    temp <- accCost(h16, B[i, ])
    temp[values(temp) > Distance] = NA
    index <- c(1:ncell(temp))[!is.na(values(temp))]
    # all.index <- function(j){
     for (j in index) {
      arcs[[j]] <- c(i, j, temp[j])
    }
    # arcs <- lapply(index, all.index)
    connections <- do.call("rbind", arcs)

    colnames(connections) <- c("from", "to", "dist")
    return(connections)
    # cat(paste(i, "of", nrow(B)))
  }
  require(snowfall)
  sfInit(parallel=TRUE, cpus=cpus)
  sfLibrary(gdistance)
  sep.connections <- sfClusterApplyLB(1:nrow(B), all.cells)
  sfStop(nostop=FALSE)
  # sep.connections <- lapply(1:nrow(B), all.cells)
  connections <- do.call("rbind", sep.connections)
  connections <- as.data.frame(connections)

}

Results microbenchmark

microbenchmark::microbenchmark(DistConect1(r, Distance = 1000000), DistConect2(r, Distance = 1000000),  DistConect2b(r, Distance = 1000000, cpus = 2), times = 4)


                            expr       min        lq     mean   median       uq
             DistConect1(r, Distance = 1e+06) 10.145234 10.216611 10.35301 10.36512 10.48942
             DistConect2(r, Distance = 1e+06)  9.963549  9.974315 10.01547 10.01173 10.05662
 DistConnect2b(r, Distance = 1e+06, cpus = 2) 11.311966 11.486705 12.02240 11.81034 12.55809

More trials after the answer

After the great answer I got, I tried to go a step further and added a lapply to replace the for loop in the code:

DistConect4 <- function(Raster, Distance){
  #First we make a transition layer with the function transition from gdistance
  h16  <- transition(Raster, transitionFunction=function(x){1},16,symm=FALSE)
   #Then geocorrect for projection
  h16   <- geoCorrection(h16, scl=FALSE)
  #Since transition layers work with XY rather than IDs, get a matrix of XY coordinates
  B <- xyFromCell(Raster, cell = 1:ncell(Raster))
  #This nested loop is where the Bottle neck is

  all.cells <- function(i){

    temp <- accCost2(h16, B[i, ])
    index <- which(temp < 1000000)  # all.index <- function(j){
    connections <- cbind(i, index, temp[index])

    return(connections)

  }

  connections <- lapply(1:nrow(B), all.cells)

  connections <- as.data.frame(do.call("rbind", connections))
  #Get everything together as a large data frame
  colnames(connections) <- c("from", "to", "dist")
  #return connections 
  return(connections)
}

Using the acccost2 function defined below

accCost2 <- function(x, fromCoords) {

  fromCells <- cellFromXY(x, fromCoords)
  tr <- transitionMatrix(x)
  tr <- rBind(tr, rep(0, nrow(tr)))
  tr <- cBind(tr, rep(0, nrow(tr)))
  startNode <- nrow(tr)
  adjP <- cbind(rep(startNode, times = length(fromCells)), fromCells)
  tr[adjP] <- Inf
  adjacencyGraph <- graph.adjacency(tr, mode = "directed", weighted = TRUE)
  E(adjacencyGraph)$weight <- 1/E(adjacencyGraph)$weight
  return(shortest.paths(adjacencyGraph, v = startNode, mode = "out")[-startNode])
}

But when I tried

timing <- microbenchmark::microbenchmark(DistConect1(r, Distance = 1000000), DistConect2(r, Distance = 1000000),  DistConnect2b(r, Distance = 1000000, cpus = 4), DistConect3(r, Distance = 1000000), DistConect4(r, Distance = 1000000) ,times = 20)

print(timing, unit = "relative")

it didn't make the process faster

                                         expr       min       lq      mean        median        uq
             DistConect1(r, Distance = 1e+06) 12.400299 12.43078 12.407909 12.452043 12.502665
             DistConect2(r, Distance = 1e+06) 12.238812 12.23194 12.168468 12.191067 12.155786
 DistConnect2b(r, Distance = 1e+06, cpus = 4) 13.994594 14.01760 13.909674 13.978060 13.947062
             DistConect3(r, Distance = 1e+06)  1.000000  1.00000  1.000000  1.000000  1.000000
             DistConect4(r, Distance = 1e+06)  0.997329  1.00141  1.019697  1.002112  1.005626

I thought apply was always faster than for, any Idea why this dind't make the process any faster?

My suggestion: Edit your question to focus only on vectorizing `all.cells` function and `connections <- lapply(1:nrow(B), all.cells)`. You have multiple `do.call("rbind"..)`, both in the function and in combining the output of the function, but `do.call("rbind"...)` is actually quite slow. There are `tidyverse` approaches to speed things up, but I can't quite figure what you're trying to do in `all.cells`. — CPak, Jul 17 '17 at 20:26

score 3 · Accepted Answer · answered Jul 17 '17 at 22:02

You can get rid of the inner loop by changing

temp[values(temp) > Distance] = NA
index <- c(1:ncell(temp))[!is.na(values(temp))]
for (j in index){
  arcs[[j]] <- c(i, j, temp[j])
}
connections[[i]] <- do.call("rbind", arcs)
colnames(connections[[i]]) <- c("from", "to", "dist")

into this:

index <- which(temp < Distance)
connections[[i]] <- cbind(i, index, temp[index])

I also looked into accCost, which seems to be the slowest function here. Unfortunately, it calls some C code for finding the shortest paths which probably means there is little to optimize. I created accCost2 where I stripped some of the code but I doubt it matters much. I'm also not sure how effective parallelization will be here, since the runtime isn't that long. Below some benchmarks showing decent improvement.

library(gdistance)
library(dplyr)
library(tidyr)
library(raster)

r   <- raster(nrows=10,ncols=10,xmn=-10,ymn=-10,xmx=10,ymx=10)
r[] <- rep(1, ncell(r))

extent(r) <- c(-10, 10, -10, 10)

DistConect1 <- function(Raster, Distance){
    #First we make a transition layer with the function transition from gdistance
    h16  <- transition(Raster, transitionFunction=function(x){1},16,symm=FALSE)
    #Then geocorrect for projection
    h16   <- geoCorrection(h16, scl=FALSE)
    #Since transition layers work with XY rather than IDs, get a matrix of XY coordinates
    B <- xyFromCell(Raster, cell = 1:ncell(Raster))
    #This nested loop is where the Bottle neck is
    #Start a list
    connections <- list()
    #For each pair of cells in B
    for (i in 1:nrow(B)){
        arcs <- list()
        #Create a temporal raster for each row with the distance from cell xy to all other cells
        temp <- accCost(h16,B[i,])
        #Make all cells with distance to origin larger than maximum dispersal distance equal NA
        temp[values(temp) > Distance] = NA
        #Create a vector with only the ID raster values of cells that are not NA (To reduce the next loop)
        index <- c(1:ncell(temp))[!is.na(values(temp))]
        for (j in index){
            #For each cell pair i,j generate a vector i,j, distance
            arcs[[j]] <- c(i, j, temp[j])
        }
        #Gather all vectors in a data frame
        connections[[i]] <- do.call("rbind", arcs)
        #name columns
        colnames(connections[[i]]) <- c("from", "to", "dist")
        #This is just to see where I am in the function
        # print(paste(i, "of", nrow(B)))
    }
    #Get everything together as a large data frame
    connections <- do.call("rbind", connections)
    connections <- as.data.frame(connections)
    #return connections 
    return(connections)
}

DistConect3 <- function(Raster, Distance){
    #First we make a transition layer with the function transition from gdistance
    h16  <- transition(Raster, transitionFunction=function(x){1},16,symm=FALSE)
    #Then geocorrect for projection
    h16   <- geoCorrection(h16, scl=FALSE)
    #Since transition layers work with XY rather than IDs, get a matrix of XY coordinates
    B <- xyFromCell(Raster, cell = 1:ncell(Raster))
    #This nested loop is where the Bottle neck is
    #Start a list
    connections <- list()
    #For each pair of cells in B
    for (i in 1:nrow(B)){
        #Create a temporal raster for each row with the distance from cell xy to all other cells
        temp <- accCost2(h16,B[i,])
        index <- which(temp < Distance)
        connections[[i]] <- cbind(i, index, temp[index])
    }
    #Get everything together as a large data frame
    connections <- do.call("rbind", connections)
    connections <- as.data.frame(connections)
    colnames(connections) <- c("from", "to", "dist")
    #return connections 
    return(connections)
}

accCost2 <- function(x, fromCoords) {

    fromCells <- cellFromXY(x, fromCoords)
    tr <- transitionMatrix(x)
    tr <- rBind(tr, rep(0, nrow(tr)))
    tr <- cBind(tr, rep(0, nrow(tr)))
    startNode <- nrow(tr)
    adjP <- cbind(rep(startNode, times = length(fromCells)), fromCells)
    tr[adjP] <- Inf
    adjacencyGraph <- graph.adjacency(tr, mode = "directed", weighted = TRUE)
    E(adjacencyGraph)$weight <- 1/E(adjacencyGraph)$weight
    return(shortest.paths(adjacencyGraph, v = startNode, mode = "out")[-startNode])
}


d1 <- DistConect1(r, Distance = 1000)
d3 <- DistConect3(r, Distance = 1000)

# test float equality
all.equal(d1, d3, check.attributes = FALSE)
# TRUE

timing1 <- microbenchmark(
    DistConect1(r, Distance = 1000),
    DistConect3(r, Distance = 1000),
    times = 10
)
print(timing1, unit = "relative")
#   expr                            min      lq       mean     median   uq       max      neval cld
# 1 DistConect1(r, Distance = 1000) 2.077804 1.991303 1.881478 1.933114 1.951884 1.531302 10     b
# 2 DistConect3(r, Distance = 1000) 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 10    a

timing2 <- microbenchmark(
    DistConect1(r, Distance = 10000),
    DistConect3(r, Distance = 10000),
    times = 10
)
print(timing2, unit = "relative")
# expr                             min      lq       mean     median   uq       max         neval cld
# DistConect1(r, Distance = 10000) 2.018707 1.936773 1.966994 1.956694 1.964021 2.094569    10     b
# DistConect3(r, Distance = 10000) 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000    10    a

Thanks a lot @Vanderman, this went a lot faster, it when I used distances of 1000000, it acutally went 12 times faster than the original code, I am tried to make it faster by replacing the for by an apply, and it didnt go any faster, can you check that in my edit above? — Derek Corcoran, Jul 18 '17 at 12:23
Hi @DerekCorcoran, happy to hear it helped! Actually, the `apply` functions aren't really faster, they only exist to make code more readable. See for example [this link](https://stackoverflow.com/questions/2275896/is-rs-apply-family-more-than-syntactic-sugar?noredirect=1&lq=1). Aside from porting the code to to C/C++, I don't see much more room for optimization here. — Vandenman, Jul 18 '17 at 13:03