R: Optimizing "Overlap Calculations"

Question

Using R, I tried to create these two random shapefiles that represent two sets of "zones" within some city:

set.seed(123)
# Load the ggplot2 and sf packages
library(ggplot2)
library(sf)

# Set the longitude and latitude for New York
lon_1 <- -74.0060
lat_1 <- 40.7128

# Set the longitude and latitude for a second location
lon_2 <- -74.037
lat_2 <- 40.733

# Set the size of the square
size <- 0.1

# Create a square centered on New York
square_1 <- st_sfc(st_polygon(list(rbind(
    c(lon_1 - size/2, lat_1 - size/2),
    c(lon_1 + size/2, lat_1 - size/2),
    c(lon_1 + size/2, lat_1 + size/2),
    c(lon_1 - size/2, lat_1 + size/2),
    c(lon_1 - size/2, lat_1 - size/2)
))))

# Create a square centered on New York
square_2 <- st_sfc(st_polygon(list(rbind(
    c(lon_2 - size/2, lat_2 - size/2),
    c(lon_2 + size/2, lat_2 - size/2),
    c(lon_2 + size/2, lat_2 + size/2),
    c(lon_2 - size/2, lat_2 + size/2),
    c(lon_2 - size/2, lat_2 - size/2)
))))



# Set the number of mini rectangles
n <- 4

# Create random mini rectangles within the square
mini_rectangles_1 <- vector("list", n)
for (i in seq_len(n)) {
    x1 <- runif(1, lon_1 - size/2, lon_1)
    x2 <- runif(1, lon_1, lon_1 + size/2)
    y1 <- runif(1, lat_1 - size/2, lat_1)
    y2 <- runif(1, lat_1, lat_1 + size/2)
    mini_rectangles_1[[i]] <- st_polygon(list(rbind(
        c(x1, y1),
        c(x2, y1),
        c(x2,y2),
        c(x1,y2),
        c(x1,y1)
    )))
}
mini_rectangles_1 <- st_sfc(mini_rectangles_1)


mini_rectangles_2 <- vector("list", n)
for (i in seq_len(n)) {
    x1 <- runif(1, lon_2 - size/2, lon_2)
    x2 <- runif(1, lon_2, lon_2 + size/2)
    y1 <- runif(1,lat_2 - size/2,lat_2)
    y2 <- runif(1,lat_2,lat_2 + size/2)
    mini_rectangles_2[[i]] <- st_polygon(list(rbind(
        c(x1,y1),
        c(x2,y1),
        c(x2,y2),
        c(x1,y2),
        c(x1,y1)
    )))
}


mini_rectangles_2 <- st_sfc(mini_rectangles_2)

I then tried to visualize the results:

# Plot the squares and the mini rectangles
ggplot() +
    geom_sf(data = st_sf(square_1), fill = "white", color = "black") +
    geom_sf(data = st_sf(square_2), fill = "white", color = "black") +
    geom_sf(data = st_sf(mini_rectangles_1), fill = "red", color = "red", alpha = 0.5) +
    geom_sf(data = st_sf(mini_rectangles_2), fill = "blue", color = "blue", alpha = 0.5) +
    coord_sf(xlim = c(min(lon_1 - size*0.6, lon_2 - size*0.6), max(lon_1 + size*0.6, lon_2 + size*0.6)), ylim = c(min(lat_1 - size*0.6, lat_2 - size*0.6), max(lat_1 + size*0.6, lat_2 + size*0.6)))

My Question: I am now interested in determining what percent of each red square is covered by each blue square and what percent of each blue square is covered by each red square

I tried to find some similar questions e.g. How to compute all pairwise interaction between polygons and the the percentage coverage in R with sf? - based on these references, I then tried to modify the code to suit my requirements:

# Create a 4x4 matrix to store coverage percentages
n = 4
coverage_matrix <- matrix(0,n,n)

# Calculate coverage percentages for each pair of red and blue rectangles
for (i in seq_len(n)) {
    for (j in seq_len(n)) {
        intersection_area <- st_area(st_intersection(mini_rectangles_1[[i]], mini_rectangles_2[[j]]))
        red_area <- st_area(mini_rectangles_1[[i]])
        coverage_matrix[i,j] <- 100 * intersection_area / red_area
    }
}

# Print coverage matrix
print(coverage_matrix)


rownames(coverage_matrix) <- paste0("Red ", seq_len(n))
colnames(coverage_matrix) <- paste0("Blue ", seq_len(n))

# Print the modified coverage matrix
print(coverage_matrix)

The final result looks something like this:

        Blue 1   Blue 2    Blue 3    Blue 4
Red 1  0.00000  0.00000  1.696167 16.371502
Red 2  0.00000  0.00000 23.741099 26.089504
Red 3 81.49752 19.38099 21.055805 47.402298
Red 4 28.27181  0.00000  0.000000  3.646122

Assuming I have done this correctly - I am trying to figure out a way to improve the speed of this code. In real life, I have multiple blue squares and multiple red squares (e.g. over 1000) and the code I have written is taking a very long time to run. As such, I am looking for ways to improve the efficiency and run-time of my code.

Can someone please show me how to do this?

References:

How to compute all pairwise interaction between polygons and the the percentage coverage in R with sf?

Mikko Marttila · Accepted Answer · 2023-05-27T12:20:23.623

1

A little helper function is in order to find the areas of the intersecting regions and return them in a matrix. st_intersection() is vectorised to find the intersection geometries, but the result doesn’t maintain information about which of the input geometries form the resulting intersection, so we get that from st_intersects():

st_intersection_area <- function(x, y) {
  area <- st_area(st_intersection(x, y))
  intersects <- as.matrix(st_intersects(x, y))
  replace(intersects, which(intersects), area)
}

Now the question can be solved as follows:

area_r <- st_area(mini_rectangles_1)
area_b <- st_area(mini_rectangles_2)

st_intersection_area(mini_rectangles_1, mini_rectangles_2) / area_r
#>            [,1]      [,2]      [,3]       [,4]
#> [1,] 0.06997851 0.2976404 0.3385203 0.05587322
#> [2,] 0.00000000 0.6214937 0.6921818 0.00000000
#> [3,] 0.00000000 0.4742746 0.5933578 0.00000000
#> [4,] 0.00000000 0.2101857 0.2431918 0.00000000
st_intersection_area(mini_rectangles_2, mini_rectangles_1) / area_b
#>           [,1]       [,2]      [,3]      [,4]
#> [1,] 0.1195645 0.00000000 0.0000000 0.0000000
#> [2,] 0.6054833 0.08194228 0.1960124 0.3108706
#> [3,] 0.6773595 0.08976680 0.2412096 0.3537933
#> [4,] 0.1533763 0.00000000 0.0000000 0.0000000

To add progress indicators or to parallelize, use a loop and batch the inputs.

batch <- function(x, size) {
  (seq_along(x) - 1) %/% size
}

rect_1 <- make_rect(n = 400)
rect_2 <- make_rect(n = 400)

rect_2_batch <- split(rect_2, batch(rect_2, size = 100))
system.time({
  area_i <- do.call(cbind, purrr::map(rect_2_batch, function(x) {
    st_intersection_area(rect_1, x)
  }, .progress = "Calculating intersection area..."))
})
#> Calculating intersection area... ■■■■■■■■■ 25% | ETA: …Calculating intersection
#> area... ■■■■■■■■■■■■■■■■ 50% | ETA: …Calculating intersection area...
#> ■■■■■■■■■■■■■■■■■■■■■■■ 75% | ETA: …
#>    user  system elapsed 
#>    4.41    0.07    4.50

(Note that this is now very similar to some of your previous approaches, but the additional use of st_intersects() makes sure we get the correct results when there are empty intersections.)

edited May 27 '23 at 12:20

answered May 26 '23 at 23:15

Mikko Marttila

10,972
18
31

@ Mikko Marttila: Thank you so much for your answer! Do you think your answer will run faster than mine for larger datasets? – stats_noob May 26 '23 at 23:16
@stats_noob I tested your example with n = 100 and ended up with 9.2 s using the loop in your question and 0.3 s using my answer. So yes, I'd expect this to scale quite a bit better. – Mikko Marttila May 26 '23 at 23:37
@ Mikko: Thank you so much! the code you wrote: st_intersection_area(mini_rectangles_1, mini_rectangles_2) / area_r .... seems to be giving the same results as mine! – stats_noob May 26 '23 at 23:56
@ Mikko: I timed the differences in both approaches on my own computer. here are the results: Your Approach: Time difference of 0.2654738 secs .... My Approach: Time difference of 0.004674911 secs – stats_noob May 27 '23 at 00:24
@ Mikko Marttila: is it somehow possible to add a "print" statement to this code to see how much progress has been made? – stats_noob May 27 '23 at 01:55
Not directly. You could batch the inputs to add a loop in which you could print progress steps, but that will cost you a bit of performance due to the extra function calls. In this case that's probably warranted for a long running task though, as if done sparingly the printouts can clarify what's going on even if not printed very often. That's basically your Approach 1, but with correct output. – Mikko Marttila May 27 '23 at 10:03
@ Mikko Marttila: Thank you so much for your update! Currently, I am trying to figure out if there is a way to add row names and column names to the matrix produced in your answer – stats_noob May 27 '23 at 15:50
Just to clarify: # red square covering blue square st_intersection_area(mini_rectangles_1, mini_rectangles_2) / area_r – stats_noob May 27 '23 at 15:55
#blue square covering red square st_intersection_area(mini_rectangles_2, mini_rectangles_1) / area_b – stats_noob May 27 '23 at 15:55
You can of course set dimension names like with any other matrix. Looks like you got the proportion interpretations mixed up. – Mikko Marttila May 27 '23 at 19:42
Thank you so much! Can you please show me how to correct this just to make sure? – stats_noob May 27 '23 at 20:42
If you divide the intersection area by the area of the red square, you get the proportion of the red square covered by (not _covering_) the blue square, and vice versa. – Mikko Marttila May 27 '23 at 20:50

stats_noob · Answer 2 · 2023-05-27T15:49:05.857

Here are some different approaches I tried - however, I am not sure if these are really helping?:

Note: Some of these approaches produce numbers that are quite wrong!

Approach 1: Single Loop

coverage_matrix <- matrix(0, n, n)

for (i in seq_len(n)) {
  intersection_areas <- st_area(st_intersection(mini_rectangles_1[i], mini_rectangles_2))
  red_areas <- st_area(mini_rectangles_1[i])
  coverage_matrix[i, ] <- 100 * intersection_areas / red_areas
}


rownames(coverage_matrix) <- paste0("Red ", seq_len(n))
colnames(coverage_matrix) <- paste0("Blue ", seq_len(n))

print(coverage_matrix)

Approach 2: mcapply

library(parallel)

coverage_matrix <- matrix(0, nrow = n, ncol = n)

calculate_coverage <- function(i) {
    intersection_areas <- mclapply(seq_len(n), function(j) {
        st_area(st_intersection(mini_rectangles_1[[i]], mini_rectangles_2[[j]]))
    })
    red_areas <- st_area(mini_rectangles_1[[i]])
    100 * sapply(intersection_areas, sum) / red_areas
}

# do I need an mc.cores statement here?
coverage_list <- mclapply(seq_len(n), calculate_coverage)

coverage_matrix <- matrix(unlist(coverage_list), nrow = n, ncol = n, byrow = TRUE)


rownames(coverage_matrix) <- paste0("Red ", seq_len(n))
colnames(coverage_matrix) <- paste0("Blue ", seq_len(n))


print(coverage_matrix)

Approach 3: lapply

coverage_matrix <- matrix(0, n, n)

calculate_coverage <- function(i) {
  intersection_areas <- st_area(st_intersection(mini_rectangles_1[i], mini_rectangles_2))
  red_areas <- st_area(mini_rectangles_1[i])
  100 * intersection_areas / red_areas
}

coverage_list <- lapply(seq_len(n), calculate_coverage)

coverage_matrix <- do.call(rbind, coverage_list)



rownames(coverage_matrix) <- paste0("Red ", seq_len(n))
colnames(coverage_matrix) <- paste0("Blue ", seq_len(n))

print(coverage_matrix)

Approach 4: dopar

library(foreach)
library(doParallel)

num_cores <- parallel::detectCores()

cl <- makeCluster(num_cores)
registerDoParallel(cl)

clusterEvalQ(cl, {
    library(sf)
})

coverage_matrix <- matrix(0, n, n)

calculate_coverage <- function(i) {
    library(sf)  
    intersection_areas <- sf::st_area(sf::st_intersection(mini_rectangles_1[i], mini_rectangles_2))
    red_areas <- sf::st_area(mini_rectangles_1[i])
    100 * intersection_areas / red_areas
}

coverage_list <- foreach(i = seq_len(n), .combine = rbind) %dopar% {
    calculate_coverage(i)
}

stopCluster(cl)

coverage_matrix[] <- coverage_list

rownames(coverage_matrix) <- paste0("Red ", seq_len(n))
colnames(coverage_matrix) <- paste0("Blue ", seq_len(n))

print(coverage_matrix)

Approach 5: Functional Approach (numbers look really wrong)

library(sf)

calculate_coverage_vectorized <- function(mini_rectangles_1, mini_rectangles_2) {


  intersection_areas <- sf::st_area(sf::st_intersection(mini_rectangles_1, mini_rectangles_2))
  
  intersection_areas[is.na(intersection_areas)] <- 0
  

  red_areas <- sf::st_area(mini_rectangles_1)
  
  # Calculate the coverage percentages
  coverage_percentages <- 100 * intersection_areas / red_areas
  
  return(coverage_percentages)
}

coverage_list <- calculate_coverage_vectorized(mini_rectangles_1, mini_rectangles_2)

coverage_matrix[] <- coverage_list

rownames(coverage_matrix) <- paste0("Red ", seq_len(n))
colnames(coverage_matrix) <- paste0("Blue ", seq_len(n))

print(coverage_matrix)

References:

As `st_intersection()` returns only non-empty intersection geometries, most of these approaches produce an incorrect result. — Mikko Marttila, May 27 '23 at 10:14
@ Mikko Marttila: Thank you for your reply! Yes, I agree! I have noticed that the numbers in some of these approaches seem to be quite wrong :( — stats_noob, May 27 '23 at 15:48

R: Optimizing "Overlap Calculations"

2 Answers2