3

What is the most efficient way to calculate max distance between a set of coordinates by group in R?

Sample data: I have data like this, but rather than x10000 (which is for the example) the data I have has more like 25 million entries.

library(data.table)
# Build a 10,000-row example: random points with latitude in [0, 90] and
# longitude in [0, 180] at 0.001-degree resolution, split into 10 groups.
data <- data.table(
  latitude  = sample(seq(0, 90, by = 0.001), 10000, replace = TRUE),
  longitude = sample(seq(0, 180, by = 0.001), 10000, replace = TRUE)
)
# One group per 1000 rows (10 groups here), assigned at random.
groupn <- nrow(data) / 1000
data$group <- sample(seq_len(groupn), 10000, replace = TRUE)

My current method is pretty slow:

# Slow baseline: per group, build an sf point layer (WGS84) and take the max
# of the full pairwise great-circle distance matrix — O(n^2) work per group.
data <- data[order(data$group), ]
library(dplyr)
library(sf)
library(foreach)
# Iterate over the groups actually present rather than hard-coding 1:10,
# so the loop stays correct if groupn changes.
distlist <- foreach(i = sort(unique(data$group))) %do% {
  tempsf <- st_as_sf(filter(data, group == i),
                     coords = c("longitude", "latitude"), crs = 4326)
  max(st_distance(tempsf, tempsf))
}

Can some genius out there help me speed this up?

Neal Barsch
  • 2,810
  • 2
  • 13
  • 39
  • Define your question more precisely. Are you looking for a maximal distance between two points in the set or maximal distance between two points that belong to two different sets? – mentallurg May 29 '18 at 06:48

3 Answers

2

Try this:

Euclidean dist:

> system.time(out1 <- tapply(1:nrow(data), data$group, function(x) max(dist(data[x, 1:2]))))
   user  system elapsed 
   0.14    0.00    0.14 
> out1
   1        2        3        4        5        6        7        8        9       10 
199.2716 197.1172 194.7018 197.2652 196.3747 197.6728 194.7344 197.8781 195.3837 195.0123 

WGS84:

> auxF <- function(x) {
+   require(sp)
+   
+   tempsf <- data[x, 1:2]
+   coordinates(tempsf) <- c("longitude", "latitude")
+   proj4string(tempsf) = "+proj=longlat +ellps=WGS84 +no_defs"
+   return(max(spDists(tempsf)))
+ }
> 
> system.time(out2 <- tapply(1:nrow(data), data$group, auxF))
   user  system elapsed 
   4.71    0.00    4.76 
> out2
   1        2        3        4        5        6        7        8        9       10 
19646.04 19217.48 19223.27 19543.99 19318.55 18856.65 19334.11 19679.45 18840.90 19460.14 

Haversine method:

> system.time(out3 <- tapply(1:nrow(data), data$group, function(x) max(distm(as.matrix(data[x,.(longitude,latitude)], fun=distHaversine)))))
   user  system elapsed 
  13.24    0.01   13.30 
> out3
   1        2        3        4        5        6        7        8        9       10 
19644749 19216989 19223012 19542956 19317958 18856273 19333424 19677917 18840641 19459353 

For 7 million records you can assume a Euclidean distance, or project your points to a plane so that you can work with Euclidean distances. The key observation is that the maximum distance in a group must occur between two points on the convex hull of that group, so computing the hull first greatly reduces the number of pairwise distances and keeps RAM usage low:

> system.time(out4 <- tapply(1:nrow(data), data$group, function(x) max(dist(data[x, 1:2][chull(data[x, 1:2]), ]))))
   user  system elapsed 
   0.03    0.00    0.03 
> out4
       1        2        3        4        5        6        7        8        9       10 
199.2716 197.1172 194.7018 197.2652 196.3747 197.6728 194.7344 197.8781 195.3837 195.0123 

With big data:

> data <- data.table(latitude=sample(seq(0,90,by=0.001), 7000000, replace = TRUE),
+                    longitude=sample(seq(0,180,by=0.001), 7000000, replace = TRUE))
> groupn <- nrow(data)/700000
> data$group <- sample(seq(1,groupn,by=1),7000000,replace=T)
> 
> system.time(out1 <- tapply(1:nrow(data), data$group, function(x) max(dist(data[x, 1:2]))))
Error: cannot allocate vector of size 1824.9 Gb
Called from: dist(data[x, 1:2])
Browse[1]> 
Timing stopped at: 7.81 0.06 7.91
> system.time(out4 <- tapply(1:nrow(data), data$group, function(x) max(dist(data[x, 1:2][chull(data[x, 1:2]), ]))))
   user  system elapsed 
   8.41    0.22    8.64 
  • I like where you are going with this, but it breaks my memory on 7 million coordinates with various groups. Error: vector memory exhausted (limit reached?), I could break it into chunks but any better ideas? – Neal Barsch May 31 '18 at 05:02
  • Also why the discrepancy in the values between the tapply and doing spDists? – Neal Barsch May 31 '18 at 05:38
  • ```library(geosphere);out1 <- tapply(1:nrow(data), data$group, function(x) max(distm(as.matrix(data[x,c("longitude","latitude")], fun=distHaversine))))``` – Neal Barsch May 31 '18 at 06:20
  • You have small differences because `spDists` works with the WGS84 ellipsoid, `dist` is a Euclidean distance, and `distHaversine` uses the haversine method, which assumes the earth is a sphere (by default with radius = 6378137 m). You must choose the method according to your needs, but keep in mind that the simplest methods are faster. – Juan Antonio Roldán Díaz May 31 '18 at 08:23
  • It's odd that an sf version of your auxF function is pretty slow (YOURS, IN SP, IS BETTER): ```auxSF <- function(x) { require(sf) tempsf <- data[x, 1:2] tempsf <- st_as_sf(tempsf, coords = c("longitude", "latitude"), crs = 4326) return(max(st_distance(tempsf))) } system.time(out2 <- tapply(1:nrow(data), data$group, auxSF))``` – Neal Barsch Jun 01 '18 at 09:24
  • What exactly is chull doing here and why does it speed it up so much? – Neal Barsch Jun 10 '18 at 01:02
  • 1
    The chull ([Convex Hull](https://en.wikipedia.org/wiki/Convex_hull)) is selecting the points that are at the edge of the data set (since the maximum distance must occur between the edge points). This is much faster because you only need to calculate the distance between the edge points, which is a very small subset. – Juan Antonio Roldán Díaz Jun 11 '18 at 07:24
  • I see. That chull function is brilliant. I was thinking this was super inefficient and also looking for a way to do this taking only the extents of a polygonize, chull is way better. Thanks! – Neal Barsch Jul 16 '18 at 02:27
  • Do you know how I could get the location in the data.table of the two farthest points from one another? – Neal Barsch Jul 25 '18 at 21:43
  • Try changing the function to something like this: `function(x) { h <- chull(data[x, 1:2]); idx <- x[h]; dd <- as.matrix(dist(data[idx, 1:2])); idx[which(dd == max(dd), arr.ind = TRUE)[, 1]] }` (note that the hull indices must be mapped back through `x` to get row numbers in the full table). – Juan Antonio Roldán Díaz Jul 30 '18 at 07:03
2

Thanks to Juan Antonio for the idea to use tapply. I ended up using the sp-based function you built; it is the fastest.

# Max pairwise WGS84 distance for the rows of `d` indexed by `x`.
# `x` is an integer index vector (as supplied by tapply); `d` defaults to the
# global `data` table for backward compatibility with the original call.
# NOTE(review): spDists() on long/lat data appears to return kilometres —
# confirm against the sp documentation before relying on units.
auxF <- function(x, d = data) {
  # library() errors if sp is missing; require() would only warn and
  # return FALSE, letting the function fail later with a confusing error.
  library(sp)
  tempsf <- d[x, 1:2]
  coordinates(tempsf) <- c("longitude", "latitude")
  proj4string(tempsf) <- "+proj=longlat +ellps=WGS84 +no_defs"
  max(spDists(tempsf))
}
out1 <- tapply(seq_len(nrow(data)), data$group, auxF)

This also works: dt.haversine that @SymbolixAU (awesome as usual) built:

# Vectorised haversine great-circle distance, in metres, between coordinate
# pairs given in decimal degrees. `r` is the sphere radius; the default is
# the WGS84 equatorial radius in metres.
dt.haversine <- function(lat_from, lon_from, lat_to, lon_to, r = 6378137){
  deg2rad <- pi / 180
  phi1 <- lat_from * deg2rad
  phi2 <- lat_to * deg2rad
  dphi <- phi2 - phi1
  dlambda <- (lon_to - lon_from) * deg2rad
  # Haversine formula: a is the squared half-chord length between the points.
  a <- sin(dphi / 2)^2 + cos(phi1) * cos(phi2) * sin(dlambda / 2)^2
  2 * atan2(sqrt(a), sqrt(1 - a)) * r
}
library(geosphere)
# BUG in the original: `fun = dt.haversine` was placed inside the as.matrix()
# call, so distm() never saw it and silently used its default distHaversine().
# dt.haversine()'s (lat_from, lon_from, lat_to, lon_to) signature is also
# incompatible with distm()'s expected fun(p1, p2) point-matrix interface,
# so pass the compatible distHaversine explicitly — it implements the same
# formula with the same default radius (6378137 m), giving identical numbers.
out1 <- tapply(seq_len(nrow(data)), data$group,
               function(x) max(distm(as.matrix(data[x, c("longitude", "latitude")]),
                                     fun = distHaversine)))
Neal Barsch
  • 2,810
  • 2
  • 13
  • 39
0

Here is another way using data.table and .SD

> library(data.table)
> data <- data.table(
+   latitude=sample(seq(0,90,by=0.001), 10000, replace = TRUE),
+   longitude=sample(seq(0,180,by=0.001), 10000, replace = TRUE)
+ )
> groupn <- nrow(data)/1000
> data$group <- sample(seq(1,groupn,by=1),10000,replace=T)
> 
> way1 <- function() {
+   data[,
+     .(maxdist = max(
+       dist(
+         .SD[1:.N, .(latitude, longitude)]
+       )
+     )),
+     keyby = group
+   ]
+ }
> 
> way2 <- function() {
+   tapply(1:nrow(data), data$group, function(x) max(dist(data[x, 1:2])))
+ }
> 
> system.time(out1 <- way1())
   user  system elapsed 
   0.16    0.03    0.18 
> out1
    group  maxdist
 1:     1 196.7296
 2:     2 195.9555
 3:     3 196.0794
 4:     4 196.3476
 5:     5 195.2577
 6:     6 196.0791
 7:     7 198.5209
 8:     8 196.6944
 9:     9 195.2630
10:    10 194.4611
> 
> system.time(out1 <- way2())
   user  system elapsed 
   0.22    0.10    0.60 
> out1
       1        2        3        4        5        6        7        8        9       10 
196.7296 195.9555 196.0794 196.3476 195.2577 196.0791 198.5209 196.6944 195.2630 194.4611 
> 
> library(microbenchmark)
> microbenchmark(way1(), way2())
Unit: milliseconds
   expr      min       lq     mean   median       uq       max neval cld
 way1() 172.3232 231.3411 327.1674 266.9135 370.9586 1569.7742   100   a
 way2() 181.7716 228.1266 346.2764 285.8394 444.8963  800.4725   100   a
Jake
  • 510
  • 11
  • 19