5

Please look at the following small working example:

#### Pseudo data
nobs1 <- 4000
nobs2 <- 5000
mylon1 <- runif(nobs1, min=0, max=1)-76
mylat1 <- runif(nobs1, min=0, max=1)+37
mylon2 <- runif(nobs2, min=0, max=1)-76
mylat2 <- runif(nobs2, min=0, max=1)+37

#### define a distance function
thedistance <- function(lon1, lat1, lon2, lat2) {
 R <- 6371 # Earth mean radius [km]
 delta.lon <- (lon2 - lon1)
 delta.lat <- (lat2 - lat1)
 a <- sin(delta.lat/2)^2 + cos(lat1) * cos(lat2) * sin(delta.lon/2)^2
 c <- 2 * asin(min(1,sqrt(a)))
 d = R * c
 return(d)
}

ptm <- proc.time()

#### Calculate distances between locations
# Initiate the resulting distance vector
ndistance <- nobs1*nobs2 # The number of distances
mydistance <- vector(mode = "numeric", length = ndistance)

k=1
for (i in 1:nobs1) {
 for (j in 1:nobs2) {
  mydistance[k] = thedistance(mylon1[i],mylat1[i],mylon2[j],mylat2[j])
  k=k+1
 }
}

proc.time() - ptm

The computation time:

  user  system elapsed 
249.85    0.16  251.18

Here, my question is whether there is still room for speeding up the double for-loop calculation. Thank you very much.

Bill TP
  • 1,037
  • 2
  • 16
  • 29
  • Why 4000 and 5000 observations? – ECII Dec 06 '14 at 07:16
  • They are just randomly chosen for the illustration. – Bill TP Dec 06 '14 at 07:23
  • OK but should they necessarily be different numbers? – ECII Dec 06 '14 at 07:27
  • Oh. Those should be different for my original problem. Thanks for checking. – Bill TP Dec 06 '14 at 07:30
  • I think you can use one of the apply family functions to speed up your solution, however it requires a matrix or a data.frame wirh all possible combinations of distances. I have written a solution with the apply function, however the construction of the combined data.frame takes forever. – ECII Dec 06 '14 at 08:08
  • The most important fact I've learnt about R is that 'if you're using a for-loop you're probably doing it wrong'. – tospig Dec 06 '14 at 21:37

5 Answers5

5

Here's an option that decreases the runtime to ~2 seconds on my machine because part of it is vectorized.

A direct comparison with the original solution follows.

Test data:

nobs1 <- 4000
nobs2 <- 5000
mylon1 <- runif(nobs1, min=0, max=1)-76
mylat1 <- runif(nobs1, min=0, max=1)+37
mylon2 <- runif(nobs2, min=0, max=1)-76
mylat2 <- runif(nobs2, min=0, max=1)+37

Original solution:

#### define a distance function
thedistance <- function(lon1, lat1, lon2, lat2) {
  R <- 6371 # Earth mean radius [km]
  delta.lon <- (lon2 - lon1)
  delta.lat <- (lat2 - lat1)
  a <- sin(delta.lat/2)^2 + cos(lat1) * cos(lat2) * sin(delta.lon/2)^2
  c <- 2 * asin(min(1,sqrt(a)))
  d = R * c
  return(d)
}

ptm <- proc.time()

#### Calculate distances between locations
# Initiate the resulting distance vector
ndistance <- nobs1*nobs2 # The number of distances
mydistance <- vector(mode = "numeric", length = ndistance)

k=1
for (i in 1:nobs1) {
  for (j in 1:nobs2) {
    mydistance[k] = thedistance(mylon1[i],mylat1[i],mylon2[j],mylat2[j])
    k=k+1
  }
}

proc.time() - ptm
   User      System     elapsed 
148.243       0.681     148.901 

My approach:

# modified (vectorized) distance function:
thedistance2 <- function(lon1, lat1, lon2, lat2) {
  R <- 6371 # Earth mean radius [km]
  delta.lon <- (lon2 - lon1)
  delta.lat <- (lat2 - lat1)
  a <- sin(delta.lat/2)^2 + cos(lat1) * cos(lat2) * sin(delta.lon/2)^2
  c <- 2 * asin(pmin(1,sqrt(a)))   # pmin instead of min
  d = R * c
  return(d)
}

ptm2 <- proc.time()

lst <- vector("list", length = nobs1)

for (i in seq_len(nobs1)) {
    lst[[i]] = thedistance2(mylon1[i],mylat1[i],mylon2,mylat2)
}

res <- unlist(lst)

proc.time() - ptm2
   User      System     elapsed
  1.988       0.331       2.319 

Are the results all equal?

all.equal(mydistance, res)
#[1] TRUE
talat
  • 68,970
  • 21
  • 126
  • 157
  • well that's beautiful. – ECII Dec 06 '14 at 08:31
  • Simply amazing. By the way, would you let me know your laptop/desktop's hardware spec? The elapsed time of mine was 7.05 following your approach. – Bill TP Dec 06 '14 at 09:48
  • mine is also 7 sec. I curious too. – ECII Dec 06 '14 at 09:56
  • Here: Lenovo ThinkPad x230 i3-3120M 16GB RAM with a SSD SATA III – Bill TP Dec 06 '14 at 10:01
  • Mac...the undeniable killer machine. – Bill TP Dec 06 '14 at 10:04
  • One more tiny improvement you could make: put `R <- 6371 # Earth mean radius [km]` outside the distance function so that it's not repeated in each loop (doesnt change). – talat Dec 06 '14 at 10:16
  • Take out `return` and see if that makes any difference. In fact the second to last line could be the last line – Rich Scriven Dec 06 '14 at 21:38
  • Are you able to test/benchmark my fully vectorised solution as my laptop can't handle large matrices? – tospig Dec 06 '14 at 22:15
  • I can't currently test it. But as it currently stands, it's not fully vectorized since you use sapply twice (which is essentially a for loop). On the other hand, I think those sapply's are not necessary since you are just subtracting vectors. Maybe better double check it. @tospig – talat Dec 06 '14 at 22:42
  • Ah right. The vectors are different lengths though so I didn't think they could simply be subtracted? – tospig Dec 06 '14 at 23:01
2

Here is an other approach using Rcpp. On my machine it was slightly faster than beginneR's very nice vectorized version.

library(Rcpp)

nobs1 <- 4000
nobs2 <- 5000
mylon1 <- runif(nobs1, min=0, max=1)-76
mylat1 <- runif(nobs1, min=0, max=1)+37
mylon2 <- runif(nobs2, min=0, max=1)-76
mylat2 <- runif(nobs2, min=0, max=1)+37

sourceCpp("dist.cpp")

system.time({
  lst <- vector("list", length = nobs1)
  for (i in seq_len(nobs1)) {
    lst[[i]] = thedistance2(mylon1[i],mylat1[i],mylon2,mylat2)
  }
  res <- unlist(lst)
})

##  user  system elapsed 
## 4.636   0.084   4.737 


system.time(res2 <- dist_vec(mylon1, mylat1, mylon2, mylat2))
##  user  system elapsed 
## 2.584   0.044   2.633 

all.equal(res, res2)
## TRUE 

And the file dist.cpp countains

#include <Rcpp.h>
#include <iostream>
#include <math.h>

using namespace Rcpp;

double dist_cpp(double lon1, double lat1,
                double lon2, double lat2){
  int R = 6371; 
  double delta_lon = (lon2 - lon1);
  double delta_lat = (lat2 - lat1);
  double a = pow(sin(delta_lat/2.0), 2) + cos(lat1) * cos(lat2) * pow(sin(delta_lon/2.0), 2);
  double c;
  a = sqrt(a);
  if (a < 1.0) {
    c = 2 * asin(a);
  } else {
    c = 2 * asin(1);
  }
  double d = R * c;
  return(d);

}

// [[Rcpp::export]]
NumericVector dist_vec(NumericVector lon1, NumericVector lat1, 
                       NumericVector lon2, NumericVector lat2) {
  int n = lon1.size();
  int m = lon2.size();
  int k = n * m;
  NumericVector res(k);
  int c = 0;
  for (int i = 0; i < n; i++) {
    for (int j = 0; j < m; j++) {
      res[c] = dist_cpp(lon1[i], lat1[i], lon2[j], lat2[j]);
      c++;
    }
  }
  return(res);
}
johannes
  • 14,043
  • 5
  • 40
  • 51
1

I ran this 3 times, once as you have here, once loading up library compiler (in base R) and running enableJIT(3), and running (with enableJIT) after compiling your thedistance function with cmpfun() (in compiler). The results are (respectively):

> proc.time() - ptm
   user  system elapsed 
 335.26    0.17  339.83

> proc.time() - ptm
   user  system elapsed 
 120.73    0.12  122.52 

> proc.time() - ptm
   user  system elapsed 
 117.86    0.12  118.95 

The difference with 2 and 3 is neglible, as I think enableJIT just compiles the function anyway, but it looks like there is a decent improvement with enableJIT alone. so just throw

library(compiler)

enableJIT(3)

at the top of your code.

More information can be found here

goodtimeslim
  • 880
  • 7
  • 13
1
library(compiler)
enableJIT(3)

nobs1 <- 4000
nobs2 <- 4000
mylon1 <- runif(nobs1, min=0, max=1)-76
mylat1 <- runif(nobs1, min=0, max=1)+37
mylon2 <- runif(nobs2, min=0, max=1)-76
mylat2 <- runif(nobs2, min=0, max=1)+37

trixie<-matrix(nrow=length(mylon1),ncol=4)
trixie[,1]<-mylon1
trixie[,2]<-mylat1
trixie[,3]<-mylon2
trixie[,4]<-mylat2

#### define a distance function
thedistance <- function(lon1, lat1, lon2, lat2) {
  R <- 6371 # Earth mean radius [km]
  delta.lon <- (lon2 - lon1)
  delta.lat <- (lat2 - lat1)
  a <- sin(delta.lat/2)^2 + cos(lat1) * cos(lat2) * sin(delta.lon/2)^2
  c <- 2 * asin(min(1,sqrt(a)))
  d = R * c
  return(d)
}
calc_distance <- cmpfun(thedistance)
ptm <- proc.time()

#### Calculate distances between locations
# Initiate the resulting distance vector
distances<-apply(trixie,1, function(x) {
  return(apply(trixie,1, function(y) {
    return(calc_distance(x[1],x[2],y[3],y[4]))
  }))
})

proc.time() - ptm
Matt Sandy
  • 187
  • 1
  • 8
  • The OP in the comments wants nobs1 different from nobs2. So its a combination of all possible nobs1 and nobs2. So instead of 4000 combinations the OP requires 4000*5000 combinations. Thats why its under a second in your answer :) – ECII Dec 06 '14 at 08:06
1

Here are two solutions, one is partly vectorised that uses sapply, and I think the other is fully vectorised, but I've written a couple of functions for part of the vector calculations so there may be a better way.

I also only ran these for nobs1 = 4, 40 & 400 and nobs2 = 5, 50 & 500 as my laptop struggles for memory with large matrices.

Solution - sapply

R <- 6371 # Earth mean radius [km]

delta.lon <- sapply(mylon2, "-", mylon1)
delta.lon <- sin(delta.lon/2)^2

coslat2 <- cos(mylat2)
coslat1 <- cos(mylat1)

delta.lan <- sapply(mylat2, "-", mylat1)
delta.lan <- sin(delta.lan/2)^2

a <- delta.lan + t(sapply(coslat1, "*", coslat2) * t(delta.lon))
b <- ifelse(sqrt(a)<1,sqrt(a),1)
c <- asin(b)
d <- 2 * c
e <- R * d

f <- c(t(e))

And to check the output

sum(f)
sum(mydistance)

all.equal(f, mydistance)


> sum(f)
[1] 647328856
> sum(mydistance)
[1] 647328856
> all.equal(f, mydistance)
[1] TRUE

Solution - with functions

R <- 6371 # Earth mean radius [km]

#function to calculate the difference between each
#element of different sized vectors and return a matrix
vecEleDif <- function(vec1, vec2)
{
    #the order of arguments is vec2 - vec1
    len1 <- length(vec1);len2 <- length(vec2)
    dummy1 <- matrix(rep.int(vec1, len2), nrow=len1)
    dummy2 <- matrix(rep.int(vec2, len1), nrow=len2)
    res <- t(dummy2 - t(dummy1))
}

#Function to calculate the product of each 
#element of two different size vectors
vecEleProd <- function(vec1, vec2)
{
    #the order of the arguments is vec2 * vec1
    len1 <- length(vec1); len2 <- length(vec2)
    dummy1 <- matrix(rep.int(vec1, len2), nrow=len1)
    dummy2 <- matrix(rep.int(vec2, len1), nrow=len2)
    res <- t(dummy2 * t(dummy1))
}

ptm <- proc.time()

delta.lon <- sin(vecEleDif(mylon2, mylon1)/2)^2
delta.lan <- sin(vecEleDif(mylat2, mylat1)/2)^2
cosprod <- vecEleProd(cos(mylat1), cos(mylat2))

a <- delta.lan + (t(cosprod) * delta.lon)
b <- ifelse(sqrt(a)<1,sqrt(a),1)
c <- asin(b)
d <- 2 * c
e <- R * d
f <- c((e))

proc.time() - ptm

and check the output:

sum(f)
sum(mydistance)

all.equal(f, mydistance)

> sum(f)
[1] 647745044
> sum(mydistance)
[1] 647745044
> 
> all.equal(f, mydistance)
[1] TRUE
tospig
  • 7,762
  • 14
  • 40
  • 79
  • Since you asked me to benchmark your solution: I tested your second option (without sapply) and it took ~ 11 seconds while using my answer it took ~ 4 seconds (I'm on a different computer today) – talat Dec 08 '14 at 15:32