I would like to find what is the most memory and time efficient way to calculate euclidean distances on a large matrix. I've ran this small benchmark below comparing a few packages I know: parallelDist
, geodist
, fields
and stats
. I've also considered this customized function that combines Rcpp
and bigmemory
. Here are the results I've found (reprex below), but I'd like to know whether there are other efficient pacakges / solutions to do this task:
Results
benchmrk
#> package time alloc
#>1: parDist 0.298 5.369186e-04
#>2: fields 1.079 9.486198e-03
#>3: rcpp 54.422 2.161113e+00
#>4: stats 0.770 5.788603e+01
#>5: geodist 2.513 1.157635e+02
# plot
ggplot(benchmrk, aes(x=alloc , y=time, color= package, label=package)) +
geom_label(alpha=.5) +
coord_trans(x="log10", y="log10") +
theme(legend.position = "none")
Reprex
library(parallelDist)
library(geodist)
library(fields)
library(stats)
library(bigmemory)
library(Rcpp)
library(lineprof)
library(geobr)
library(sf)
library(ggplot2)
library(data.table)
# data input
df <- geobr::read_weighting_area()
gc(reset = T)
# convert projection to UTM
df <- st_transform(df, crs = 3857)
# get spatial coordinates
coords <- suppressWarnings(st_coordinates( st_centroid(df) ))
# prepare customized rcpp function
sourceCpp("euc_dist.cpp")
bigMatrixEuc <- function(bigMat){
zeros <- big.matrix(nrow = nrow(bigMat)-1,
ncol = nrow(bigMat)-1,
init = 0,
type = typeof(bigMat))
BigArmaEuc(bigMat@address, zeros@address)
return(zeros)
}
### Start tests
perf_fields <- lineprof(dist_fields <- fields::rdist(coords) )
perf_geodist <- lineprof(dist_geodist <- geodist::geodist(coords, measure = "cheap") )
perf_stats <- lineprof(dist_stats <- stats::dist(coords) )
perf_parDist <- lineprof(dist_parDist <- parallelDist::parDist(coords, method = "euclidean") )
perf_rcpp <- lineprof(dist_rcpp <- bigMatrixEuc( as.big.matrix(coords) ) )
perf_fields$package <- 'fields'
perf_geodist$package <- 'geodist'
perf_stats$package <- 'stats'
perf_parDist$package <- 'parDist'
perf_rcpp$package <- 'rcpp'
# gather results
benchmrk <- rbind(perf_fields, perf_geodist, perf_stats , perf_parDist, perf_rcpp)
benchmrk <- setDT(benchmrk)[, .(time =sum(time), alloc = sum(alloc)), by=package][order(alloc)]
benchmrk