I am working with the R programming language.
I have the following data frames:
set.seed(123)
df_1 <- data.frame(
name_1 = c("john", "david", "alex", "kevin", "trevor", "xavier", "tom", "michael", "troy", "kelly", "chris", "henry", "taylor", "ryan", "peter"),
lon = rnorm(15, mean = -74.0060, sd = 0.01),
lat = rnorm(15, mean = 40.7128, sd = 0.01)
)
df_2 <- data.frame(
name_2 = c("matthew", "tyler", "sebastian", "julie", "anna", "tim", "david", "nigel", "sarah", "steph", "sylvia", "boris", "theo", "malcolm"),
lon = rnorm(14, mean = -74.0060, sd = 0.01),
lat = rnorm(14, mean = 40.7128, sd = 0.01)
)
My Problem: I want to find out the distance between all people from df_1 and df_2 and then perform some summary statistics (e.g. for each person in df_1 - what is the distance to the closest person df_2, furthest person in df_2, average person in df_2, etc.)
I did this myself like this:
library(geosphere)
haversine_distance <- function(lon1, lat1, lon2, lat2) {
distHaversine(c(lon1, lat1), c(lon2, lat2))
}
distances <- matrix(nrow = nrow(df_1), ncol = nrow(df_2))
# calculate the distances
for (i in 1:nrow(df_1)) {
for (j in 1:nrow(df_2)) {
distances[i, j] <- haversine_distance(df_1$lon[i], df_1$lat[i], df_2$lon[j], df_2$lat[j])
}
}
final <- data.frame(
name_1 = rep(df_1$name_1, each = nrow(df_2)),
lon_1 = rep(df_1$lon, each = nrow(df_2)),
lat_1 = rep(df_1$lat, each = nrow(df_2)),
name_2 = rep(df_2$name_2, nrow(df_1)),
lon_2 = rep(df_2$lon, nrow(df_1)),
lat_2 = rep(df_2$lat, nrow(df_1)),
distance = c(distances)
)
final_summary <- aggregate(distance ~ name_1,
data = final,
FUN = function(x) c(min = min(x),
max = max(x),
mean = mean(x),
median = median(x),
sd = sd(x)))
final_summary <- do.call(data.frame, final_summary)
names(final_summary)[-(1)] <- c("min_distance", "max_distance", "mean_distance", "median_distance", "sd_distance")
In another question (R: Improving the Speed of Pairwise Calculations), I learned how to do the same thing but in a more efficient way (I made some modifications):
haversine_distance <- function(lon1, lat1, lon2, lat2) {
distHaversine(cbind(lon1, lat1), cbind(lon2, lat2))
}
# calculate the distances
distances <- expand.grid(i = seq_len(nrow(df_1)), j = seq_len(nrow(df_2)))
distances$dist <- haversine_distance(
df_1$lon[distances$i],
df_1$lat[distances$i],
df_2$lon[distances$j],
df_2$lat[distances$j]
)
final <- data.frame(
name_1 = df_1$name_1[distances$i],
lon_1 = df_1$lon[distances$i],
lat_1 = df_1$lat[distances$i],
name_2 = df_2$name_2[distances$j],
lon_2 = df_2$lon[distances$j],
lat_2 = df_2$lat[distances$j],
distance = distances$dist
)
final_summary1 <- aggregate(
distance ~ name_1,
data = final,
FUN = function(x) c(min = min(x), max = max(x), mean = mean(x), median = median(x), sd = sd(x))
)
final_summary1 <- do.call(data.frame, final_summary1)
names(final_summary1)[-(1)] <- c("min_distance", "max_distance", "mean_distance", "median_distance", "sd_distance")
My Question: When I compare the results from both methods on the same data - apparently they are not the same:
> identical(final_summary, final_summary1)
[1] FALSE
Can someone please help me understand why these two outputs are not the same and how can I make them the same? Are both approaches correct?
Thanks!