0

I am working with the R programming language.

I have the following data frames:

set.seed(123)

df_1 <- data.frame(
  name_1 = c("john", "david", "alex", "kevin", "trevor", "xavier", "tom", "michael", "troy", "kelly", "chris", "henry", "taylor", "ryan", "peter"),
  lon = rnorm(15, mean = -74.0060, sd = 0.01),
  lat = rnorm(15, mean = 40.7128, sd = 0.01)
)

df_2 <- data.frame(
  name_2 = c("matthew", "tyler", "sebastian", "julie", "anna", "tim", "david", "nigel", "sarah", "steph", "sylvia", "boris", "theo", "malcolm"),
  lon = rnorm(14, mean = -74.0060, sd = 0.01),
  lat = rnorm(14, mean = 40.7128, sd = 0.01)
)

My Problem: I want to find out the distance between all people from df_1 and df_2 and then perform some summary statistics (e.g. for each person in df_1 - what is the distance to the closest person df_2, furthest person in df_2, average person in df_2, etc.)

I did this myself like this:

 library(geosphere)

haversine_distance <- function(lon1, lat1, lon2, lat2) {
  distHaversine(c(lon1, lat1), c(lon2, lat2))
}


distances <- matrix(nrow = nrow(df_1), ncol = nrow(df_2))
# calculate the distances
for (i in 1:nrow(df_1)) {
    for (j in 1:nrow(df_2)) {
        distances[i, j] <- haversine_distance(df_1$lon[i], df_1$lat[i], df_2$lon[j], df_2$lat[j])
    }
}



final <- data.frame(
    name_1 = rep(df_1$name_1, each = nrow(df_2)),
    lon_1 = rep(df_1$lon, each = nrow(df_2)),
    lat_1 = rep(df_1$lat, each = nrow(df_2)),
    name_2 = rep(df_2$name_2, nrow(df_1)),
    lon_2 = rep(df_2$lon, nrow(df_1)),
    lat_2 = rep(df_2$lat, nrow(df_1)),
    distance = c(distances)
)

final_summary <- aggregate(distance ~ name_1,
                           data = final,
                           FUN = function(x) c(min = min(x),
                                               max = max(x),
                                               mean = mean(x),
                                               median = median(x),
                                               sd = sd(x)))
final_summary <- do.call(data.frame, final_summary)
names(final_summary)[-(1)] <- c("min_distance", "max_distance", "mean_distance", "median_distance", "sd_distance")

In another question (R: Improving the Speed of Pairwise Calculations), I learned how to do the same thing but in a more efficient way (I made some modifications):

 haversine_distance <- function(lon1, lat1, lon2, lat2) {
  distHaversine(cbind(lon1, lat1), cbind(lon2, lat2))
}

# calculate the distances
distances <- expand.grid(i = seq_len(nrow(df_1)), j = seq_len(nrow(df_2)))
distances$dist <- haversine_distance(
  df_1$lon[distances$i],
  df_1$lat[distances$i],
  df_2$lon[distances$j],
  df_2$lat[distances$j]
)

final <- data.frame(
  name_1 = df_1$name_1[distances$i],
  lon_1 = df_1$lon[distances$i],
  lat_1 = df_1$lat[distances$i],
  name_2 = df_2$name_2[distances$j],
  lon_2 = df_2$lon[distances$j],
  lat_2 = df_2$lat[distances$j],
  distance = distances$dist
)

final_summary1 <- aggregate(
  distance ~ name_1,
  data = final,
  FUN = function(x) c(min = min(x), max = max(x), mean = mean(x), median = median(x), sd = sd(x))
)
final_summary1 <- do.call(data.frame, final_summary1)
names(final_summary1)[-(1)] <- c("min_distance", "max_distance", "mean_distance", "median_distance", "sd_distance")

My Question: When I compare the results from both methods on the same data - apparently they are not the same:

> identical(final_summary, final_summary1)
[1] FALSE

Can someone please help me understand why these two outputs are not the same and how can I make them the same? Are both approaches correct?

Thanks!

Dave2e
  • 22,192
  • 18
  • 42
  • 50
stats_noob
  • 5,401
  • 4
  • 27
  • 83

1 Answers1

1

The distHaversine function is vectorized so there is no need to calculate the distances in a loop.
Just create a data frame of the all the combinations, like you did in your second part and then calculate the distances for all of the rows.

#all of the pairwise combinations
grid <- expand.grid(i = seq_len(nrow(df_1)), j = seq_len(nrow(df_2)))

#create master data frame with the pairwise information name & locations
workingdf <- data.frame(
      name_1 = df_1$name_1[grid$i],
      lon_1 = df_1$lon[grid$i],
      lat_1 = df_1$lat[grid$i],
      name_2 = df_2$name_2[grid$j],
      lon_2 = df_2$lon[grid$j],
      lat_2 = df_2$lat[grid$j]
)

#calculate the distances for every row, taking the columns as input
workingdf$distance <- distHaversine(workingdf[ ,c("lon_1", "lat_1")], workingdf[ ,c("lon_2", "lat_2")])

#summarize
final_summary1 <- aggregate(
   distance ~ name_1,
   data = workingdf,
   FUN = function(x) c(min = min(x), max = max(x), mean = mean(x), median = median(x), sd = sd(x))
)
Dave2e
  • 22,192
  • 18
  • 42
  • 50