0

I am working with the R Programming Language.

I have the following data frames:

set.seed(123)

df_1 <- data.frame(
  name_1 = c("john", "david", "alex", "kevin", "trevor", "xavier", "tom", "michael", "troy", "kelly", "chris", "henry", "taylor", "ryan", "peter"),
  lon = rnorm(15, mean = -74.0060, sd = 0.01),
  lat = rnorm(15, mean = 40.7128, sd = 0.01)
)

df_2 <- data.frame(
  name_2 = c("matthew", "tyler", "sebastian", "julie", "anna", "tim", "david", "nigel", "sarah", "steph", "sylvia", "boris", "theo", "malcolm"),
  lon = rnorm(14, mean = -74.0060, sd = 0.01),
  lat = rnorm(14, mean = 40.7128, sd = 0.01)
)

My Problem: I want to find out the distance (e.g. haversine distance) between all people from df_1 and df_2 and then perform some summary statistics (e.g. for each person in df_1 - what is the distance to the closest person df_2, furthest person in df_2, average person in df_2, etc.)

I think I know how to calculate the distance between all pairs of people:

 library(geosphere)

haversine_distance <- function(lon1, lat1, lon2, lat2) {
  distHaversine(c(lon1, lat1), c(lon2, lat2))
}


distances <- matrix(nrow = nrow(df_1), ncol = nrow(df_2))
# calculate the distances
for (i in 1:nrow(df_1)) {
    for (j in 1:nrow(df_2)) {
        distances[i, j] <- haversine_distance(df_1$lon[i], df_1$lat[i], df_2$lon[j], df_2$lat[j])
    }
}

rownames(distances) <- df_1$name_1
colnames(distances) <- df_2$name_2

The distance matrix looks like this:

          matthew     tyler sebastian     julie      anna       tim     david     nigel     sarah     steph    sylvia     boris      theo  malcolm
john    1052.8789 3247.1522 2729.5524 2786.8618 1617.3737 2333.5141 1948.9521 2064.2697 2048.1926  489.7949 2243.3701  423.1875 3760.5821 2663.365
david    965.2928 1805.2741 1380.9300 1423.8518  941.3102 1009.7809  715.3872  602.9369  605.3148  977.5747  895.8301 1134.0782 2440.0083 2026.501
alex    3660.8024 1824.7070 1828.6055 1765.7750 3120.1155 2221.2910 2612.6494 2554.4279 2657.5071 4057.3500 2715.1428 4154.0464 2428.5429 2886.329
kevin    638.9259 2054.2783 1412.4940 1467.9787  639.6901 1017.4015  644.2898  820.1698  887.2768  834.5504 1217.3939  937.2919 2746.7912 1775.239
trevor  1887.7327  807.6626  650.9167  631.9494 1511.9586  640.8622  884.1570  520.1834  603.2698 2094.4664  747.7145 2232.6381 1679.2678 2085.071
xavier  2756.8628 1697.5262 1013.2502  972.9730 2190.7641 1396.8569 1767.1953 1893.9403 2051.9317 3237.6654 2239.1811 3302.8378 2571.7172 1878.935
tom     1587.6118 1192.5724  420.2596  447.8639 1151.8284  243.6729  530.4778  488.9710  675.8181 1903.4973  975.1733 2011.5490 2077.7307 1695.655
michael 2867.3117  825.7050 1950.4505 1912.7913 2672.4188 1954.2652 2093.8370 1504.5642 1361.1719 2768.1872 1012.5247 2967.4984  581.9209 3407.516
troy    2351.8427  549.4500 1383.4185 1352.5158 2107.4240 1365.1343 1513.8307  941.2245  828.5550 2349.1859  560.1101 2531.9797 1035.0599 2818.647
kelly   2169.1191  568.8260 1158.1408 1130.9239 1894.6183 1131.2607 1291.3101  738.7638  658.7015 2219.9931  491.6430 2392.3424 1239.3253 2584.679
chris   3291.8547 1427.3840 1455.8430 1389.1659 2766.7982 1841.1226 2232.4323 2141.2351 2239.5804 3660.7231 2294.9747 3764.9145 2106.7084 2651.150
henry    415.8956 2251.7187 1453.0914 1516.5279  394.8684 1062.2968  670.9220 1027.9476 1129.8791  859.7761 1481.0100  894.5358 2989.7708 1552.172
taylor  1174.1631 1537.5152  746.6206  799.1515  782.6833  358.4041  170.5171  439.8022  635.0847 1504.8199 1016.1560 1601.9382 2359.4940 1567.225
ryan    2625.2224  342.8375 1052.6759  989.0322 2217.8837 1271.4270 1593.4782 1243.7473 1268.9269 2821.0950 1222.0676 2967.2978 1248.0710 2587.342
peter    830.2758 2655.1120 2213.5778 2265.3747 1276.1912 1821.6048 1455.0204 1487.1033 1458.7784  195.4891 1651.2295  414.2948 3176.7362 2416.527

My Question: I am struggling to convert this matrix into a dataframe. I am trying different approaches.

Here are the codes I am using for the summary statistics:

# Approach 1


final <- data.frame(
    name_1 = rep(df_1$name_1, each = nrow(df_2)),
    lon_1 = rep(df_1$lon, each = nrow(df_2)),
    lat_1 = rep(df_1$lat, each = nrow(df_2)),
    name_2 = rep(df_2$name_2, nrow(df_1)),
    lon_2 = rep(df_2$lon, nrow(df_1)),
    lat_2 = rep(df_2$lat, nrow(df_1)),
    distance = c(distances)
)

final_summary <- aggregate(distance ~ name_1,
                           data = final,
                           FUN = function(x) c(min = min(x),
                                               max = max(x),
                                               mean = mean(x),
                                               median = median(x),
                                               sd = sd(x)))
final_summary_1 <- do.call(data.frame, final_summary)
names(final_summary_1)[-(1)] <- c("min_distance", "max_distance", "mean_distance", "median_distance", "sd_distance")

# Approach 2

final_summary_2 <- data.frame(name_1 = df_1$name_1,
                            min_distance = apply(distances, 1, min),
                            max_distance = apply(distances, 1, max),
                            mean_distance = apply(distances, 1, mean),
                            median_distance = apply(distances, 1, median),
                            sd_distance = apply(distances, 1, sd))


#Approach 3 (https://stackoverflow.com/a/76394618/13203841 - optimized for speed)

 haversine_distance <- function(lon1, lat1, lon2, lat2) {
  distHaversine(cbind(lon1, lat1), cbind(lon2, lat2))
}

grid <- expand.grid(i = seq_len(nrow(df_1)), j = seq_len(nrow(df_2)))

#create master data frame with the pairwise information name & locations
workingdf <- data.frame(
      name_1 = df_1$name_1[grid$i],
      lon_1 = df_1$lon[grid$i],
      lat_1 = df_1$lat[grid$i],
      name_2 = df_2$name_2[grid$j],
      lon_2 = df_2$lon[grid$j],
      lat_2 = df_2$lat[grid$j]
)

#calculate the distances for every row, taking the columns as input
workingdf$distance <- distHaversine(workingdf[ ,c("lon_1", "lat_1")], workingdf[ ,c("lon_2", "lat_2")])

#summarize
final_summary_3 <- aggregate(
   distance ~ name_1,
   data = workingdf,
   FUN = function(x) c(min = min(x), max = max(x), mean = mean(x), median = median(x), sd = sd(x))
)

When I analyze the results for "alex" (approach_1, approach_2, approach_3):

  name_1 min_distance max_distance mean_distance median_distance sd_distance
1   alex     342.8375     2729.552      1416.779        1397.956    716.7869

name_1 min_distance max_distance mean_distance median_distance sd_distance
alex   alex     1765.775     4154.046      2749.092        2635.078     774.061

  name_1 distance.min distance.max distance.mean distance.median distance.sd
1   alex     1765.775     4154.046      2749.092        2635.078     774.061

And when I then compare it to the values of the original matrix:

distances["alex", ]
  matthew     tyler sebastian     julie      anna       tim     david     nigel     sarah     steph    sylvia     boris      theo   malcolm 
 3660.802  1824.707  1828.605  1765.775  3120.115  2221.291  2612.649  2554.428  2657.507  4057.350  2715.143  4154.046  2428.543  2886.329 

I notice that:

  • Approach 2 and Approach 3 produce the same results
  • The values for Approach 2 and Approach 3 seem to be correct
  • The values in Approach 1 (e.g. alex min distance 342 meters) do not even appear in the original matrix

Thus - am I correct to conclude that Approach 1 is incorrect while Approach 2 and Approach 3 are both correct?

Thanks!

stats_noob
  • 5,401
  • 4
  • 27
  • 83
  • 2
    How is this substantially different from your [other question](https://stackoverflow.com/questions/76394461/r-comparing-distance-calculations)? Apart from adding a third method that is? Wouldn't it be better to update your first question rather than cluttering SO with near-identical questions? Just a though. Thanks. – L Tyrone Jun 03 '23 at 22:34
  • @ Leroy Tyrone: Thank you for your reply! I acknowledge that this question is similar to my previous question - in this current question, I am trying to clarify a different issue on matching matrices with data frames. In the past, members in the community have advised me not to update old questions but to ask new questions instead. If you think it might be a good idea to delete this current question, I can do this. Thanks! – stats_noob Jun 03 '23 at 22:54

2 Answers2

1

what about:

library(geosphere)
library(dplyr)

the_distances <- 
    expand.grid(name_1 = df_1$name_1, name_2 = df_2$name_2) |> 
    left_join(df_1 |> mutate(coords_1 = cbind(lon, lat)), by = 'name_1') |>
    left_join(df_2 |> mutate(coords_2 = cbind(lon, lat)), by = 'name_2') |> 
    rowwise() |>
    mutate(hav_dist = distHaversine(coords_1, coords_2)) |>
    select(c(starts_with('name_'), hav_dist))
## > the_distances |> head()
## # A tibble: 6 x 3
## # Rowwise: 
##   name_1 name_2  hav_dist
##   <chr>  <chr>      <dbl>
## 1 john   matthew    1564.
## 2 david  matthew    1903.
## 3 alex   matthew    2028.
## ...
the_distances |>
    group_by(name_1) |>
    summarize(min = min(hav_dist),
              ave = mean(hav_dist), 
              max = max(hav_dist)
              )
## # A tibble: 15 x 4
##    name_1    min   ave   max
##    <chr>   <dbl> <dbl> <dbl>
##  1 alex     354. 1361. 3108.
##  2 chris   1477. 2607. 3599.
##  3 david    302. 1519. 2678.
##  4 henry   1289. 2541. 3935.
##  5 john     880. 1862. 2701.
## ...
I_O
  • 4,983
  • 2
  • 2
  • 15
1

In attempt 1 your naming ordering is incorrect. Run these pieces of code with your data frames:

#atttempt1 
haversine_distance <- function(lon1, lat1, lon2, lat2) {
   distHaversine(c(lon1, lat1), c(lon2, lat2))
}

distances <- matrix(nrow = nrow(df_1), ncol = nrow(df_2))
# calculate the distances
for (i in 1:nrow(df_1)) {
   for (j in 1:nrow(df_2)) {
      distances[i, j] <- haversine_distance(df_1$lon[i], df_1$lat[i], df_2$lon[j], df_2$lat[j])
   }
}

final1 <- data.frame(
   name_1 = rep(df_1$name_1, each = nrow(df_2)),
   lon_1 = rep(df_1$lon, each = nrow(df_2)),
   lat_1 = rep(df_1$lat, each = nrow(df_2)),
   name_2 = rep(df_2$name_2, nrow(df_1)),
   lon_2 = rep(df_2$lon, nrow(df_1)),
   lat_2 = rep(df_2$lat, nrow(df_1)),
   distance = c(distances)
)
head(final1, 10)

#attempt 3
grid <- expand.grid(i = seq_len(nrow(df_1)), j = seq_len(nrow(df_2)))

#create master data frame with the pairwise information name & locations
workingdf <- data.frame(
   name_1 = df_1$name_1[grid$i],
   lon_1 = df_1$lon[grid$i],
   lat_1 = df_1$lat[grid$i],
   name_2 = df_2$name_2[grid$j],
   lon_2 = df_2$lon[grid$j],
   lat_2 = df_2$lat[grid$j])

#calculate the distances for every row, taking the columns as input
workingdf$distance <- distHaversine(workingdf[ ,c("lon_1", "lat_1")], workingdf[ ,c("lon_2", "lat_2")])
head(workingdf, 10)

Now compare:

head(final1, 10)
   name_1    lon_1    lat_1    name_2     lon_2    lat_2  distance
1    john -74.0116 40.73067   matthew -74.00174 40.72488 1052.8789
2    john -74.0116 40.73067     tyler -74.00895 40.70157  965.2928
3    john -74.0116 40.73067 sebastian -73.99705 40.70877 3660.8024
4    john -74.0116 40.73067     julie -73.99722 40.70813  638.9259
5    john -74.0116 40.73067      anna -73.99778 40.72060 1887.7327
6    john -74.0116 40.73067       tim -73.99911 40.71197 2756.8628
7    john -74.0116 40.73067     david -74.00046 40.71533 1587.6118
8    john -74.0116 40.73067     nigel -74.00662 40.71251 2867.3117
9    john -74.0116 40.73067     sarah -74.00906 40.71237 2351.8427
10   john -74.0116 40.73067     steph -74.00980 40.72649 2169.1191

with:

head(workingdf, 10)
    name_1     lon_1    lat_1  name_2     lon_2    lat_2  distance
1     john -74.01160 40.73067 matthew -74.00174 40.72488 1052.8789
2    david -74.00830 40.71778 matthew -74.00174 40.72488  965.2928
3     alex -73.99041 40.69313 matthew -74.00174 40.72488 3660.8024
4    kevin -74.00529 40.71981 matthew -74.00174 40.72488  638.9259
5   trevor -74.00471 40.70807 matthew -74.00174 40.72488 1887.7327
6   xavier -73.98885 40.70212 matthew -74.00174 40.72488 2756.8628
7      tom -74.00139 40.71062 matthew -74.00174 40.72488 1587.6118
8  michael -74.01865 40.70254 matthew -74.00174 40.72488 2867.3117
9     troy -74.01287 40.70551 matthew -74.00174 40.72488 2351.8427
10   kelly -74.01046 40.70655 matthew -74.00174 40.72488 2169.1191

As one can see the ordering is incorrect. If you go back and calculate the distance in row 2 you will see that "workingdf" is correct and Attempt 1 is incorrect.

Dave2e
  • 22,192
  • 18
  • 42
  • 50