3

I have the following dataset:

id    x         y       age
1  1745353   930284.1    30
2  1745317   930343.4    23
3  1745201   930433.9    10
4  1745351   930309.4    5
5  1745342   930335.2    2
6  1746619   929969.7    66
7  1746465   929827.1    7
8  1746731   928779.5    55
9  1746629   929902.6    26
10 1745938   928923.2    22

I want to find the 5 closest neighbors for each id, based on the distance calculated from the given (x, y) coordinates. The final output should look like the following:

id      n_id    dist  age  age_n_id
1        2       2     30    23
1        5       1.5   30    2
1        3       5     30    10
1        7       3     30    7
1        8       3     30    55
2        1       6     23    30
2        10      1     23    22
2        6       2     23    66
2        7       6     23    7
2        8       9     23    55
3        2       1     10    23
3        1       2     10    30
3        4       1.2   10    5
3        6       1.6   10    66
3        9       2.3   10    26
................................
................................
10       2       1.9   22    23
10       6       2.3   22    66
10       9       2.1   22    26
10       1       2.5   22    30
10       5       1.6   22    2

where n_id is the id of the neighbor, dist is the straight-line distance between id and n_id, age is the age of the id, and age_n_id is the age of the n_id. Also, the maximum distance is 10 km. If an id has fewer than 5 neighbors within 10 km, say 3 neighbors, that id is repeated only three times.

I am relatively new to R programming, and any help would be much appreciated.

user2797174
  • 167
  • 2
  • 11
  • This could help: https://stackoverflow.com/questions/23449726/find-k-nearest-neighbors-starting-from-a-distance-matrix – lbusett May 25 '17 at 20:03
  • Are the values in `dist` real? I calculated the distance between `ID 1` and `ID 2`, it is `69.37`. In addition, what is the length unit of the coordinates? – www May 25 '17 at 20:08
  • I looked into the solution in the post before posting my own question. https://stackoverflow.com/questions/23449726/find-k-nearest-neighbors-starting-from-a-distance-matrix works but is time-consuming. I am looking for an efficient solution (without creating the n x n distance matrix), if possible. – user2797174 May 25 '17 at 20:08

2 Answers2

2

Assuming that the coordinates are in meters.

# Load packages
library(FNN)
library(tidyverse)
library(data.table)

# Create example data frame
dataset <- fread("id    x         y       age
1  1745353   930284.1    30
2  1745317   930343.4    23
3  1745201   930433.9    10
4  1745351   930309.4    5
5  1745342   930335.2    2
6  1746619   929969.7    66
7  1746465   929827.1    7
8  1746731   928779.5    55
9  1746629   929902.6    26
10 1745938   928923.2    22")

# Search parameters
# get.knn() requires k to be smaller than the number of points
k <- min(5, nrow(dataset) - 1)
# Maximum neighbour distance: 10 km, assuming the coordinates are in meters
max_dist <- 10000

# Calculate the nearest neighbours and distances.
# NOTE: nn.index holds ROW POSITIONS in `dataset`, not values of `id`.
near_data <- get.knn(dataset[, c("x", "y")], k = k)

# Extract the nearest row positions
nn_index <- as.data.frame(near_data$nn.index)

# Extract the nearest Distance
nn_dist <- as.data.frame(near_data$nn.dist)

# Re organize the data
nn_index2 <- nn_index %>%
  # Add ID column (taken from the data, so any number of rows / any ids work)
  mutate(ID = dataset$id) %>%
  # Transform the data frame to long format: one row per (ID, Rank)
  pivot_longer(-ID, names_to = "Rank", values_to = "n_id") %>%
  # Translate neighbour row positions into the neighbours' actual ids
  mutate(n_id = dataset$id[n_id])

nn_dist2 <- nn_dist %>%
  # Add ID column
  mutate(ID = dataset$id) %>%
  # Transform the data frame to long format: one row per (ID, Rank)
  pivot_longer(-ID, names_to = "Rank", values_to = "dist")

# Remove coordinates in dataset
dataset2 <- dataset %>% select(-x, -y)

# Create the final output
nn_final <- nn_index2 %>%
  # Merge nn_index2 and nn_dist2 (both use the same V1..Vk rank labels)
  left_join(nn_dist2, by = c("ID", "Rank")) %>%
  # Merge with dataset2 by ID and id
  left_join(dataset2, by = c("ID" = "id")) %>%
  # Merge with dataset2 by n_id and id
  left_join(dataset2, by = c("n_id" = "id")) %>%
  # Remove Rank
  select(-Rank) %>%
  # Rename column names
  rename(id = ID, age = age.x, age_n_id = age.y) %>%
  # Keep neighbours within the maximum distance (inclusive)
  filter(dist <= max_dist) %>%
  # Sort the data frame
  arrange(id, dist)
www
  • 38,575
  • 12
  • 48
  • 84
2

data.table solution:

library(data.table)
data <- fread("id    x         y       age
1  1745353   930284.1    30
            2  1745317   930343.4    23
            3  1745201   930433.9    10
            4  1745351   930309.4    5
            5  1745342   930335.2    2
            6  1746619   929969.7    66
            7  1746465   929827.1    7
            8  1746731   928779.5    55
            9  1746629   929902.6    26
            10 1745938   928923.2    22")

# Give every row a list-column cell holding the complete x / y / age vectors,
# so each row can later be compared against all other rows.
data[, c("all_x", "all_y", "all_age") := list(list(x), list(y), list(age))]

# Row position, used later to drop a row's own entry from the full vectors.
data[, seq_nr := .I]

# Distance formula:
# Euclidean distance from the point (x_1, y_1) to every other point.
#   x_1, y_1 : coordinates of the focal point
#   x_2, y_2 : lists whose first element is the full coordinate vector
#   z        : position(s) to drop from the full vectors (the focal point)
# Returns a numeric vector of distances to the remaining points.
formula_distance <- function(x_1, x_2, y_1, y_2, z) {
  other_x <- x_2[[1]][-z]
  other_y <- y_2[[1]][-z]
  delta_x <- x_1 - other_x
  delta_y <- y_1 - other_y
  sqrt(delta_x^2 + delta_y^2)
}

# Build every ordered (point, neighbour) pair. Rows are processed one at a
# time by position, and each row's own entry is dropped with `-seq_nr` -- the
# same index formula_distance() uses -- so distances, neighbour ids and
# neighbour ages stay aligned even when `id` is not simply 1..n.
n_rows <- nrow(data)
point_ids <- data$id
data <- data[, list(
  id = rep(id, n_rows - 1),
  age = rep(age, n_rows - 1),
  dist = formula_distance(x, all_x, y, all_y, seq_nr),
  n_id = point_ids[-seq_nr],
  n_age = all_age[[1]][-seq_nr]
), by = list(row_nr = seq_len(n_rows))]
data <- data[order(row_nr, dist)]

# Filter data within threshold (same unit as the coordinates). The question's
# 10 km limit would be 10000 if the coordinates are in meters.
threshold <- 1000

# How many nearest neighbors to take:
k <- 5
filtered <- data[dist <= threshold]

# Rows are already sorted by distance within each point, so the first k rows
# of each group are its k nearest neighbours; groups with fewer than k
# neighbours inside the threshold simply return fewer rows.
filtered <- filtered[, head(.SD, k), by = c("row_nr", "id", "age"),
                     .SDcols = c("dist", "n_id", "n_age")]
filtered[, row_nr := NULL]

filtered
    id age      dist n_id n_age
 1:  1  30  25.37893    4     5
 2:  1  30  52.27055    5     2
 3:  1  30  69.37211    2    23
 4:  1  30 213.41050    3    10
 5:  2  23  26.31045    5     2
 6:  2  23  48.08326    4     5
 7:  2  23  69.37211    1    30
 8:  2  23 147.12665    3    10
 9:  3  10 147.12665    2    23
10:  3  10 172.11243    5     2
11:  3  10 194.93653    4     5
12:  3  10 213.41050    1    30
13:  4   5  25.37893    1    30
14:  4   5  27.32471    5     2
15:  4   5  48.08326    2    23
16:  4   5 194.93653    3    10
17:  5   2  26.31045    2    23
18:  5   2  27.32471    4     5
19:  5   2  52.27055    1    30
20:  5   2 172.11243    3    10
21:  6  66  67.84106    9    26
22:  6  66 209.88273    7     7
23:  7   7 180.54432    9    26
24:  7   7 209.88273    6    66
25:  8  55 805.91482   10    22
26:  9  26  67.84106    6    66
27:  9  26 180.54432    7     7
28: 10  22 805.91482    8    55
Vitalijs
  • 938
  • 7
  • 18