4

I have 2 datasets that looks like the following (with about 600 total entries for both) Dataframe1

name A_time measurement_A
A 6/10/22 7:22 3.4
A 6/10/22 16:15 4.4
A 6/11/22 6:15 5.5
A 6/11/22 15:44 6.6
B 5/20/22 6:30 7.6
B 5/20/22 13:21 1.2
B 5/21/22 8:43 2.4
B 5/21/22 14:35 4.4

Dataframe2

name B_time measurement_B
A 6/10/22 7:24 50
A 6/10/22 16:55 50.1
A 6/11/22 6:13 52
A 6/11/22 15:21 61
B 5/20/22 6:30 54
B 5/20/22 13:22 53
B 5/21/22 8:00 54.6
B 5/21/22 14:12 76.1

I want to combine these two dataframes by name and date, selecting for rows only where Dataframe1 and Dataframe2 are within 30 minutes of each other.

I'm hoping to get something like this: Dataframe3

name A_time measurement_A measurement_B
A 6/10/22 7:22 3.4 50
A 6/11/22 6:15 5.5 52
A 6/11/22 15:44 6.6 61
B 5/20/22 6:30 7.6 54
B 5/20/22 13:21 1.2 53
B 5/21/22 14:35 4.4 76.1

The code I am using:

library(dplyr)

dataframe3 <- dataframe1 %>%
  inner_join(dataframe2, by = "name") %>%
  filter(abs(A_time - B_time) <= as.difftime(30, units = "mins"))

But I am getting an error because the names in dataframe1 correlated to multiple rows in dataframe2. How can I get the Dataframe3 that I want?

Darren Tsai
  • 32,117
  • 5
  • 21
  • 51
ol1975
  • 51
  • 3
  • Hi, have any answers solved your question? You could consider giving upvotes and picking one answer you prefer as accepted by clicking the check mark. Thanks! – Darren Tsai Jul 27 '23 at 12:33

4 Answers4

3

merge, then subset, conveniently in a list.

Reduce(\(...) merge(..., all=TRUE), list(d1, d2)) |>
  subset(abs(as.integer(A_time - B_time)/60L) <= 30L, -B_time)
#    name              A_time measurement_A measurement_B
# 7     A 2022-06-11 06:15:00           5.5          52.0
# 12    A 2022-06-11 15:44:00           6.6          61.0
# 13    A 2022-06-10 07:22:00           3.4          50.0
# 18    B 2022-05-20 06:30:00           7.6          54.0
# 21    B 2022-05-20 13:21:00           1.2          53.0
# 32    B 2022-05-21 14:35:00           4.4          76.1

Data (assorted rows!):

d1 <- structure(list(name = c("A", "A", "A", "A", "B", "B", "B", "B"
), A_time = c("6/10/22 16:15", "6/11/22 6:15", "6/11/22 15:44", 
"6/10/22 7:22", "5/20/22 6:30", "5/20/22 13:21", "5/21/22 8:43", 
"5/21/22 14:35"), measurement_A = c(4.4, 5.5, 6.6, 3.4, 7.6, 
1.2, 2.4, 4.4)), class = "data.frame", row.names = c(NA, -8L))

d2 <- structure(list(name = c("A", "A", "A", "A", "B", "B", "B", "B"
), B_time = c("6/10/22 7:24", "6/10/22 16:55", "6/11/22 6:13", 
"6/11/22 15:21", "5/20/22 13:22", "5/20/22 6:30", "5/21/22 8:00", 
"5/21/22 14:12"), measurement_B = c(50, 50.1, 52, 61, 53, 54, 
54.6, 76.1)), class = "data.frame", row.names = c(NA, -8L))

d1$A_time <- strptime(d1$A_time, '%m/%d/%y %H:%M')
d2$B_time <- strptime(d2$B_time, '%m/%d/%y %H:%M')
jay.sf
  • 60,139
  • 8
  • 53
  • 110
2

If your system can handle an almost cross join, lubridate for dates

library(lubridate)

df1$A_time <- as_datetime(df1$A_time, format="%m/%d/%y %H:%M")
df2$B_time <- as_datetime(df2$B_time, format="%m/%d/%y %H:%M")

df3 <- merge(df1, df2, by="name")
df3[abs(difftime(df3$A_time, df3$B_time, units="mins")) <= 30, ]   
#    name              A_time measurement_A              B_time measurement_B
# 1     A 2022-06-10 07:22:00           3.4 2022-06-10 07:24:00          50.0
# 11    A 2022-06-11 06:15:00           5.5 2022-06-11 06:13:00          52.0
# 16    A 2022-06-11 15:44:00           6.6 2022-06-11 15:21:00          61.0
# 17    B 2022-05-20 06:30:00           7.6 2022-05-20 06:30:00          54.0
# 22    B 2022-05-20 13:21:00           1.2 2022-05-20 13:22:00          53.0
# 32    B 2022-05-21 14:35:00           4.4 2022-05-21 14:12:00          76.1
jay.sf
  • 60,139
  • 8
  • 53
  • 110
user2974951
  • 9,535
  • 1
  • 17
  • 24
2

You can set relationship = "many-to-many" in inner_join() to silence the warning.

df1 %>%
  mutate(A_time = strptime(A_time, '%m/%d/%y %H:%M')) %>%
  inner_join(df2 %>% mutate(B_time = strptime(B_time, '%m/%d/%y %H:%M')),
             by = "name", relationship = "many-to-many") %>%
  filter(abs(difftime(A_time, B_time, units = "mins")) <= 30) %>%
  select(-B_time)

Another dplyr solution with overlap joins:

library(dplyr)

df1 %>%
  mutate(A_time = strptime(A_time, '%m/%d/%y %H:%M'),
         A_lower = A_time - 30*60,
         A_upper = A_time + 30*60) %>%
  inner_join(df2 %>% mutate(B_time = strptime(B_time, '%m/%d/%y %H:%M')),
             by = join_by(name, between(y$B_time, x$A_lower, x$A_upper))) %>%
  select(-c(A_lower, A_upper, B_time))

# # A tibble: 6 × 4
#   name  A_time              measurement_A measurement_B
#   <chr> <dttm>                      <dbl>         <dbl>
# 1 A     2022-06-10 07:22:00           3.4          50  
# 2 A     2022-06-11 06:15:00           5.5          52  
# 3 A     2022-06-11 15:44:00           6.6          61  
# 4 B     2022-05-20 06:30:00           7.6          54  
# 5 B     2022-05-20 13:21:00           1.2          53  
# 6 B     2022-05-21 14:35:00           4.4          76.1

Data
df1 <- structure(list(name = c("A", "A", "A", "A", "B", "B", "B", "B"
), A_time = c("6/10/22 7:22", "6/10/22 16:15", "6/11/22 6:15", 
"6/11/22 15:44", "5/20/22 6:30", "5/20/22 13:21", "5/21/22 8:43", 
"5/21/22 14:35"), measurement_A = c(3.4, 4.4, 5.5, 6.6, 7.6, 
1.2, 2.4, 4.4)), class = "data.frame", row.names = c(NA, -8L))

df2 <- structure(list(name = c("A", "A", "A", "A", "B", "B", "B", "B"
), B_time = c("6/10/22 7:24", "6/10/22 16:55", "6/11/22 6:13", 
"6/11/22 15:21", "5/20/22 6:30", "5/20/22 13:22", "5/21/22 8:00", 
"5/21/22 14:12"), measurement_B = c(50, 50.1, 52, 61, 54, 53, 
54.6, 76.1)), class = "data.frame", row.names = c(NA, -8L))
Darren Tsai
  • 32,117
  • 5
  • 21
  • 51
1

You can use merge with all = TRUE + subset

subset(
    merge(df1, df2, all = TRUE),
    abs(difftime(
        strptime(A_time, "%m/%d/%y %H:%M"),
        strptime(B_time, "%m/%d/%y %H:%M"),
        units = "mins"
    )) <= 30,
    select = -B_time
)

which gives

   name        A_time measurement_A measurement_B
7     A  6/11/22 6:15           5.5          52.0
12    A 6/11/22 15:44           6.6          61.0
13    A  6/10/22 7:22           3.4          50.0
18    B  5/20/22 6:30           7.6          54.0
21    B 5/20/22 13:21           1.2          53.0
32    B 5/21/22 14:35           4.4          76.1
ThomasIsCoding
  • 96,636
  • 9
  • 24
  • 81