1

I want to merge two datasets. I want to merge this on ID and Date. However some of the dates are 1-2 days after the row they should merge with in the other date so they get excluded. How can I merge on date but allow up to 2 days difference between dates?

My data:

#Random letters to fill out a pathology report
pathRep<-replicate(20,paste(sample(LETTERS,50,replace=T),collapse=""))
pathDate<-as.Date(c("1993-12-22","1994-05-16","1992-07-20","1996-06-02","1992-04-20","1996-08-30","1992-01-26","1991-03-23","1995-12-28","1995-07-15","1993-04-04","1994-01-11","1999-08-21","1993-11-10","1994-02-26","1992-08-06","1993-06-29","1997-03-08","1998-03-03","1998-04-17"))
#Random Numbers
pathHospitalNum<-c("H432243","T662272","G424284","W787634","H432243","Y980037","H432243","W787634","Y980037","E432243","U874287","Y980037","U874287","W787634","Y980037","H432243","Y980037","E432243","W787634","W787634")
#Create the dataframe
pathdf<-data.frame(pathRep,pathDate,pathHospitalNum)

#Random letters to fill out a pathology report
EndoRep<-replicate(20,paste(sample(LETTERS,50,replace=T),collapse=""))
EndoDate<-as.Date(c("1993-12-22","1994-05-14","1992-07-19","1996-06-01","1992-04-20","1996-08-30","1992-01-24","1991-03-21","1995-12-28","1995-07-15","1993-04-02","1994-01-10","1999-08-21","1993-11-10","1994-02-26","1992-08-05","1993-06-29","1997-03-07","1998-03-03","1998-04-17"))
#Random Numbers
EndoHospitalNum<-c("H432243","T662272","G424284","W787634","H432243","Y980037","H432243","W787634","Y980037","E432243","U874287","Y980037","U874287","W787634","Y980037","H432243","Y980037","E432243","W787634","W787634")
#Create the dataframe:
Endodf<-data.frame(EndoRep,EndoDate,EndoHospitalNum)

This just merges the exact dates:

merge(Endodf,pathdf,by=c("Date","HospNum"))

I thought maybe I could create a difftime column but I guess I would end up comparing every date to every date which may be time consuming?

Sebastian Zeki
  • 6,690
  • 11
  • 60
  • 125
  • check the link https://stackoverflow.com/questions/9269179/r-speeding-up-approximate-date-match-idata-frame – BENY Jul 20 '17 at 18:15
  • If you have a one-one connexion, you could sort them by date, and just `cbind` the columns of interest. – F. Privé Jul 20 '17 at 18:54

2 Answers2

2

While @alaybourn's answer with the data.table rolling join is quite nice, I'm going to add another option to address the "allow up to 2 day difference between dates" part of the question (but mostly just to share some love for the fuzzyjoin package).

library(dplyr)
library(fuzzyjoin)

Path <-
  data.frame(
    PathDate = as.Date(c("1993-12-22", "1994-05-16", "1992-07-20", "1996-06-02", "1992-04-20", "1996-08-30", "1992-01-26", "1991-03-23", "1995-12-28", "1995-07-15", "1993-04-04", "1994-01-11", "1999-08-21", "1993-11-10", "1994-02-26", "1992-08-06", "1993-06-29", "1997-03-08", "1998-03-03", "1998-04-17")),
    PathHospNum  = c("H432243", "T662272", "G424284", "W787634", "H432243", "Y980037", "H432243", "W787634", "Y980037", "E432243", "U874287", "Y980037", "U874287", "W787634", "Y980037", "H432243", "Y980037", "E432243", "W787634", "W787634"),
    PathRep  = replicate(20, paste(sample(LETTERS, 10, replace = T), collapse = ""))
  )

Endo <- 
  data.frame(
    EndoDate = as.Date(c("1993-12-22", "1994-05-14", "1992-07-19", "1996-06-01", "1992-04-20", "1996-08-30", "1992-01-24", "1991-03-21", "1995-12-28", "1995-07-15", "1993-04-02", "1994-01-10", "1999-08-21", "1993-11-10", "1994-02-26", "1992-08-05", "1993-06-29", "1997-03-07", "1998-03-03", "1998-04-17")),
    EndoHospNum  = c("H432243", "T662272", "G424284", "W787634", "H432243", "Y980037", "H432243", "W787634", "Y980037", "E432243", "U874287", "Y980037", "U874287", "W787634", "Y980037", "H432243", "Y980037", "E432243", "W787634", "W787634"),
    EndoRep  = replicate(20, paste(sample(LETTERS, 10, replace = T), collapse = ""))
  )

Path$date <- as.numeric(Path$PathDate)
Endo$date <- as.numeric(Endo$EndoDate)

Result <- 
  fuzzyjoin::difference_full_join(Endo, Path, by = 'date', max_dist = 2, distance_col = 'Days') %>%
  filter(EndoHospNum == PathHospNum) %>%
  select(HospNum = EndoHospNum, EndoDate, PathDate, Days, EndoRep, PathRep)

Result

   HospNum   EndoDate   PathDate Days    EndoRep    PathRep
1  H432243 1993-12-22 1993-12-22    0 YBGDMGMZOJ HISSGSCRFR
2  T662272 1994-05-14 1994-05-16    2 ANAPSCKUEB HIDIFHBDBL
3  G424284 1992-07-19 1992-07-20    1 HKOCQZAXDU PLXGUPHQBM
4  W787634 1996-06-01 1996-06-02    1 OTPLUZBLAF KGVILKEHLI
5  H432243 1992-04-20 1992-04-20    0 GRWJUQPNET UGCKMNKDLW
6  Y980037 1996-08-30 1996-08-30    0 ORUVMMGGAV EYOWEYAZFK
7  H432243 1992-01-24 1992-01-26    2 JVSPGIVXEM LCNXQNVGGR
8  W787634 1991-03-21 1991-03-23    2 WXZNHJIBZW OTXKNTYNKV
9  Y980037 1995-12-28 1995-12-28    0 PQQQLKTYPG UAMMKJZRFG
10 E432243 1995-07-15 1995-07-15    0 VYLDWUNAFP EXNTQSYVJM
11 U874287 1993-04-02 1993-04-04    2 MTBBBVULOD CTKXUKEOQG
12 Y980037 1994-01-10 1994-01-11    1 TRZWBYAUZR XIHXMOEFVP
13 U874287 1999-08-21 1999-08-21    0 DYBUWJIAZB KFFGYNQUYM
14 W787634 1993-11-10 1993-11-10    0 CBWRBZAPAF KYUOZSLIGF
15 Y980037 1994-02-26 1994-02-26    0 GDUFEYZQFU BUSFQIJDHK
16 H432243 1992-08-05 1992-08-06    1 JMGSCWTHOI ZRCJFDFNCX
17 Y980037 1993-06-29 1993-06-29    0 HDTGHCMORL EQYWNJHOET
18 E432243 1997-03-07 1997-03-08    1 WIMMVJHDSE LYLDELIBYK
19 W787634 1998-03-03 1998-03-03    0 GRHBDHEWJF AWDYEQZZWY
20 W787634 1998-04-17 1998-04-17    0 AOFIXWLZDT BBUEROUIWO
Eric
  • 3,403
  • 1
  • 19
  • 18
  • I already awarded the points but I actually like your answer a lot more. Definitely going to check out fuzzy logic package. Looks very interesting. Thank you – Sebastian Zeki Jul 21 '17 at 05:36
1

If you use the roll="nearest" option in data.table it will work for this set, but if you're trying to not join dates > 2 days different it will fail.

library(data.table)

#Random letters to fill out a pathology report
pathRep<-replicate(20,paste(sample(LETTERS,50,replace=T),collapse=""))
pathDate<-as.Date(c("1993-12-22","1994-05-16","1992-07-20","1996-06-02","1992-04-20","1996-08-30","1992-01-26","1991-03-23","1995-12-28","1995-07-15","1993-04-04","1994-01-11","1999-08-21","1993-11-10","1994-02-26","1992-08-06","1993-06-29","1997-03-08","1998-03-03","1998-04-17"))
#Random Numbers
pathHospitalNum<-c("H432243","T662272","G424284","W787634","H432243","Y980037","H432243","W787634","Y980037","E432243","U874287","Y980037","U874287","W787634","Y980037","H432243","Y980037","E432243","W787634","W787634")
#Create the data table and set key fields
pathdt<-data.table(pathRep,pathDate,pathHospitalNum)
setkey(pathdt, pathHospitalNum, pathDate)


#Random letters to fill out a pathology report
EndoRep<-replicate(20,paste(sample(LETTERS,50,replace=T),collapse=""))
EndoDate<-as.Date(c("1993-12-22","1994-05-14","1992-07-19","1996-06-01","1992-04-20","1996-08-30","1992-01-24","1991-03-21","1995-12-28","1995-07-15","1993-04-02","1994-01-10","1999-08-21","1993-11-10","1994-02-26","1992-08-05","1993-06-29","1997-03-07","1998-03-03","1998-04-17"))
#Random Numbers
EndoHospitalNum<-c("H432243","T662272","G424284","W787634","H432243","Y980037","H432243","W787634","Y980037","E432243","U874287","Y980037","U874287","W787634","Y980037","H432243","Y980037","E432243","W787634","W787634")
#Create the data table and set keys
Endodt<-data.table(EndoRep,EndoDate,EndoHospitalNum)
setkey(Endodt, EndoHospitalNum, EndoDate)

#run the join
Endodt[pathdt,roll="nearest"]
alaybourn
  • 304
  • 2
  • 12