3

I have two databases that I want to merge. From this link: Doing a "fuzzy" and non-fuzzy, many to 1 merge with data.table. I know that I can merge these data.tables, when there is no direct match, with the nearest year available as follows.:

  library(data.table)
  dfA <- fread("
  A   B   C   D   E   F   G   Z   iso   year   matchcode
  1   0   1   1   1   0   1   0   NLD   2010   NLD2010
  2   1   0   0   0   1   0   1   NLD   2014   NLD2014
  3   0   0   0   1   1   0   0   AUS   2010   AUS2010
  4   1   0   1   0   0   1   0   AUS   2006   AUS2006
  5   0   1   0   1   0   1   1   USA   2008   USA2008
  6   0   0   1   0   0   0   1   USA   2010   USA2010
  7   0   1   0   1   0   0   0   USA   2012   USA2012
  8   1   0   1   0   0   1   0   BLG   2008   BLG2008
  9   0   1   0   1   1   0   1   BEL   2008   BEL2008
  10  1   0   1   0   0   1   0   BEL   2010   BEL2010
  11  0   1   1   1   0   1   0   NLD   2010   NLD2010
  12  1   0   0   0   1   0   1   NLD   2014   NLD2014
  13  0   0   0   1   1   0   0   AUS   2010   AUS2010
  14  1   0   1   0   0   1   0   AUS   2006   AUS2006
  15  0   1   0   1   0   1   1   USA   2008   USA2008
  16  0   0   1   0   0   0   1   USA   2010   USA2010
  17  0   1   0   1   0   0   0   USA   2012   USA2012
  18  1   0   1   0   0   1   0   BLG   2008   BLG2008
  19  0   1   0   1   1   0   1   BEL   2008   BEL2008
  20  1   0   1   0   0   1   0   BEL   2010   BEL2010",
  header = TRUE)

  dfB <- fread("
  A   B   C   D   H   I   J   K   iso   year   matchcode
  1   0   1   1   1   0   1   0   NLD   2009   NLD2009
  2   1   0   0   0   1   0   1   NLD   2014   NLD2018
  3   0   0   0   1   1   0   0   AUS   2011   AUS2011
  4   1   0   1   0   0   1   0   AUS   2007   AUS2007
  5   0   1   0   1   0   1   1   USA   2007   USA2007
  6   0   0   1   0   0   0   1   USA   2010   USA2010
  7   0   1   0   1   0   0   0   USA   2013   USA2013
  8   1   0   1   0   0   1   0   BLG   2007   BLG2007
  9   0   1   0   1   1   0   1   BEL   2009   BEL2009
  10   1   0   1   0   0   1   0  BEL   2012   BEL2012",
  header = TRUE)

#change the name of the matchcode-column
setnames(dfA, c("matchcode", "iso", "year"), c("matchcodeA", "isoA", "yearA"))
setnames(dfB, c("matchcode", "iso", "year"), c("matchcodeB", "isoB", "yearB"))

#store column-order for in the end
namesA <- as.character( names( dfA ) )
namesB <- as.character( setdiff( names(dfB), names(dfA) ) )
colorder <- c(namesA, namesB)

#create columns to join on
dfA[, `:=`(iso.join = isoA, year.join = yearA)]
dfB[, `:=`(iso.join = isoB, year.join = yearB)]

#perform left join
result <- dfB[dfA, on = c("iso.join", "year.join"),roll = "nearest" ]

#drop columns that are not needed
result[, grep("^i\\.", names(result)) := NULL ]
result[, grep("join$", names(result)) := NULL ]

#set column order
setcolorder(result, colorder)

I have two questions about this.

1) EDIT: This question was the result of a typo

2) NLD 2014in dfA is matched with NLD 2018in dfB. What can I do if I think 4 years difference is too much and I want to limit it to two years?

What do I do when I want to limit the allowed amount of years between dfA and dfB?

Tom
  • 2,173
  • 1
  • 17
  • 44

1 Answers1

4

You have two options:

  1. Use roll = 2 or roll = -2 which will require the nearest to be within 2 years of only one direction.
  2. Add two more columns to dfA to make it an explicit non-equi join.
#perform left join
result <- dfB[dfA, on = c("iso.join", "year.join"), roll = 2 ] 

# or
result <- dfB[dfA, on = c("iso.join", "year.join"), roll = -2 ] 

The non-equi join would require additional work on your part as it does not take a roll = 'nearest' argument so you either need to use mult = 'first' or do a filter in a subsequent operation.

dfA[, `:=`(min_year.join = yearA - 2,
           max_year.join = yearA + 2)]

result <- dfB[dfA,
              on = .(iso.join,
                          year.join <= max_year.join,
                          year.join >= min_year.join)
              #, mult = 'first'
              ]

#drop columns that are not needed
result[, grep("^i\\.", names(result)) := NULL ]
result[, grep("join", names(result)) := NULL ] #removed $

#set column order
setcolorder(result, colorder)
result
Cole
  • 11,130
  • 1
  • 9
  • 24