5

I have two data sets of at least 420,500 observations each, e.g.

dataset1 <- data.frame(col1=c("microsoft","apple","vmware","delta","microsoft"),
                     col2=paste0(c("a","b","c",4,"asd"),".exe"),
                     col3=rnorm(5))

dataset2 <- data.frame(col1=c("apple","cisco","proactive","dtex","microsoft"),
                     col2=paste0(c("a","b","c",4,"asd"),".exe"),
                     col3=rnorm(5))
> dataset1
       col1    col2 col3
1 microsoft   a.exe    2
2     apple   b.exe    1
3    vmware   c.exe    3
4     delta   4.exe    4
5 microsoft asd.exe    5
> dataset2
       col1    col2 col3
1     apple   a.exe    3
2     cisco   b.exe    4
3    vmware   d.exe    1
4     delta   5.exe    5
5 microsoft asd.exe    2

I would like to print all the observations in dataset1 that do not intersect one in dataset2 (comparing both col1 and col2 in each), which in this case would print everything except the last observation - observations 1 & 2 match on col2 but not col1 and observation 3 & 4 match on col1 but not col2, i.e.:

        col1  col2 col3 
1:     apple b.exe    1 
2:     delta 4.exe    4 
3: microsoft a.exe    2 
4:    vmware c.exe    3 
Hack-R
  • 22,422
  • 14
  • 75
  • 131
vardha
  • 414
  • 7
  • 20

2 Answers2

5

You could use anti_join from dplyr

 library(dplyr)
 anti_join(df1, df2, by = c('col1', 'col2'))
 #      col1  col2       col3
 #1     delta 4.exe -0.5836272
 #2    vmware c.exe  0.4196231
 #3     apple b.exe  0.5365853
 #4 microsoft a.exe -0.5458808

data

 set.seed(24)
 df1 <- data.frame(col1 = c('microsoft', 'apple', 'vmware', 'delta', 
 'microsoft'), col2= c('a.exe', 'b.exe', 'c.exe', '4.exe', 'asd.exe'), 
    col3=rnorm(5), stringsAsFactors=FALSE)
 set.seed(22)
 df2 <- data.frame(col1 = c( 'apple', 'cisco', 'proactive', 'dtex', 
 'microsoft'), col2= c('a.exe', 'b.exe', 'c.exe', '4.exe', 'asd.exe'), 
  col3=rnorm(5), stringsAsFactors=FALSE)
akrun
  • 874,273
  • 37
  • 540
  • 662
4

data.table solution inspired by this:

library(data.table) #1.9.5+
setDT(dataset1,key=c("col1","col2"))
setDT(dataset2,key=key(dataset1))
dataset1[!dataset2]

        col1  col2 col3
1:     apple b.exe    1
2:     delta 4.exe    4
3: microsoft a.exe    2
4:    vmware c.exe    3

You could also try without keying:

library(data.table) #1.9.5+
setDT(dataset1); setDT(dataset2)
dataset1[!dataset2,on=c("col1","col2")]
Community
  • 1
  • 1
MichaelChirico
  • 33,841
  • 14
  • 113
  • 198