1

I have two data sets, df1 and df2. How can I remove the gene names found in df2 from the Genes column of df1?

df1<-

chr   start   end     CNA       Genes                  No.of.Gene
   1    13991   1401    gain    Cfh,Gm26048,Bhis,Sclm       2
   1    14011   1490    gain    Zfp788,Rik                  2

df2 <-

       Genes
      Gm26048
        Gif
        Tl2
        Rik

expected output

           chr   start   end     CNA    Genes                No.of.Gene
           1    13991   1401    gain     Cfh,Bhis,Sclm              2
           1    14011   1490    gain    Zfp788                      2
zx8754
  • 52,746
  • 12
  • 114
  • 209
beginner
  • 411
  • 1
  • 5
  • 13

4 Answers

3

You can use,

# For each row: split the comma-separated gene list, drop every gene that is
# listed in df2, and join what is left back into a single string.
# vapply() guarantees a character vector with one element per row; sapply()
# would return a list (or a matrix) as soon as rows keep more than one gene,
# which cannot be stored as a plain character column.
df1$Genes <- vapply(
  strsplit(as.character(df1$Genes), ",", fixed = TRUE),
  function(i) paste(setdiff(i, as.character(df2$Genes)), collapse = ","),
  character(1)
)

df1
#  chr start  end  CNA  Genes No.of.Gene
#1   1 13991 1401 gain    Cfh          2
#2   1 14011 1490 gain Zfp788          2

EDIT

After you changed df1, use the following to get the expected result:

# Split each row's gene list, remove the genes listed in df2, then collapse
# the remainder into one comma-separated string per row.
# lapply() always yields one list element per row; a nested sapply() would
# simplify to a matrix when every row keeps the same number (> 1) of genes,
# and paste() would then be applied per cell instead of per row.
vapply(
  lapply(
    strsplit(as.character(df1$Genes), ",", fixed = TRUE),
    setdiff,
    y = as.character(df2$Genes)
  ),
  paste,
  character(1),
  collapse = ","
)
#[1] "Cfh,Bhis,Sclm" "Zfp788"
Sotos
  • 51,121
  • 6
  • 32
  • 66
3

Another option is using gsub

# Delete every df2 gene from the comma-separated df1$Genes strings.
# - \Q...\E quotes each name so regex metacharacters are matched literally.
# - The pattern only matches a whole list entry: it must be preceded by the
#   start of the string or a comma and followed by a comma or the end, so a
#   name such as "Rik" cannot clip a longer gene that merely contains it.
# - The entry is removed together with its preceding comma; the outer sub()
#   drops the comma left behind when the first entry of a row was removed.
df1$Genes <- sub(
  "^,", "",
  gsub(
    paste0(
      "(^|,)(",
      paste0("\\Q", df2$Genes, "\\E", collapse = "|"),
      ")(?=,|$)"
    ),
    "", df1$Genes,
    perl = TRUE
  )
)
df1$Genes
#[1] "Cfh,Bhis,Sclm" "Zfp788"  
akrun
  • 874,273
  • 37
  • 540
  • 662
3

We can convert the Genes column into rows, then use filter:

# Example data: both tables are parsed from inline, whitespace-separated text
# whose first non-blank line holds the column names.
df1 <- read.table(
  header = TRUE,
  text = "
chr   start   end     CNA       Genes                  No.of.Gene
1    13991   1401    gain    Cfh,Gm26048,Bhis,Sclm       2
1    14011   1490    gain    Zfp788,Rik                  2"
)
# Genes that should be removed from df1$Genes.
df2 <- read.table(
  header = TRUE,
  text = "
Genes
Gm26048
Gif
Tl2
Rik"
)

library(dplyr)
library(tidyr)

# Intersect: expand Genes to one row per gene (new column Gene) and keep only
# the rows whose gene is also listed in df2.
df1 %>%
  mutate(Gene = as.character(Genes) %>% strsplit(",")) %>%
  unnest(Gene) %>%
  filter(is.element(Gene, df2$Genes))

#     chr start   end    CNA                 Genes No.of.Gene    Gene
#   (int) (int) (int) (fctr)                (fctr)      (int)   (chr)
# 1     1 13991  1401   gain Cfh,Gm26048,Bhis,Sclm          2 Gm26048
# 2     1 14011  1490   gain            Zfp788,Rik          2     Rik

# Setdiff: expand Genes to one row per gene (new column Gene) and keep only
# the rows whose gene is NOT listed in df2.
df1 %>%
  mutate(Gene = as.character(Genes) %>% strsplit(",")) %>%
  unnest(Gene) %>%
  filter(!is.element(Gene, df2$Genes))

#     chr start   end    CNA                 Genes No.of.Gene   Gene
#    (int) (int) (int) (fctr)                (fctr)      (int)  (chr)
# 1     1 13991  1401   gain Cfh,Gm26048,Bhis,Sclm          2    Cfh
# 2     1 13991  1401   gain Cfh,Gm26048,Bhis,Sclm          2   Bhis
# 3     1 13991  1401   gain Cfh,Gm26048,Bhis,Sclm          2   Sclm
# 4     1 14011  1490   gain            Zfp788,Rik          2 Zfp788
zx8754
  • 52,746
  • 12
  • 114
  • 209
0

Pattern 1 (pattn1) will take care of removing the genes listed in df2, while pattn2 will remove any leading or trailing commas:

# str_replace_all() comes from stringr.
library(stringr)

# pattn1 matches one whole entry of the comma-separated list together with its
# preceding comma (or the start of the string). Each df2 name is quoted with
# \Q...\E so it is matched literally, and the lookahead requires a comma or
# the end of the string after it, so genes that merely contain a df2 name are
# left untouched and no doubled commas are left in the middle of a list.
pattn1 <- paste0(
  "(^|,)(",
  paste0("\\Q", df2$Genes, "\\E", collapse = "|"),
  ")(?=,|$)"
)
df1$Genes <- str_replace_all(df1$Genes, pattn1, "")

# pattn2 strips a comma left at the very start (first entry removed) or at the
# very end of the string.
pattn2 <- "^,|,$"
df1$Genes <- str_replace_all(df1$Genes, pattn2, "")

## Results ##

  chr start  end  CNA         Genes No.of.Gene
1   1 13991 1401 gain Cfh,Bhis,Sclm          2
2   1 14011 1490 gain        Zfp788          2
Dominic Comtois
  • 10,230
  • 1
  • 39
  • 61