Match two datasets with multiple appearances of the same gene

Question

I have two datasets

set1 <- structure(list(gene = c("ENSG00000003096", "ENSG00000011677", "ENSG00000019169", 
"ENSG00000022556", "ENSG00000029534"), bM = c(5069.84617263404, 
1339.17994216287, 38.6160408376658, 589.853084670642, 2805.5926601769
), fold = c(2.85993763274585, 4.90104282563152, 5.16621421186109, 
11.1359115874284, 2.95328623121562), value = c(7.37447235027197e-07, 
2.76009547949742e-10, 1.56142531487242e-10, 1.97711024002255e-21, 
8.60063581030308e-12), symbol = c("KLHL13", "GABPA", "MARCO", "NLRP2", 
"ANK1")), class = "data.frame", row.names = c(NA, -5L))

set2 <- structure(list(symbol = c("KLHL13", "KLHL13", "KLHL13", "GABPA", 
"GABPA", "GABPA", "MARCO", "MARCO"), geneID = c(90293L, 90293L, 90293L, 
2551L, 2551L, 2551L, 8685L, 8685L), pathway = c("Class I MHC mediated antigen processing & presentation", 
"Immune System", "Ubiquitin mediated proteolysis", "Mitochondrial biogenesis", 
"Organelle biogenesis and maintenance", "Transcriptional activation of mitochondrial biogenesis", 
"Binding and Uptake of Ligands by Scavenger Receptors", "Phagosome"
), pathwayID = c("REACT:R-HSA-983169", "REACT:R-HSA-168256", "KEGG:hsa04120", 
"REACT:R-HSA-1592230", "REACT:R-HSA-1852241", "REACT:R-HSA-2151201", 
"REACT:R-HSA-2173782", "KEGG:hsa04145")), class = "data.frame", row.names = c(NA, 
-8L))

My desired output is:

ENSG00000003096 5069.84617263404    2.85993763274585    7.37447235027197e-07    KLHL13  KLHL13  90293   Class I MHC mediated antigen processing & presentation  REACT:R-HSA-983169
ENSG00000003096 5069.84617263404    2.85993763274585    7.37447235027197e-07    KLHL13  KLHL13  90293   Immune System   REACT:R-HSA-168256
ENSG00000003096 5069.84617263404    2.85993763274585    7.37447235027197e-07    KLHL13  KLHL13  90293   Ubiquitin mediated proteolysis  KEGG:hsa04120
ENSG00000011677 1339.17994216287    4.90104282563152    2.76009547949742e-10    GABPA  GABPA    2551    Mitochondrial biogenesis    REACT:R-HSA-1592230
ENSG00000011677 1339.17994216287    4.90104282563152    2.76009547949742e-10    GABPA  GABPA    2551    Organelle biogenesis and maintenance    REACT:R-HSA-1852241
ENSG00000011677 1339.17994216287    4.90104282563152    2.76009547949742e-10    GABPA  GABPA    2551    Transcriptional activation of mitochondrial biogenesis  REACT:R-HSA-2151201
ENSG00000019169 38.6160408376658    5.16621421186109    1.56142531487242e-10    MARCO  MARCO    8685    Binding and Uptake of Ligands by Scavenger Receptors    REACT:R-HSA-2173782
ENSG00000019169 38.6160408376658    5.16621421186109    1.56142531487242e-10    MARCO  MARCO    8685    Phagosome   KEGG:hsa04145
ENSG00000022556 589.853084670642    11.1359115874284    1.97711024002255e-21    NLRP2  NA  NA  NA  NA
ENSG00000029534 2805.5926601769 2.95328623121562    8.60063581030308e-12    ANK1  NA  NA  NA  NA

If I use `merge` (attempt 1 below), I lose the genes that have no match in `set2`. If I use `match` (attempt 2 below), I get only the first matching row of `set2` for each symbol. How can I get the output I want?

1. merge(set1, set2, by=c("symbol"))
2. set1[, (ncol(set1)+1):((ncol(set1))+ncol(set2))]<- set2[match(set1$symbol, set2$symbol), ]

please provide us with the dataset using `dput(set1)` and `dput(set2)` — DeBARtha, Nov 30 '21 at 14:59
you probably want a `dplyr::full_join`, i.e., `dplyr::full_join(set1, set2, by = "symbol")` — tjebo, Nov 30 '21 at 15:16

score 1 · Accepted Answer · answered Nov 30 '21 at 15:22

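`dplyr::left_join` or `dplyr::full_join` will do the job: both keep every row of `set1` (filling `NA` where there is no match) and repeat a `set1` row once for each matching row in `set2`: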

> full_join(set1,set2,by="symbol")
              gene         bM      fold        value symbol geneID                                                pathway
1  ENSG00000003096 5069.84617  2.859938 7.374472e-07 KLHL13  90293 Class I MHC mediated antigen processing & presentation
2  ENSG00000003096 5069.84617  2.859938 7.374472e-07 KLHL13  90293                                          Immune System
3  ENSG00000003096 5069.84617  2.859938 7.374472e-07 KLHL13  90293                         Ubiquitin mediated proteolysis
4  ENSG00000011677 1339.17994  4.901043 2.760095e-10  GABPA   2551                               Mitochondrial biogenesis
5  ENSG00000011677 1339.17994  4.901043 2.760095e-10  GABPA   2551                   Organelle biogenesis and maintenance
6  ENSG00000011677 1339.17994  4.901043 2.760095e-10  GABPA   2551 Transcriptional activation of mitochondrial biogenesis
7  ENSG00000019169   38.61604  5.166214 1.561425e-10  MARCO   8685   Binding and Uptake of Ligands by Scavenger Receptors
8  ENSG00000019169   38.61604  5.166214 1.561425e-10  MARCO   8685                                              Phagosome
9  ENSG00000022556  589.85308 11.135912 1.977110e-21  NLRP2     NA                                                   <NA>
10 ENSG00000029534 2805.59266  2.953286 8.600636e-12   ANK1     NA                                                   <NA>
             pathwayID
1   REACT:R-HSA-983169
2   REACT:R-HSA-168256
3        KEGG:hsa04120
4  REACT:R-HSA-1592230
5  REACT:R-HSA-1852241
6  REACT:R-HSA-2151201
7  REACT:R-HSA-2173782
8        KEGG:hsa04145
9                 <NA>
10                <NA>

> left_join(set1,set2,by="symbol")
              gene         bM      fold        value symbol geneID                                                pathway
1  ENSG00000003096 5069.84617  2.859938 7.374472e-07 KLHL13  90293 Class I MHC mediated antigen processing & presentation
2  ENSG00000003096 5069.84617  2.859938 7.374472e-07 KLHL13  90293                                          Immune System
3  ENSG00000003096 5069.84617  2.859938 7.374472e-07 KLHL13  90293                         Ubiquitin mediated proteolysis
4  ENSG00000011677 1339.17994  4.901043 2.760095e-10  GABPA   2551                               Mitochondrial biogenesis
5  ENSG00000011677 1339.17994  4.901043 2.760095e-10  GABPA   2551                   Organelle biogenesis and maintenance
6  ENSG00000011677 1339.17994  4.901043 2.760095e-10  GABPA   2551 Transcriptional activation of mitochondrial biogenesis
7  ENSG00000019169   38.61604  5.166214 1.561425e-10  MARCO   8685   Binding and Uptake of Ligands by Scavenger Receptors
8  ENSG00000019169   38.61604  5.166214 1.561425e-10  MARCO   8685                                              Phagosome
9  ENSG00000022556  589.85308 11.135912 1.977110e-21  NLRP2     NA                                                   <NA>
10 ENSG00000029534 2805.59266  2.953286 8.600636e-12   ANK1     NA                                                   <NA>
             pathwayID
1   REACT:R-HSA-983169
2   REACT:R-HSA-168256
3        KEGG:hsa04120
4  REACT:R-HSA-1592230
5  REACT:R-HSA-1852241
6  REACT:R-HSA-2151201
7  REACT:R-HSA-2173782
8        KEGG:hsa04145
9                 <NA>
10                <NA>

You can also use plyr::join_all

> join_all(list(set1,set2),by="symbol")
              gene         bM      fold        value symbol geneID                                                pathway
1  ENSG00000003096 5069.84617  2.859938 7.374472e-07 KLHL13  90293 Class I MHC mediated antigen processing & presentation
2  ENSG00000003096 5069.84617  2.859938 7.374472e-07 KLHL13  90293                                          Immune System
3  ENSG00000003096 5069.84617  2.859938 7.374472e-07 KLHL13  90293                         Ubiquitin mediated proteolysis
4  ENSG00000011677 1339.17994  4.901043 2.760095e-10  GABPA   2551                               Mitochondrial biogenesis
5  ENSG00000011677 1339.17994  4.901043 2.760095e-10  GABPA   2551                   Organelle biogenesis and maintenance
6  ENSG00000011677 1339.17994  4.901043 2.760095e-10  GABPA   2551 Transcriptional activation of mitochondrial biogenesis
7  ENSG00000019169   38.61604  5.166214 1.561425e-10  MARCO   8685   Binding and Uptake of Ligands by Scavenger Receptors
8  ENSG00000019169   38.61604  5.166214 1.561425e-10  MARCO   8685                                              Phagosome
9  ENSG00000022556  589.85308 11.135912 1.977110e-21  NLRP2     NA                                                   <NA>
10 ENSG00000029534 2805.59266  2.953286 8.600636e-12   ANK1     NA                                                   <NA>
             pathwayID
1   REACT:R-HSA-983169
2   REACT:R-HSA-168256
3        KEGG:hsa04120
4  REACT:R-HSA-1592230
5  REACT:R-HSA-1852241
6  REACT:R-HSA-2151201
7  REACT:R-HSA-2173782
8        KEGG:hsa04145
9                 <NA>
10                <NA>

Match two datasets with multiple appearance of the same gene

1 Answer