3

To illustrate the problem see this example using dplyr/tidyr, answers appreciated using any other packages:

#data
df1 <- data.frame("SNP" = c("rs11807834", "rs3729986"),
                  "Symbols" = c("GRIN1,SETD1A", "MADD,STAC3,SPI1"))
#          SNP         Symbols
# 1 rs11807834    GRIN1,SETD1A
# 2  rs3729986 MADD,STAC3,SPI1

As expected Symbols column with multiple genes is gone:

df1 %>% 
  separate_rows(Symbols, sep = ",") 
# # A tibble: 5 x 2
#   SNP        Symbols
#   <chr>      <chr>  
# 1 rs11807834 GRIN1  
# 2 rs11807834 SETD1A 
# 3 rs3729986  MADD   
# 4 rs3729986  STAC3  
# 5 rs3729986  SPI1 

I could join the original data to get the values

df1 %>% 
  separate_rows(Symbols, sep = ",") %>% 
  left_join(df1, by = "SNP")
# # A tibble: 5 x 3
#   SNP        Symbols.x Symbols.y      
#   <chr>      <chr>     <chr>          
# 1 rs11807834 GRIN1     GRIN1,SETD1A   
# 2 rs11807834 SETD1A    GRIN1,SETD1A   
# 3 rs3729986  MADD      MADD,STAC3,SPI1
# 4 rs3729986  STAC3     MADD,STAC3,SPI1
# 5 rs3729986  SPI1      MADD,STAC3,SPI1

Or I could paste them back again using group/ungroup:

df1 %>% 
  separate_rows(Symbols, sep = ",") %>% 
  group_by(SNP) %>% 
  mutate(Genes = paste(Symbols, collapse = ",")) %>% 
  ungroup()

#  # A tibble: 5 x 3
#   SNP        Symbols Genes          
#   <chr>      <chr>   <chr>          
# 1 rs11807834 GRIN1   GRIN1,SETD1A   
# 2 rs11807834 SETD1A  GRIN1,SETD1A   
# 3 rs3729986  MADD    MADD,STAC3,SPI1
# 4 rs3729986  STAC3   MADD,STAC3,SPI1
# 5 rs3729986  SPI1    MADD,STAC3,SPI1

Am I missing something obvious? I'd hoped there would be something like:

# doesn't have such argument
df1 %>% 
  separate_rows(Symbols, sep = ",", keep = TRUE)
zx8754
  • 52,746
  • 12
  • 114
  • 209

5 Answers5

4

Yes, the obvious solution is to just mutate a copy of Symbols into a column called Genes before separating its rows.

df1 %>%
  mutate(Genes = Symbols) %>%
  separate_rows(Symbols)
#> # A tibble: 5 x 3
#>   SNP        Symbols Genes          
#>   <chr>      <chr>   <chr>          
#> 1 rs11807834 GRIN1   GRIN1,SETD1A   
#> 2 rs11807834 SETD1A  GRIN1,SETD1A   
#> 3 rs3729986  MADD    MADD,STAC3,SPI1
#> 4 rs3729986  STAC3   MADD,STAC3,SPI1
#> 5 rs3729986  SPI1    MADD,STAC3,SPI1

Created on 2022-11-17 with reprex v2.0.2

Allan Cameron
  • 147,086
  • 7
  • 49
  • 87
3

You can also use cSplit, but it would also require creating a copy of the column. cSplit does have a parameter for drop; however, it can only be used on "wide" datasets (see SO answer).

library(splitstackshape)
library(data.table)

cSplit(data.table(df1)[, Symbols2 := Symbols], "Symbols", ",", "long", type.convert = "as.is")

Output

          SNP Symbols        Symbols2
1: rs11807834   GRIN1    GRIN1,SETD1A
2: rs11807834  SETD1A    GRIN1,SETD1A
3:  rs3729986    MADD MADD,STAC3,SPI1
4:  rs3729986   STAC3 MADD,STAC3,SPI1
5:  rs3729986    SPI1 MADD,STAC3,SPI1
AndrewGB
  • 16,126
  • 5
  • 18
  • 49
2

You can also do it this way:

library(data.table)
setDT(df1)[, .(Genes = strsplit(Symbols,",")[[1]]), SNP][df1,on="SNP"]

Output:

          SNP  Genes         Symbols
1: rs11807834  GRIN1    GRIN1,SETD1A
2: rs11807834 SETD1A    GRIN1,SETD1A
3:  rs3729986   MADD MADD,STAC3,SPI1
4:  rs3729986  STAC3 MADD,STAC3,SPI1
5:  rs3729986   SPI1 MADD,STAC3,SPI1
langtang
  • 22,248
  • 1
  • 12
  • 27
  • Yes, this is same as dplyr solution using joins in my question, but with data.table. – zx8754 Nov 18 '22 at 18:59
  • 1
    You may also include Symbols in the first `j`: `setDT(df1)[, .(Symbols, Genes = strsplit(Symbols,",")[[1]]), SNP]` - Symbols will get recycled to all rows within each SNP. – Henrik Nov 19 '22 at 09:04
2

Probably you can use unnest

df1 %>%
  mutate(Genes = strsplit(Symbols, ",")) %>%
  unnest(Genes)

which gives

# A tibble: 5 × 3
  SNP        Symbols         Genes
  <chr>      <chr>           <chr>
1 rs11807834 GRIN1,SETD1A    GRIN1
2 rs11807834 GRIN1,SETD1A    SETD1A
3 rs3729986  MADD,STAC3,SPI1 MADD
4 rs3729986  MADD,STAC3,SPI1 STAC3
5 rs3729986  MADD,STAC3,SPI1 SPI1

Or, another data.table option but with scan

setDT(df1)[
  ,
  Genes := Symbols
][
  ,
  .(Symbols = scan(text = Symbols, what = "", sep = ",", quiet = TRUE)),
  .(SNP, Genes)
]

which gives

          SNP           Genes Symbols
1: rs11807834    GRIN1,SETD1A   GRIN1
2: rs11807834    GRIN1,SETD1A  SETD1A
3:  rs3729986 MADD,STAC3,SPI1    MADD
4:  rs3729986 MADD,STAC3,SPI1   STAC3
5:  rs3729986 MADD,STAC3,SPI1    SPI1
ThomasIsCoding
  • 96,636
  • 9
  • 24
  • 81
1

Another solution based data.table:

library(data.table)

setDT(df1)[, .(Genes=strsplit(Symbols, ",")[[1]]), by=.(SNP, Symbols)]

          SNP         Symbols  Genes
       <char>          <char> <char>
1: rs11807834    GRIN1,SETD1A  GRIN1
2: rs11807834    GRIN1,SETD1A SETD1A
3:  rs3729986 MADD,STAC3,SPI1   MADD
4:  rs3729986 MADD,STAC3,SPI1  STAC3
5:  rs3729986 MADD,STAC3,SPI1   SPI1