To illustrate the problem see this example using dplyr/tidyr, answers appreciated using any other packages:
#data
df1 <- data.frame("SNP" = c("rs11807834", "rs3729986"),
"Symbols" = c("GRIN1,SETD1A", "MADD,STAC3,SPI1"))
# SNP Symbols
# 1 rs11807834 GRIN1,SETD1A
# 2 rs3729986 MADD,STAC3,SPI1
As expected Symbols column with multiple genes is gone:
df1 %>%
separate_rows(Symbols, sep = ",")
# # A tibble: 5 x 2
# SNP Symbols
# <chr> <chr>
# 1 rs11807834 GRIN1
# 2 rs11807834 SETD1A
# 3 rs3729986 MADD
# 4 rs3729986 STAC3
# 5 rs3729986 SPI1
I could join the original data to get the values
df1 %>%
separate_rows(Symbols, sep = ",") %>%
left_join(df1, by = "SNP")
# # A tibble: 5 x 3
# SNP Symbols.x Symbols.y
# <chr> <chr> <chr>
# 1 rs11807834 GRIN1 GRIN1,SETD1A
# 2 rs11807834 SETD1A GRIN1,SETD1A
# 3 rs3729986 MADD MADD,STAC3,SPI1
# 4 rs3729986 STAC3 MADD,STAC3,SPI1
# 5 rs3729986 SPI1 MADD,STAC3,SPI1
Or I could paste them back again using group/ungroup:
df1 %>%
separate_rows(Symbols, sep = ",") %>%
group_by(SNP) %>%
mutate(Genes = paste(Symbols, collapse = ",")) %>%
ungroup()
# # A tibble: 5 x 3
# SNP Symbols Genes
# <chr> <chr> <chr>
# 1 rs11807834 GRIN1 GRIN1,SETD1A
# 2 rs11807834 SETD1A GRIN1,SETD1A
# 3 rs3729986 MADD MADD,STAC3,SPI1
# 4 rs3729986 STAC3 MADD,STAC3,SPI1
# 5 rs3729986 SPI1 MADD,STAC3,SPI1
Am I missing something obvious? I'd hoped there would be something like:
# doesn't have such argument
df1 %>%
separate_rows(Symbols, sep = ",", keep = TRUE)