1

I have the attached dataframe.

DATA

structure(list(associated_gene = c(NA, NA, "A4GALT", NA, NA, 
   "NOT FOUND"), chr_name = c("22", "22", "22", "22", "22", "NOT FOUND"
   ), chrom_start = c(42693910L, 42693843L, 42693321L, 42693665L, 
   42693653L, 0L), allele = c("G/A/T", "T/C", "G/C", "C/T", "G/A/T", 
   "NOT FOUND"), refsnp_id = c("rs778598915", "rs11541159", "rs397514502", 
   "rs762949801", "rs776304817", "NOT FOUND")), row.names = c("s3a", 
   "s3b", "s3c", "s3d", "s3e", "s3f"), class = "data.frame")
    associated_gene  chr_name chrom_start    allele   refsnp_id
s3a            <NA>        22    42693910     G/A/T rs778598915
s3b            <NA>        22    42693843       T/C  rs11541159
s3c          A4GALT        22    42693321       G/C rs397514502
s3d            <NA>        22    42693665       C/T rs762949801
s3e            <NA>        22    42693653     G/A/T rs776304817
s3f       NOT FOUND NOT FOUND           0 NOT FOUND   NOT FOUND

I would like to split the allele column by the first "/" into two (Ref & Var) and insert them between $chrom_start and $refsnp_id

The ideal output is:

     associated_gene  chr_name chrom_start   Ref   Var   refsnp_id
s3a            <NA>         22    42693910     G   A/T rs778598915
s3b            <NA>         22    42693843     T     C  rs11541159

I dont know if I can load awk, but in bash I'd do:

cat allele | awk -F"/" '{print $1 "\t" $2}'

Shahin
  • 1,196
  • 1
  • 8
  • 15

3 Answers3

1

We can use extract from tidyr to capture the characters that are not a / ([^/]+) from the start (^) of the string followed by a / and then capture the rest of the characters

library(tidyr)
library(dplyr)
df1 %>%
     extract(allele, into = c("Ref", "Var"), "^([^/]+)/(.*)")
#   associated_gene  chr_name chrom_start  Ref  Var   refsnp_id
#s3a            <NA>        22    42693910    G  A/T rs778598915
#s3b            <NA>        22    42693843    T    C  rs11541159
#s3c          A4GALT        22    42693321    G    C rs397514502
#s3d            <NA>        22    42693665    C    T rs762949801
#s3e            <NA>        22    42693653    G  A/T rs776304817
#s3f       NOT FOUND NOT FOUND           0 <NA> <NA>   NOT FOUND

Or another option is str_split

library(stringr)
do.call(rbind, str_split(df$allele, "/", 2))

Or create a delimiter with sub and read with read.table/read.csv in base R

df1[c("Ref", "Var")] <- read.table(text = sub("/", ";", df1$allele, 
    fixed = TRUE), header = FALSE, sep = ";", 
   stringsAsFactors = FALSE, fill = TRUE, na.strings = c("NOT FOUND", ""))
akrun
  • 874,273
  • 37
  • 540
  • 662
1

Another solution is by using the "stringr" package:

install.packages("stringr")
library(stringr)

Data:

df <- structure(list(associated_gene = c(NA, NA, "A4GALT", NA, NA, 
                               "NOT FOUND"), chr_name = c("22", "22", "22", "22", "22", "NOT FOUND"
                               ), chrom_start = c(42693910L, 42693843L, 42693321L, 42693665L, 
                                                  42693653L, 0L), allele = c("G/A/T", "T/C", "G/C", "C/T", "G/A/T", 
                                                                             "NOT FOUND"), refsnp_id = c("rs778598915", "rs11541159", "rs397514502", 
                                                                                                         "rs762949801", "rs776304817", "NOT FOUND")), row.names = c("s3a", "s3b", "s3c", "s3d", "s3e", "s3f"), class = "data.frame")

Create a new df containing the two new variables:

new_df <- data.frame(
Ref = str_extract(df$allele, "\\w(?=/)"), 
Var = str_extract(df$allele, "(?<=/)\\w.*")
  )
new_df
   Ref  Var
1    G  A/T
2    T    C
3    G    C
4    C    T
5    G  A/T
6 <NA> <NA>

And column-bind new_dfwith df (subtracting the now-obsolete allele column):

cbind(df[,-4], new_df)
    associated_gene  chr_name chrom_start   refsnp_id  Ref  Var
s3a            <NA>        22    42693910 rs778598915    G  A/T
s3b            <NA>        22    42693843  rs11541159    T    C
s3c          A4GALT        22    42693321 rs397514502    G    C
s3d            <NA>        22    42693665 rs762949801    C    T
s3e            <NA>        22    42693653 rs776304817    G  A/T
s3f       NOT FOUND NOT FOUND           0   NOT FOUND <NA> <NA>
Chris Ruehlemann
  • 20,321
  • 4
  • 12
  • 34
1

Depending on how you want your output for the last row, you can use one of the following :

Using tidyr::separate

library(tidyr)
separate(df, allele,into = c("Ref", "Var"),sep = "/",extra = "merge",fill = "right")

#   associated_gene  chr_name chrom_start        Ref  Var   refsnp_id
#s3a            <NA>        22    42693910         G  A/T rs778598915
#s3b            <NA>        22    42693843         T    C  rs11541159
#s3c          A4GALT        22    42693321         G    C rs397514502
#s3d            <NA>        22    42693665         C    T rs762949801
#s3e            <NA>        22    42693653         G  A/T rs776304817
#s3f       NOT FOUND NOT FOUND           0 NOT FOUND <NA>   NOT FOUND

OR with stringr::str_match

stringr::str_match(df$allele, "(.*?)/(.*)")[,c(2, 3)]

#     [,1] [,2] 
#[1,] "G"  "A/T"
#[2,] "T"  "C"  
#[3,] "G"  "C"  
#[4,] "C"  "T"  
#[5,] "G"  "A/T"
#[6,] NA   NA   
Ronak Shah
  • 377,200
  • 20
  • 156
  • 213