3

I have a txt file which contains 20 columns and 300 rows. The sample of my data is given below.

id  sub     A1                      A2      B1           B2                    C1   
96  AAA 01:01:01:01/01:01:01:02N        29:02:01    08:01:01/08:19N 44:03:01/44:03:03/44:03:04  07:01:01/07:01:02
97  AAA 03:01:01:01/03:01:01:02N        30:08:01    09:02:01/08:19N 44:03:01/44:03:03/44:03:04  07:01:01/07:01:02
98 AAA  01:01:01:01/01:01:01:02N/01:22N 29:02:01    08:01:01/08:19N 44:03:01/44:03:03/44:03:04  07:09:01/07:01:02
99  AAA 03:01:01:01                     30:08:01    09:02:01/08:19N 44:03:01/44:03:03/44:03:04  07:08:01/07:01:02 

I need to seperate the columns (A1,A2,B1....) with the seperator "/" using r. The output would be:

   id   sub A1_1      A1_2         A2       B1_1     B1_2    B2_1  B2_2   ..
96  AAA 01:01:01:01   01:01:01:02N      29:02:01    08:01:01     08:19N      44:03:01  44:03:03   44:03:04  ...

I could find functions to split one columns into multiple columns. But I could not find a solution to achieve this.

4 Answers4

8

Here is a tidyverse solution.

library(tidyverse)
df %>% 
 gather(key, value, -c(1:2)) %>% 
 separate_rows(value, sep = "/") %>% 
 group_by(key, id) %>% 
 mutate(key2 = paste0(key, "_", seq_along(key))) %>%
 ungroup() %>% 
 select(-key) %>% 
 spread(key2, value)

# A tibble: 4 x 13
# id      sub   A1_1    A1_2     A1_3 A2_1 B1_1 B1_2 B2_1 B2_2 B2_3 C1_1 C1_2
#* <fct>   <fct> <chr>       <chr>        <chr>    <chr>    <chr>    <chr>    <chr>    <chr>    <chr>    <chr>    <chr>   
#1 96 AAA   01:01:01:01 01:01:01:02N <NA>     29:02:01 08:01:01 08:19N   44:03:01 44:03:03 44:03:04 07:01:01 07:01:02
#2 97 AAA   03:01:01:01 03:01:01:02N <NA>     30:08:01 09:02:01 08:19N   44:03:01 44:03:03 44:03:04 07:01:01 07:01:02
#3 98 AAA   01:01:01:01 01:01:01:02N 01:22N   29:02:01 08:01:01 08:19N   44:03:01 44:03:03 44:03:04 07:09:01 07:01:02
#4 99 AAA   03:01:01:01 <NA>         <NA>     30:08:01 09:02:01 08:19N   44:03:01 44:03:03 44:03:04 07:08:01 07:01:02

After gathering columns all columns except the first and the second (-c(1:2)), I used tidyr::separate_rows to separate the values in newly created column value by "/". After creating a new column key2 which is column key with the extension _1:number of separators, I unselected column key and spread column key2 by value.

data

df <- structure(list(id = structure(1:4, .Label = c("96", "97", 
"98", "99"), class = "factor"), sub = structure(c(1L, 
1L, 1L, 1L), .Label = "AAA", class = "factor"), A_A1 = structure(c(1L, 
4L, 2L, 3L), .Label = c("01:01:01:01/01:01:01:02N", "01:01:01:01/01:01:01:02N/01:22N", 
"03:01:01:01", "03:01:01:01/03:01:01:02N"), class = "factor"), 
A_A2 = structure(c(1L, 2L, 1L, 2L), .Label = c("29:02:01", 
"30:08:01"), class = "factor"), B_B1 = structure(c(1L, 
2L, 1L, 2L), .Label = c("08:01:01/08:19N", "09:02:01/08:19N"
), class = "factor"), B_B2 = structure(c(1L, 1L, 1L, 1L
), .Label = "44:03:01/44:03:03/44:03:04", class = "factor"), 
C1 = structure(c(1L, 1L, 3L, 2L), .Label = c("07:01:01/07:01:02", 
"07:08:01/07:01:02", "07:09:01/07:01:02"), class = "factor")), .Names = c("id", 
"sub", "A_A1", "A_A2", "B_B1", "B_B2", "C_C1"), class = "data.frame", row.names = c(NA, 
-4L))
markus
  • 25,843
  • 5
  • 39
  • 58
3

I'd take an approach like the following:

library(data.table)
setDT(df) # convert to a data.table

# identify the columns you want to split
cols <- grep("^HLA", names(df), value = TRUE)

# loop through them and split them
# assign them back to the data.table, by reference
for (i in cols) {
  temp <- tstrsplit(df[[i]], "/")
  set(df, j = sprintf("%s_%d", i, seq_along(temp)), value = temp)
  set(df, j = i, value = NULL)
}

Here's the result:

df[]
#         id sub    HLA_A1_1     HLA_A1_2 HLA_A1_3 HLA_A2_1 HLA_B1_1 HLA_B1_2 HLA_B2_1 HLA_B2_2 HLA_B2_3 HLA_C1_1 HLA_C1_2
# 1: HG00096 GBR 01:01:01:01 01:01:01:02N       NA 29:02:01 08:01:01   08:19N 44:03:01 44:03:03 44:03:04 07:01:01 07:01:02
# 2: HG00097 GBR 03:01:01:01 03:01:01:02N       NA 30:08:01 09:02:01   08:19N 44:03:01 44:03:03 44:03:04 07:01:01 07:01:02
# 3: HG00098 GBR 01:01:01:01 01:01:01:02N   01:22N 29:02:01 08:01:01   08:19N 44:03:01 44:03:03 44:03:04 07:09:01 07:01:02
# 4: HG00099 GBR 03:01:01:01           NA       NA 30:08:01 09:02:01   08:19N 44:03:01 44:03:03 44:03:04 07:08:01 07:01:02

Aside from being easier to scale than the accepted answer (things aren't really hard-coded), this is at least twice as fast as that approach, and a lot faster than the "tidyverse" approach, which is quite inefficient because it first makes the data very long before going back into a wide format.


Benchmarks

To get a sense of the performance difference, try the following:

Test functions

myfun <- function(df) {
  cols <- grep("^HLA", names(df), value = TRUE)
  for (i in cols) {
    temp <- tstrsplit(df[[i]], "/")
    set(df, j = sprintf("%s_%d", i, seq_along(temp)), value = temp)
    set(df, j = i, value = NULL)
  }
  df[]
}

tidyfun <- function(df) {
  df %>% 
    gather(key, value, -c(1:2)) %>% 
    separate_rows(value, sep = "/") %>% 
    group_by(key, id) %>% 
    mutate(key2 = paste0(key, "_", seq_along(key))) %>%
    ungroup() %>% 
    select(-key) %>% 
    spread(key2, value)
}

getIt <- function(df,col) {    
  x <- max(sapply(strsplit(as.character(df[,col]),split="/"),length))
  q <- colsplit(string = as.character(df[,col]),pattern="/",
                names = paste0(names(df)[col],"_",LETTERS[1:x]))
  return(q)
}    

reshape2fun <- function(dfdf) {
  cbind(dfdf[,1:2], getIt(dfdf,3), getIt(dfdf,4), getIt(dfdf,5), getIt(dfdf,6))
}

4 rows....

library(microbenchmark)
dfdf <- as.data.frame(df)
microbenchmark(myfun(copy(df)), reshape2fun(dfdf), tidyfun(df))
# Unit: microseconds
#               expr      min         lq       mean    median         uq      max neval
#    myfun(copy(df))   241.55   272.5965   625.7634   359.148   380.0395 28632.94   100
#  reshape2fun(dfdf)  5076.24  5368.3835  5841.8784  5539.577  5639.8765 34176.13   100
#        tidyfun(df) 37864.68 39435.1915 41152.5916 39801.499 40489.7055 70019.04   100

10,000 rows....

biggerdf <- rbindlist(replicate(2500, df, FALSE)) # nrow = 10,000
dfdf <- as.data.frame(biggerdf)
microbenchmark(myfun(copy(biggerdf)), reshape2fun(dfdf), tidyfun(biggerdf), times = 10)
# Unit: milliseconds
#                   expr        min        lq       mean     median         uq        max neval
#  myfun(copy(biggerdf))   50.87452   52.0059   54.59288   53.03503   53.79347   68.69892    10
#      reshape2fun(dfdf)  120.90291  124.3893  137.54154  126.06213  157.50532  159.15069    10
#      tidyfun(biggerdf) 1312.75422 1350.6651 1394.93082 1358.21612 1373.86793 1732.86521    10

1,000,000 rows....

BIGGERdf <- rbindlist(replicate(100, biggerdf, FALSE)) # nrow = 1,000,000
dfdf <- as.data.frame(BIGGERdf)
system.time(tidyfun(BIGGERdf)) # > 2 minutes!
#    user  system elapsed 
# 141.373   1.048 142.403 

microbenchmark(myfun(copy(BIGGERdf)), reshape2fun(dfdf), times = 5)
# Unit: seconds
#                   expr      min       lq     mean   median        uq       max neval
#  myfun(copy(BIGGERdf)) 5.180048 5.574677 6.026515 5.764467  6.498967  7.114415     5
#      reshape2fun(dfdf) 8.858202 9.095027 9.629969 9.264896 10.192161 10.739560     5
A5C1D2H2I1M1N2O1R2T1
  • 190,393
  • 28
  • 405
  • 485
  • This is obviously a really good solution, though I have to admit that it's hard for me to grasp what's going on in the loop. (+1) – markus Feb 11 '18 at 09:33
  • 1
    @markus, the `set()` function in "data.table" lets you replace values without copying. So, starting from the columns that need to be split (`"cols"`), we (1) split them using `tstrsplit()` which creates a list with each item being the same length (so they can be seen as columns). (2) Using the length of the list, we specify the names of the columns we're creating in the "j" argument, and the values to insert in the "value" argument. (3) Using the "j" argument in another call to `set()`, we also remove the original column that was split. Repeat for each column in the loop. – A5C1D2H2I1M1N2O1R2T1 Feb 11 '18 at 11:46
2

I suggest a reshape2 solution taking care of not knowing the number of parts:

> dput(pz1)
structure(list(id = c("HG00096", "HG00097", "HG00098", "HG00099"
), sub = c("GBR", "GBR", "GBR", "GBR"), HLA_A1 = c("01:01:01:01/01:01:01:02N", 
"03:01:01:01/03:01:01:02N", "01:01:01:01/01:01:01:02N/01:22N", 
"03:01:01:01"), HLA_A2 = c("29:02:01", "30:08:01", "29:02:01", 
"30:08:01"), HLA_B1 = c("08:01:01/08:19N", "09:02:01/08:19N", 
"08:01:01/08:19N", "09:02:01/08:19N"), HLA_B2 = c("44:03:01/44:03:03/44:03:04", 
"44:03:01/44:03:03/44:03:04", "44:03:01/44:03:03/44:03:04", "44:03:01/44:03:03/44:03:04"
), HLA_C1 = c("07:01:01/07:01:02", "07:01:01/07:01:02", "07:09:01/07:01:02", 
"07:08:01/07:01:02")), .Names = c("id", "sub", "HLA_A1", "HLA_A2", 
"HLA_B1", "HLA_B2", "HLA_C1"), row.names = c(NA, -4L), class = "data.frame")

add this function:

library("reshape2", lib.loc="~/R/win-library/3.3")

getIt <- function(df,col) {    
x <- max(sapply(strsplit(df[,col],split="/"),length))   ### get the max parts for column
q <- colsplit(string = df[,col],pattern="/",names = paste0(names(df)[col],"_",LETTERS[1:x]))
return(q) }

after you have this function you can easily do:

> getIt(pz1,3)
     HLA_A1_A     HLA_A1_B HLA_A1_C
1 01:01:01:01 01:01:01:02N         
2 03:01:01:01 03:01:01:02N         
3 01:01:01:01 01:01:01:02N   01:22N
4 03:01:01:01                      

and a simple cbind with the original dataframe (with or without the original columns) :

> cbind(pz1[,1:2],getIt(pz1,3),getIt(pz1,4),getIt(pz1,5),getIt(pz1,6))
       id sub    HLA_A1_A     HLA_A1_B HLA_A1_C HLA_A2_A HLA_B1_A HLA_B1_B HLA_B2_A HLA_B2_B HLA_B2_C
1 HG00096 GBR 01:01:01:01 01:01:01:02N          29:02:01 08:01:01   08:19N 44:03:01 44:03:03 44:03:04
2 HG00097 GBR 03:01:01:01 03:01:01:02N          30:08:01 09:02:01   08:19N 44:03:01 44:03:03 44:03:04
3 HG00098 GBR 01:01:01:01 01:01:01:02N   01:22N 29:02:01 08:01:01   08:19N 44:03:01 44:03:03 44:03:04
4 HG00099 GBR 03:01:01:01                       30:08:01 09:02:01   08:19N 44:03:01 44:03:03 44:03:04
Zahiro Mor
  • 1,708
  • 1
  • 16
  • 30
0

I second @Sotos advice, it is important to write a reproducible example so the focus is only on the problem at hand.

I came up with this fake data to try to answer your question:

> df <- data.frame(
+   id = c(1:5),
+   sub = sample(c("GBR", "BRA"), size = 5, replace = T),
+   HLA_A = paste0(rep("01:01", 5), "/", rep("01:02N")), 
+   HLA_B = paste0(rep("01:03", 5), "/", "01:42N", "/", "32:20"), 
+   HLA_C = paste0(rep("01:03", 5)), stringsAsFactors = F)
> 
> 
> df
  id sub        HLA_A              HLA_B HLA_C
1  1 GBR 01:01/01:02N 01:03/01:42N/32:20 01:03
2  2 BRA 01:01/01:02N 01:03/01:42N/32:20 01:03
3  3 GBR 01:01/01:02N 01:03/01:42N/32:20 01:03
4  4 GBR 01:01/01:02N 01:03/01:42N/32:20 01:03
5  5 BRA 01:01/01:02N 01:03/01:42N/32:20 01:03

You can use strsplit() to split the column by a given character (in this case "/"). Use do.call(rbind, .) to bind the lists in column format. Repeat this process for the columns you wish to target and them bind them all with the id and sub columns. Here is the solution:

Without using any dependencies:

> col.ind <- grep(x = names(df), pattern = "HLA", value = T, ignore.case = T) # your target columns
> 
> # lapply to loop the column split process, output is a list, so you need to columb-bind the resulting objects
> 
> cols.list <- lapply(seq_along(col.ind), function(x){
+ 
+   p1 <- do.call(rbind, strsplit(df[[col.ind[[x]]]], split = "/")) # split col by "/" 
+   
+   p2 <- data.frame(p1, stringsAsFactors = F)  # make it into a data.frame
+   
+   i <- ncol(p2) # this is an index placeholder that will enable you to rename the recently split columns in a sequential manner
+   
+   colnames(p2) <- paste0(col.ind[[x]], c(1:i)) # rename columns 
+   
+   return(p2) # return the object of interest
+ }
+ )
> 
> 
> new.df <- cbind(df[1:2], do.call(cbind, cols.list)) # do.call once again to bind the lapply object and column-bind those with the first two columns of your initial data.frame
> new.df
  id sub HLA_A1 HLA_A2 HLA_B1 HLA_B2 HLA_B3 HLA_C1
1  1 GBR  01:01 01:02N  01:03 01:42N  32:20  01:03
2  2 BRA  01:01 01:02N  01:03 01:42N  32:20  01:03
3  3 GBR  01:01 01:02N  01:03 01:42N  32:20  01:03
4  4 GBR  01:01 01:02N  01:03 01:42N  32:20  01:03
5  5 BRA  01:01 01:02N  01:03 01:42N  32:20  01:03
JdeMello
  • 1,708
  • 15
  • 23
  • Thank you for the solution. I got the following warning: Warning messages: 1: In (function (..., deparse.level = 1) : number of columns of result is not a multiple of vector length (arg 1) 1 –  Feb 09 '18 at 00:22