I'd take an approach like the following:
library(data.table)
setDT(df) # convert to a data.table

# identify the columns you want to split
cols <- grep("^HLA", names(df), value = TRUE)

# loop through them and split them,
# assigning the pieces back to the data.table, by reference
for (i in cols) {
  temp <- tstrsplit(df[[i]], "/")
  set(df, j = sprintf("%s_%d", i, seq_along(temp)), value = temp)
  set(df, j = i, value = NULL)
}
Here's the result:
df[]
# id sub HLA_A1_1 HLA_A1_2 HLA_A1_3 HLA_A2_1 HLA_B1_1 HLA_B1_2 HLA_B2_1 HLA_B2_2 HLA_B2_3 HLA_C1_1 HLA_C1_2
# 1: HG00096 GBR 01:01:01:01 01:01:01:02N NA 29:02:01 08:01:01 08:19N 44:03:01 44:03:03 44:03:04 07:01:01 07:01:02
# 2: HG00097 GBR 03:01:01:01 03:01:01:02N NA 30:08:01 09:02:01 08:19N 44:03:01 44:03:03 44:03:04 07:01:01 07:01:02
# 3: HG00098 GBR 01:01:01:01 01:01:01:02N 01:22N 29:02:01 08:01:01 08:19N 44:03:01 44:03:03 44:03:04 07:09:01 07:01:02
# 4: HG00099 GBR 03:01:01:01 NA NA 30:08:01 09:02:01 08:19N 44:03:01 44:03:03 44:03:04 07:08:01 07:01:02
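The splitting itself is done by tstrsplit(), which returns a list with one vector per split position, padding shorter entries with NA (which is where the NA cells above come from), so set() can assign all of the new columns in one call. A quick illustration:

tstrsplit(c("01:01/02:01", "03:01"), "/")
# [[1]]
# [1] "01:01" "03:01"
#
# [[2]]
# [1] "02:01" NA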
Aside from scaling better than the accepted answer (nothing needs to be hard-coded), this is at least twice as fast as that approach, and a lot faster than the "tidyverse" approach, which is quite inefficient because it first reshapes the data into a very long form before spreading it back out to wide.
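If you want to see where the "tidyverse" version spends its time, a minimal sketch (assuming dplyr and tidyr are loaded, as in tidyfun below) is to stop that pipeline right after the reshaping steps and count the rows of the intermediate long table that spread() then has to fold back up:

df %>%
  gather(key, value, -c(1:2)) %>%
  separate_rows(value, sep = "/") %>%
  nrow()
# one row per individual allele, i.e. roughly
# nrow(df) * (number of HLA columns) * (alleles per cell)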
Benchmarks
To get a sense of the performance difference, try the following:
Test functions
myfun <- function(df) {
  cols <- grep("^HLA", names(df), value = TRUE)
  for (i in cols) {
    temp <- tstrsplit(df[[i]], "/")
    set(df, j = sprintf("%s_%d", i, seq_along(temp)), value = temp)
    set(df, j = i, value = NULL)
  }
  df[]
}
# the "tidyverse" approach (needs dplyr and tidyr)
tidyfun <- function(df) {
  df %>%
    gather(key, value, -c(1:2)) %>%
    separate_rows(value, sep = "/") %>%
    group_by(key, id) %>%
    mutate(key2 = paste0(key, "_", seq_along(key))) %>%
    ungroup() %>%
    select(-key) %>%
    spread(key2, value)
}
# the colsplit-based approach (needs reshape2 for colsplit)
getIt <- function(df, col) {
  x <- max(sapply(strsplit(as.character(df[, col]), split = "/"), length))
  q <- colsplit(string = as.character(df[, col]), pattern = "/",
                names = paste0(names(df)[col], "_", LETTERS[1:x]))
  return(q)
}
reshape2fun <- function(dfdf) {
  cbind(dfdf[, 1:2], getIt(dfdf, 3), getIt(dfdf, 4), getIt(dfdf, 5), getIt(dfdf, 6))
}
4 rows....
library(microbenchmark)
dfdf <- as.data.frame(df)
microbenchmark(myfun(copy(df)), reshape2fun(dfdf), tidyfun(df))
# Unit: microseconds
# expr min lq mean median uq max neval
# myfun(copy(df)) 241.55 272.5965 625.7634 359.148 380.0395 28632.94 100
# reshape2fun(dfdf) 5076.24 5368.3835 5841.8784 5539.577 5639.8765 34176.13 100
# tidyfun(df) 37864.68 39435.1915 41152.5916 39801.499 40489.7055 70019.04 100
10,000 rows....
biggerdf <- rbindlist(replicate(2500, df, FALSE)) # nrow = 10,000
dfdf <- as.data.frame(biggerdf)
microbenchmark(myfun(copy(biggerdf)), reshape2fun(dfdf), tidyfun(biggerdf), times = 10)
# Unit: milliseconds
# expr min lq mean median uq max neval
# myfun(copy(biggerdf)) 50.87452 52.0059 54.59288 53.03503 53.79347 68.69892 10
# reshape2fun(dfdf) 120.90291 124.3893 137.54154 126.06213 157.50532 159.15069 10
# tidyfun(biggerdf) 1312.75422 1350.6651 1394.93082 1358.21612 1373.86793 1732.86521 10
1,000,000 rows....
BIGGERdf <- rbindlist(replicate(100, biggerdf, FALSE)) # nrow = 1,000,000
dfdf <- as.data.frame(BIGGERdf)
system.time(tidyfun(BIGGERdf)) # > 2 minutes!
# user system elapsed
# 141.373 1.048 142.403
microbenchmark(myfun(copy(BIGGERdf)), reshape2fun(dfdf), times = 5)
# Unit: seconds
# expr min lq mean median uq max neval
# myfun(copy(BIGGERdf)) 5.180048 5.574677 6.026515 5.764467 6.498967 7.114415 5
# reshape2fun(dfdf) 8.858202 9.095027 9.629969 9.264896 10.192161 10.739560 5