I have a large data.table with the following format:
library(data.table)
library(stringr)
library(plyr)
dt <- data.table(
  from   = rep(LETTERS[1:10], each = 3),
  to     = rep(letters[1:3], 10),
  from_n = rep(sample(50:100, 10), each = 3),
  to_n   = rep(sample(1:10, 3, replace = TRUE), 10)
)
I am trying to split my dataset into randomly sampled train and test datasets containing 80% and 20%, respectively, of the observations ("from_n") for each group ("from").
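(For example, if a given "from" group had from_n = 90, I would want the train set to contain floor(90 * 0.8) = 72 of that group's observations and the test set the remaining 18, with the counts for each "to" split at random by the shuffle.)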
I can currently do this with a for loop that creates a randomly shuffled vector for each group. For example:
test_list <- list()
train_list <- list()
for (i in seq_along(unique(dt$from))) {  # for each unique "from"
  sub  <- dt[from == unique(dt$from)[i]]
  n_1  <- sub$to_n[1]
  n_2  <- sub$to_n[2]
  n_3  <- sub$to_n[3]
  n_NA <- sub$from_n[1] - sum(n_1, n_2, n_3)

  # create a randomised vector with one entry per observation in the group
  sample_vec <- sample(c(rep(NA, n_NA),
                         rep(sub$to[1], n_1),
                         rep(sub$to[2], n_2),
                         rep(sub$to[3], n_3)))

  # first 80% of the shuffled vector -> train, remaining 20% -> test
  train <- sample_vec[1:floor(length(sample_vec) * 0.8)]
  train_dt <- data.table(from   = sub$from,
                         to     = sub$to,
                         from_n = rep(length(train), 3),
                         to_n   = c(length(train[which(train == sub$to[1])]),
                                    length(train[which(train == sub$to[2])]),
                                    length(train[which(train == sub$to[3])])))

  test <- sample_vec[(floor(length(sample_vec) * 0.8) + 1):length(sample_vec)]
  test_dt <- data.table(from   = sub$from,
                        to     = sub$to,
                        from_n = rep(length(test), 3),
                        to_n   = c(length(test[which(test == sub$to[1])]),
                                   length(test[which(test == sub$to[2])]),
                                   length(test[which(test == sub$to[3])])))

  test_list[[i]]  <- test_dt
  train_list[[i]] <- train_dt
}
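(Afterwards I just stack the per-group results, e.g. with rbindlist(train_list) and rbindlist(test_list).)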
However, the dataset I need to apply this to is very large, and this is too slow. Does anyone have any suggestions for how I could improve performance?
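For reference, the direction I have been considering is to do the same shuffle-and-count inside a single grouped data.table call instead of the explicit loop. Below is a rough sketch of that idea; split_counts() is just a helper name I made up here, and I have not benchmarked it or fully verified that it reproduces the loop's output, so treat it as a starting point rather than a working solution:

# helper: rebuild the shuffled per-observation vector for one "from" group,
# split it 80/20, and count how often each "to" lands in each part
split_counts <- function(to, to_n, from_n) {
  pool    <- sample(rep(c(to, NA), times = c(to_n, from_n[1] - sum(to_n))))
  n_train <- floor(length(pool) * 0.8)  # assumes each group is large enough that this is >= 1
  train   <- pool[1:n_train]
  test    <- pool[(n_train + 1):length(pool)]
  list(to        = to,
       train_n   = tabulate(match(train, to), nbins = length(to)),
       test_n    = tabulate(match(test,  to), nbins = length(to)),
       train_tot = n_train,
       test_tot  = length(pool) - n_train)
}

res <- dt[, split_counts(to, to_n, from_n), by = from]

This would return a single table with the train and test counts side by side for each from/to pair, so it would still need to be split into separate train and test tables afterwards, and I am not sure how much faster it would actually be on a large dataset.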
Thanks.