You can consider
vapply
and
- parallization:
parallel::parApply
With your probabilities
matrix:
set.seed(1010) #reproducibility
#create a matrix of probabilities
#three possible outcomes, 10.000 cases
probabilities <- matrix(runif(10000*3), nrow=10000,ncol=3)
probabilities <- probabilities / Matrix::rowSums(probabilities)
classification <- apply(probabilities, 1, function(x) sample(1:3, 1, prob = x))
vapply
By specifying the class for FUN.VALUE
, you might be able to make it fast.
classification2 <- vapply(split(probabilities, 1:nrow(probabilities)),
function(x) sample(1:3, 1, prob = x),
FUN.VALUE = integer(1), USE.NAMES = FALSE)
head(classification2)
#> [1] 1 3 3 1 2 3
parallel package
benchmarkme::get_cpu()
#> $vendor_id
#> [1] "GenuineIntel"
#>
#> $model_name
#> [1] "Intel(R) Core(TM) i5-4288U CPU @ 2.60GHz"
#>
#> $no_of_cores
#> [1] 4
In the above environment,
cl <- parallel::makeCluster(4)
doParallel::registerDoParallel(cl, cores = 4)
parApply()
can do what apply()
do.
classification3 <- parallel::parApply(cl, probabilities, 1, function(x) sample(1:3, 1, prob = x))
head(classification3)
#> [1] 2 2 2 2 3 3
Comparing the three, including apply()
solution,
microbenchmark::microbenchmark(
question = { # yours
apply(probabilities, 1, function(x) sample(1:3, 1, prob = x))
},
vapp = {
vapply(split(probabilities, 1:nrow(probabilities)), function(x) sample(1:3, 1, prob = x), FUN.VALUE = integer(1), USE.NAMES = FALSE)
},
parr = {
parallel::parApply(cl, probabilities, 1, function(x) sample(1:3, 1, prob = x))
}
)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> question 49.93853 58.39965 65.05360 62.98119 68.28044 182.03267 100
#> vapp 44.19828 54.84294 59.47109 58.56739 62.05269 146.14792 100
#> parr 43.33227 48.16840 53.26599 50.87995 54.17286 98.67692 100
parallel::stopCluster(cl)