Use a combination of dplyr's slice_sample() (to sample within each group)
and purrr (to repeat the sampling).
I'm assuming you want to do the sensible thing and sample a constant proportion of each group (here, 80% of the rows of every species).
library(tidyverse)

# Stratified sample: draw 80% of the rows within each Species group.
# The result is still grouped by Species.
slice_sample(
  group_by(iris, Species),
  prop = 0.8
)
#> # A tibble: 120 × 5
#> # Groups: Species [3]
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> <dbl> <dbl> <dbl> <dbl> <fct>
#> 1 4.5 2.3 1.3 0.3 setosa
#> 2 4.8 3 1.4 0.1 setosa
#> 3 5 3.4 1.5 0.2 setosa
#> 4 5.1 3.4 1.5 0.2 setosa
#> 5 5.2 4.1 1.5 0.1 setosa
#> 6 5.2 3.4 1.4 0.2 setosa
#> 7 5.1 3.8 1.5 0.3 setosa
#> 8 4.8 3.4 1.6 0.2 setosa
#> 9 4.3 3 1.1 0.1 setosa
#> 10 5.5 3.5 1.3 0.2 setosa
#> # … with 110 more rows
Then use purrr's map_dfr() to repeat the sampling (here three times) and row-bind the results into a single data frame:
library(tidyverse)

# Repeat the stratified 80% draw three times; map_dfr() row-binds the
# three samples into one data frame.
map_dfr(
  seq_len(3),
  function(replicate) {
    # `replicate` is only a counter; each call draws a fresh sample.
    slice_sample(group_by(iris, Species), prop = 0.8)
  }
)
#> # A tibble: 360 × 5
#> # Groups: Species [3]
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> <dbl> <dbl> <dbl> <dbl> <fct>
#> 1 5 3.5 1.3 0.3 setosa
#> 2 5.8 4 1.2 0.2 setosa
#> 3 4.9 3.1 1.5 0.1 setosa
#> 4 5.2 3.5 1.5 0.2 setosa
#> 5 5.4 3.4 1.5 0.4 setosa
#> 6 4.6 3.1 1.5 0.2 setosa
#> 7 5.3 3.7 1.5 0.2 setosa
#> 8 4.4 3 1.3 0.2 setosa
#> 9 4.3 3 1.1 0.1 setosa
#> 10 5.1 3.7 1.5 0.4 setosa
#> # … with 350 more rows
Edit 1
If you must specify the number of rows to sample from each group, @r2evans's answer to this post yields the following possible solution:
library(tidyverse)

# Number of rows to draw from each species. Species is built as a factor
# with the same levels as iris$Species so that the join below keeps the
# column a factor; joining against a character key would silently turn
# Species into a character column.
group_sizes <- tibble(
  Species = factor(
    c("setosa", "versicolor", "virginica"),
    levels = levels(iris$Species)
  ),
  size = c(12, 30, 33)
)

# Repeat the draw three times and row-bind the results
# (3 * (12 + 30 + 33) = 225 rows).
map_dfr(
  seq_len(3),
  ~ iris %>%
    left_join(group_sizes, by = "Species") %>%
    group_by(Species) %>%
    # Give every row a random rank within its species and keep the
    # first `size` ranks.
    mutate(samp = sample(n())) %>%
    filter(samp <= size) %>%
    select(-c(size, samp)) %>%
    ungroup()
)
#> # A tibble: 225 × 5
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> <dbl> <dbl> <dbl> <dbl> <fct>
#> 1 4.9 3 1.4 0.2 setosa
#> 2 5.4 3.9 1.7 0.4 setosa
#> 3 4.9 3.1 1.5 0.1 setosa
#> 4 4.3 3 1.1 0.1 setosa
#> 5 5.4 3.9 1.3 0.4 setosa
#> 6 5.1 3.5 1.4 0.3 setosa
#> 7 5 3 1.6 0.2 setosa
#> 8 5.5 4.2 1.4 0.2 setosa
#> 9 4.4 3 1.3 0.2 setosa
#> 10 5 3.5 1.3 0.3 setosa
#> # … with 215 more rows
If you want a new random sample size for each group on every repetition, you can do the following:
library(tidyverse)

# Repeat three times. Each repetition first draws a random sample size per
# species (between 1 and that species' row count), then keeps that many
# randomly chosen rows of each species.
map_dfr(
  seq_len(3),
  function(replicate) {
    # One row per species: n = rows in the group, size = random target in 1..n.
    random_sizes <- iris %>%
      count(Species) %>%
      rowwise() %>%
      mutate(size = sample.int(n, 1))

    iris %>%
      left_join(random_sizes, by = "Species") %>%
      group_by(Species) %>%
      # Give every row a random rank within its species and keep the
      # first `size` ranks.
      mutate(samp = sample(n())) %>%
      filter(samp <= size) %>%
      select(-c(size, samp, n)) %>%
      ungroup()
  }
)
#> # A tibble: 246 × 5
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> <dbl> <dbl> <dbl> <dbl> <fct>
#> 1 5.1 3.5 1.4 0.2 setosa
#> 2 4.9 3 1.4 0.2 setosa
#> 3 4.7 3.2 1.3 0.2 setosa
#> 4 4.6 3.1 1.5 0.2 setosa
#> 5 5 3.6 1.4 0.2 setosa
#> 6 5.4 3.9 1.7 0.4 setosa
#> 7 4.6 3.4 1.4 0.3 setosa
#> 8 4.4 2.9 1.4 0.2 setosa
#> 9 4.9 3.1 1.5 0.1 setosa
#> 10 4.8 3.4 1.6 0.2 setosa
#> # … with 236 more rows