Unfortunately, dplyr does not allow you to "Sample down" large groups to a given size or just use all of the group's data if it's a small group - either you must sample everything to the smallest group size, or sample the smallest group with replacement to "inflate" it to a larger size. You can work around this by defining a custom sample_n function as follows though:
### Custom sampler function to sample min(data, sample) which can't be done with dplyr
### it's a modified copy of sample_n.grouped_df
sample_vals <- function (tbl, size, replace = FALSE, weight = NULL, .env = parent.frame())
{
#assert_that(is.numeric(size), length(size) == 1, size >= 0)
weight <- substitute(weight)
index <- attr(tbl, "indices")
sizes = sapply(index, function(z) min(length(z), size)) # here's my contribution
sampled <- lapply(1:length(index), function(i) dplyr:::sample_group(index[[i]], frac = FALSE, tbl = tbl,
size = sizes[i], replace = replace, weight = weight, .env = .env))
idx <- unlist(sampled) + 1
grouped_df(tbl[idx, , drop = FALSE], vars = groups(tbl))
}
samped_data = dataset %>% group_by(something) %>% sample_vals(size = 50000) %>% ungroup()