The following function will be used to pass all row_numbers
for each group in the data set and then draw a sample
without replacement and then drop all values that fall within the step size by using a combination of split
and findInterval
. The returned array will be used to slice
out the desired sample size with the desired sample step.
Modify sample_size
and sample_step
as needed to adjust the intial sample size and number of rows between retained samples
library(plyr)
sample_drop <- function(x, sample_size, sample_step=1) {
# draw sample and convert to list
lst_samp <- list(sort(sample(x, size=sample_size, replace=FALSE)))
# function to split last element of list by step size
split_last <- function(lst, step) {
lst_tail <- unlist(tail(lst, n=1L))
split(lst_tail, findInterval(lst_tail, c(0, step) + min(lst_tail)))
}
# split list until all values of last element fall within step size
while(do.call(function(x) max(x) - min(x), list(unlist(tail(lst_samp, n=1L)))) >= sample_step) {
lst_samp <- c(head(lst_samp, n=-1L), split_last(lst_samp, sample_step))
}
#lst_samp <- llply(lst_samp, unname) # for debug only to remove attr names
laply(lst_samp, min) # return minimum value from each element
}
Here is the function applied to the iris
dataset.
library(dplyr)
data("iris")
sample <- list()
sample$seed <- 1
sample$size <- 15L
sample$step <- 20L
# simulate sample draws with dropping and compare to iris results
set.seed(sample$seed)
sample_drop(50, sample$size, sample$step)
sample_drop(50, sample$size, sample$step)
sample_drop(50, sample$size, sample$step)
set.seed(sample$seed)
iris %>%
group_by(Species) %>%
mutate(gid=row_number()) %>%
slice(sample_drop(n(), sample$size, sample$step))
Here is the function applied to the larger diamonds
dataset
library(dplyr)
library(ggplot2)
data("diamonds")
sample <- list()
sample$seed <- 1
sample$size <- 1000L
sample$step <- 20L
set.seed(sample$seed)
diamonds %>%
group_by(cut) %>%
mutate(gid=row_number()) %>%
slice(sample_drop(n(), sample$size, sample$step))
set.seed(sample$seed)
diamonds %>%
group_by(cut) %>%
mutate(gid=row_number()) %>%
slice(sample_drop(n(), sample$size, sample$step)) %>%
summarise(samples=n())
There is likely room for improvement, but this is a lot easier for me to follow