Here's a solution involving newest addition to tidyverse
family rsample
by @topepo:
library(dplyr)
library(tidyr)
library(purrr)
library(rsample)
test_window <- 2
ordered_mtcars <- mtcars %>% tibble::rownames_to_column() %>%
# lets assume cyl represents group id, as in your example
rename(id=cyl) %>%
arrange(id) %>% group_by(id) %>%
# we will generate sequence id to mimic the "time" aspect of data
mutate(ordr=seq(n())) %>% ungroup()
We will use mtcars
dataset with small modifications to illustrate the sampling method
samples_df <- ordered_mtcars %>%
group_by(id) %>% nest() %>%
# we will generate a bunch of samples the size of target window.
# initial = 1 ensures that we have samples from every "time segment" of the data
# skip ensures samples are non-overlapping
mutate(idx=map(data, rolling_origin, initial=1, assess=test_window, skip=test_window-1)) %>%
# we are only interested in "testing" samples
unnest(idx) %>% mutate(r_test=map(splits, testing))
head(samples_df)
#> # A tibble: 6 x 4
#> id splits id1 r_test
#> <dbl> <list> <chr> <list>
#> 1 4 <S3: rsplit> Slice1 <tibble [2 x 12]>
#> 2 4 <S3: rsplit> Slice2 <tibble [2 x 12]>
#> 3 4 <S3: rsplit> Slice3 <tibble [2 x 12]>
#> 4 4 <S3: rsplit> Slice4 <tibble [2 x 12]>
#> 5 4 <S3: rsplit> Slice5 <tibble [2 x 12]>
#> 6 6 <S3: rsplit> Slice1 <tibble [2 x 12]>
Lets calculate which fraction of the samples_df is represented by 1 sampling window (by group). We will sample this many groups to make sure smalles of our groups has at least one sampling bunch.
frac <- samples_df %>% group_by(id) %>%
summarise(frac=1/n()) %>% pull(frac) %>% max
# here we are sampling bunches per group, so that we draw exactly 1 bunch from the smallest group
ordered_mtcars_test <- samples_df %>%
group_by(id) %>%
sample_frac(size = frac) %>%
unnest(r_test) %>%
arrange(id, ordr) %>% ungroup() %>%
select(names(ordered_mtcars))
ordered_mtcars_test
#> # A tibble: 10 x 13
#> rowname mpg id disp hp drat wt qsec vs am gear carb ordr
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
#> 1 Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1 6
#> 2 Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1 7
#> 3 Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2 10
#> 4 Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2 11
#> 5 Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4 6
#> 6 Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6 7
#> 7 Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4 8
#> 8 Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2 9
#> 9 Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2 12
#> 10 Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4 13
Training portion of the data is just whatever did not end up in the test set
ordered_mtcars_train <- ordered_mtcars %>%
anti_join(ordered_mtcars_test, by=c("id", "ordr"))
ordered_mtcars_train
#> # A tibble: 22 x 13
#> rowname mpg id disp hp drat wt qsec vs am gear carb ordr
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
#> 1 Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1 1
#> 2 Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2 2
#> 3 Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2 3
#> 4 Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1 4
#> 5 Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2 5
#> 6 Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1 8
#> 7 Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2 9
#> 8 Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4 1
#> 9 Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4 2
#> 10 Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1 3