37

I have a data frame in the format:

head(subset)
# ants  0 1 1 0 1 
# age   1 2 2 1 3
# lc    1 1 0 1 0

I need to create new data frame with random samples according to age and lc. For example I want 30 samples from age:1 and lc:1, 30 samples from age:1 and lc:0 etc.

I did look at random sampling method like;

newdata <- function(subset, age, 30)

But it is not the code that I want.

zx8754
  • 52,746
  • 12
  • 114
  • 209
user3525533
  • 405
  • 1
  • 5
  • 7

8 Answers8

53

I would suggest using either stratified from my "splitstackshape" package, or sample_n from the "dplyr" package:

## Sample data
set.seed(1)
n <- 1e4
d <- data.table(age = sample(1:5, n, T), 
                lc = rbinom(n, 1 , .5),
                ants = rbinom(n, 1, .7))
# table(d$age, d$lc)

For stratified, you basically specify the dataset, the stratifying columns, and an integer representing the size you want from each group OR a decimal representing the fraction you want returned (for example, .1 represents 10% from each group).

library(splitstackshape)
set.seed(1)
out <- stratified(d, c("age", "lc"), 30)
head(out)
#    age lc ants
# 1:   1  0    1
# 2:   1  0    0
# 3:   1  0    1
# 4:   1  0    1
# 5:   1  0    0
# 6:   1  0    1

table(out$age, out$lc)
#    
#      0  1
#   1 30 30
#   2 30 30
#   3 30 30
#   4 30 30
#   5 30 30

For sample_n you first create a grouped table (using group_by) and then specify the number of observations you want. If you wanted proportional sampling instead, you should use sample_frac.

library(dplyr)
set.seed(1)
out2 <- d %>%
  group_by(age, lc) %>%
  sample_n(30)

# table(out2$age, out2$lc)
A5C1D2H2I1M1N2O1R2T1
  • 190,393
  • 28
  • 405
  • 485
  • 2
    And if you need both the sampled and non-sampled sets be returned (as was my case when creating a treatment/control split in an A/B test) the `bothSets = T` argument of the `stratified` method is your friend. – Bill DeRose Jun 28 '19 at 20:32
  • Is it possible to get 100% for age=1, 30% for age=2, etc ? (vary the final sample per group) – Pablo Casas Apr 19 '20 at 23:40
  • 3
    As of 2020, the `sample_n` and `sample_frac` functions have been deprecated and replaced with `slice_sample`, which can be used after the `group_by` call like so: `slice_sample(n = 30` or `slice_sample(prop = .5)` – jjack Jul 07 '20 at 19:48
17

See the function strata from the package sampling. The function selects stratified simple random sampling and gives a sample as a result. Extra two columns are added - inclusion probabilities (Prob) and strata indicator (Stratum). See the example.

require(data.table)
require(sampling)

set.seed(1)
n <- 1e4
d <- data.table(age = sample(1:5, n, T), 
                lc = rbinom(n, 1 , .5),
                ants = rbinom(n, 1, .7))

# Sort
setkey(d, age, lc)

# Population size by strata
d[, .N, keyby = list(age, lc)]
#     age lc    N
#  1:   1  0 1010
#  2:   1  1 1002
#  3:   2  0  993
#  4:   2  1 1026
#  5:   3  0 1021
#  6:   3  1  982
#  7:   4  0  958
#  8:   4  1  940
#  9:   5  0 1012
# 10:   5  1 1056

# Select sample
set.seed(2)
s <- data.table(strata(d, c("age", "lc"), rep(30, 10), "srswor"))

# Sample size by strata
s[, .N, keyby = list(age, lc)]
#     age lc  N
#  1:   1  0 30
#  2:   1  1 30
#  3:   2  0 30
#  4:   2  1 30
#  5:   3  0 30
#  6:   3  1 30
#  7:   4  0 30
#  8:   4  1 30
#  9:   5  0 30
# 10:   5  1 30
Max Ghenis
  • 14,783
  • 16
  • 84
  • 132
djhurio
  • 5,437
  • 4
  • 27
  • 48
17

Here's some data:

set.seed(1)
n <- 1e4
d <- data.frame(age = sample(1:5,n,TRUE), 
                lc = rbinom(n,1,.5),
                ants = rbinom(n,1,.7))

You want a split-apply-combine strategy, where you split your data.frame (d in this example), sample rows/observations from each subsample, and then combine then back together with rbind. Here's how it works:

sp <- split(d, list(d$age, d$lc))
samples <- lapply(sp, function(x) x[sample(1:nrow(x), 30, FALSE),])
out <- do.call(rbind, samples)

The result:

> str(out)
'data.frame':   300 obs. of  3 variables:
 $ age : int  1 1 1 1 1 1 1 1 1 1 ...
 $ lc  : int  0 0 0 0 0 0 0 0 0 0 ...
 $ ants: int  1 1 0 1 1 1 1 1 1 1 ...
> head(out)
         age lc ants
1.0.2242   1  0    1
1.0.4417   1  0    1
1.0.389    1  0    0
1.0.4578   1  0    1
1.0.8170   1  0    1
1.0.5606   1  0    1
Thomas
  • 43,637
  • 12
  • 109
  • 140
  • that's what I want to do. I did try split and lapply functions, but r gives an exceed memory error. Is there other way to do it for huge dataset? thanks – user3525533 May 07 '14 at 19:54
  • @user3525533 How big is your dataset? – Thomas May 07 '14 at 19:58
  • its around 2gb. I have 30 variables and 16826950 obs. in my data frame though. When I use split function, it gives a memory error. – user3525533 May 07 '14 at 21:17
  • 1
    Sorry, it's `setDT(d)[d[, sample(.I, 30L, FALSE), by="age,lc"]$V1]` – Arun May 08 '14 at 02:48
  • 1
    @Thomas;it will be faster to sample the row indices rather than rows - a lot faster if there are many columns / rows. `sp <- split(seq_len(nrow(d)), list(d$age, d$lc)) ; samples <- lapply(sp, sample, 30) ; d[unlist(samples), ]` – user20650 Aug 26 '15 at 11:38
7

Here's a one-liner using data.table:

set.seed(1)
n <- 1e4
d <- data.table(age  = sample(1:5, n, T),
                lc   = rbinom(n,   1, .5),
                ants = rbinom(n,   1, .7))

out <- d[, .SD[sample(1:.N, 30)], by=.(age, lc)]

# Check
out[, table(age, lc)]
##    lc
## age  0  1
##   1 30 30
##   2 30 30
##   3 30 30
##   4 30 30
##   5 30 30
mrbrich
  • 853
  • 1
  • 8
  • 9
3

This is ridiculously easy to do with base R.

Step 1: Create a stratum indicator using the interaction function.

Step 2: Use tapply on a sequence of row indicators to identify the indices of the random sample.

Step 3: Subset the data with those indices

Using the data example from @Thomas:

set.seed(1)
n <- 1e4
d <- data.frame(age = sample(1:5,n,TRUE), 
                lc = rbinom(n,1,.5),
                ants = rbinom(n,1,.7))

## stratum indicator
d$group <- interaction(d[, c('age', 'lc')])

## sample selection
indices <- tapply(1:nrow(d), d$group, sample, 30)

## obtain subsample
subsampd <- d[unlist(indices, use.names = FALSE), ]

Verify appropriate stratification

> table(subsampd$group)

1.0 2.0 3.0 4.0 5.0 1.1 2.1 3.1 4.1 5.1 
 30  30  30  30  30  30  30  30  30  30 
AdamO
  • 4,283
  • 1
  • 27
  • 39
1

Here is an updated dplyr version for stratified sampling when you need different numbers of samples from each group (i.e. 1:5 ratio or something in my case, but you can specify the n for each group combination).

set.seed(1)
n <- 1e4
d <- tibble::tibble(age = sample(1:5, n, T), 
                    lc = rbinom(n, 1 , .5),
                    ants = rbinom(n, 1, .7))
> d
# A tibble: 10,000 x 3
     age    lc  ants
   <int> <int> <int>
 1     2     0     1
 2     2     1     1
 3     3     1     1
 4     5     0     1
 5     2     0     1
 6     5     0     1
 7     5     1     1
 8     4     1     1
 9     4     1     1
10     1     0     1
# … with 9,990 more rows

there are 10 unique combos of age/lc:

> d %>% group_by(age, lc) %>% nest()
# A tibble: 10 x 3
# Groups:   age, lc [10]
     age    lc data                
   <int> <int> <list>              
 1     2     0 <tibble [993 × 1]>  
 2     2     1 <tibble [1,026 × 1]>
 3     3     1 <tibble [982 × 1]>  
 4     5     0 <tibble [1,012 × 1]>
 5     5     1 <tibble [1,056 × 1]>
 6     4     1 <tibble [940 × 1]>  
 7     1     0 <tibble [1,010 × 1]>
 8     1     1 <tibble [1,002 × 1]>
 9     4     0 <tibble [958 × 1]>  
10     3     0 <tibble [1,021 × 1]>

We can sample a prespecified number of rows from each group of age/lc combinations:

> d %>% 
  group_by(age, lc) %>% 
  nest() %>% 
  ungroup() %>% 
  # you must supply `n` for each combination of groups in `group_by(age, lc)`
  mutate(n = c(1, 1, 1, 2, 3, 1, 2, 3, 1, 1)) %>%  
  mutate(samp = purrr::map2(.x = data, .y= n, 
                            .f = function(.x, .y) slice_sample(.data = .x, n = .y))) %>% 
  select(-data, -n) %>% 
  unnest(samp)
# A tibble: 16 x 3
     age    lc  ants
   <int> <int> <int>
 1     2     0     0
 2     2     1     1
 3     3     1     1
 4     5     0     0
 5     5     0     1
 6     5     1     1
 7     5     1     1
 8     5     1     1
 9     4     1     1
10     1     0     1
11     1     0     1
12     1     1     1
13     1     1     1
14     1     1     0
15     4     0     1
16     3     0     1
Reilstein
  • 1,193
  • 2
  • 11
  • 25
0

A one-liner using my function fslice_sample(). .

The syntax is tidy-based but the grouping is done using a mix of collapse and data.table.

It's also a bit more flexible than dplyr::slice_sample().

# remotes::install_github("NicChr/timeplyr")
library(timeplyr)

fslice_sample(d, n = 30, .by = c(age, lc), seed = 1)
#> # A tibble: 300 x 3
#>      age    lc  ants
#>  * <int> <int> <int>
#>  1     1     0     1
#>  2     1     0     1
#>  3     1     0     0
#>  4     1     0     0
#>  5     1     0     0
#>  6     1     0     1
#>  7     1     0     1
#>  8     1     0     0
#>  9     1     0     0
#> 10     1     0     0
#> # ... with 290 more rows

Created on 2023-04-18 with reprex v2.0.2

NicChr
  • 858
  • 1
  • 9
0

I have used the following approach. Using sampling you can get the index for which set the sample is in. If you want to stratify this sampling, it is not possible using sampling() in R. Therefore I created a function, using the splitTools package :

library("splitTools")
stratified_sampling <- function(y_values, k_folds, set_seed = 54321) {
  index_array <- rep(0,length(y_values))
  CVfolds <- create_folds(y_values, k = k_folds, type = "stratified", seed=set_seed)
  for(i in 1:k_fold) {
    index_array[!c(1:length(y_values)) %in% CVfolds[[i]]] <- i
  }
  return(index_array)
}