1

I would like to subset a data frame (dat) in two steps. First constraint: keep exactly one row per unique "id". Second constraint: within each id, keep the row with the smallest "visit". For example, for id "s2" I want to keep row 3, which has visit 1, rather than row 2, which has visit 2.

# Build the six-row example data frame used throughout.
set.seed(42)  # fixed seed so the sampled ages are the same on every run
n <- 6
dat <- data.frame(
  id = c("s1", "s2", "s2", "s3", "s4", "s4"),
  # one consecutive calendar day per row
  date = seq(from = as.Date("2020-12-26"), to = as.Date("2020-12-31"), by = "day"),
  # alternating 1, 2, 1, 2, ... spelled out instead of relying on recycling
  visit = rep(1:2, length.out = n),
  age = sample(18:30, n, replace = TRUE)
)

#dat
# id       date visit age
# 1 s1 2020-12-26     1  18
# 2 s2 2020-12-27     2  22
# 3 s2 2020-12-28     1  18
# 4 s3 2020-12-29     2  26
# 5 s4 2020-12-30     1  27
# 6 s4 2020-12-31     2  21

#desired output:
# id       date visit age 
# 1 s1 2020-12-26     1  18    
# 3 s2 2020-12-28     1  18    
# 4 s3 2020-12-29     2  26    
# 5 s4 2020-12-30     1  27    
frankieb
  • 41
  • 2

2 Answers2

3

base R

# Within each id, flag the position of the smallest visit. which.min() returns
# only the first position on ties. ave() hands the flags back in the original
# row order (coerced to 0/1), so they can be used directly as a row mask.
dat[
  as.logical(
    with(dat, ave(visit, id, FUN = function(v) seq_along(v) == which.min(v)))
  ),
]
#   id       date visit age
# 1 s1 2020-12-26     1  18
# 3 s2 2020-12-28     1  18
# 4 s3 2020-12-29     2  26
# 5 s4 2020-12-30     1  27

dplyr

# Keep, for every id, the single row with the smallest visit.
library(dplyr)
dat %>%
  # one group per distinct id
  group_by(id) %>%
  # which.min() gives the position of the first minimum within the group,
  # so slice() keeps at most one row per id even when visits are tied
  slice(which.min(visit)) %>%
  # drop the grouping so later verbs work on the whole table again
  ungroup()
# # A tibble: 4 x 4
#   id    date       visit   age
#   <chr> <date>     <int> <int>
# 1 s1    2020-12-26     1    18
# 2 s2    2020-12-28     1    18
# 3 s3    2020-12-29     2    26
# 4 s4    2020-12-30     1    27

data.table

# Keep, for every id, the single row with the smallest visit.
library(data.table)
# as.data.table() converts a copy, so `dat` itself is left unchanged.
# Within each `by = id` group, .SD is that group's sub-table; indexing it with
# which.min(visit) keeps the row at the first minimum of visit.
as.data.table(dat)[, .SD[which.min(visit),], by = id]
#        id       date visit   age
#    <char>     <Date> <int> <int>
# 1:     s1 2020-12-26     1    18
# 2:     s2 2020-12-28     1    18
# 3:     s3 2020-12-29     2    26
# 4:     s4 2020-12-30     1    27
r2evans
  • 141,215
  • 6
  • 77
  • 149
0

This seems to be the solution with the simplest syntax:

library(dplyr)

# Keep exactly one row per id: the one with the smallest visit.
# The selection has to be driven by `visit`, not `age`. Filtering on
# age == min(age) returns the visit-2 row for s4 in the example data
# (ages 27 vs 21), which is not the desired output.
dat %>%
  group_by(id) %>%
  # with_ties = FALSE guarantees a single row per id even if the smallest
  # visit value occurs more than once within a group
  slice_min(visit, n = 1, with_ties = FALSE) %>%
  # return an ungrouped result so later verbs are not applied per id
  ungroup()
denisafonin
  • 1,116
  • 1
  • 7
  • 16