R data imputation from group_by table based on count

Question

group = c(1,1,4,4,4,5,5,6,1,4,6,1,1,1,1,6,4,4,4,4,1,4,5,6)
animal = c('a','b','c','c','d','a','b','c','b','d','c','a','a','a','a','c','c','c','c','c','a','c','a','c')
sleep = c('y','n','y','y','y','n','n','y','n','y','n','y','y','n','m','y','n','n','n','n',NA, NA, NA, NA)

test = data.frame(group, animal, sleep)
print(test)

group_animal = test %>% group_by(`group`, `animal`) %>% count(sleep)
print(group_animal)

I would like to replace the NA values in the test df's sleep column by the highest count of sleep answer based on group and animal.

Such that Group 1, Animal a with NAs in the sleep column should have a sleep value of 'y' because that is the value with the highest count among Group 1 Animal a.

Group 4 animal c with NAs for sleep should have 'n' as the sleep value as well.

TarJae · Answer 1 · 2022-07-26T18:34:46.430

Update now with group_by(group, animal) thnx @Quinten, removed prior answer:

group by animal
use replace_na with the replace argument as sleep[n==max(n)]
new: in case of ties like in group 5 add !is.na(sleep) to avoid conflicts:

library(dplyr)
library(tidyr)

group_animal %>%
    group_by(group, animal) %>% 
    arrange(desc(sleep), .by_group = TRUE) %>% 
    mutate(sleep = replace_na(sleep, sleep[n==max(n) & !is.na(sleep)]))

  group animal sleep     n
   <dbl> <chr>  <chr> <int>
 1     1 a      y         3
 2     1 a      n         1
 3     1 a      m         1
 4     1 a      y         1
 5     1 b      n         2
 6     4 c      y         2
 7     4 c      n         4
 8     4 c      n         1
 9     4 d      y         2
10     5 a      n         1
11     5 a      n         1
12     5 b      n         1
13     6 c      y         2
14     6 c      n         1
15     6 c      y         1

I think the OP wants to do `group_by(group, animal)` on test df right? — Quinten, Jul 26 '22 at 18:25

score 3 · Accepted Answer · answered Jul 26 '22 at 18:13

Another option is replacing the NAs with the Mode. You can use the Mode function from this post in the na.aggregate function from zoo to replace these NAs like this:

Mode <- function(x) {
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}

group = c(1,1,4,4,4,5,5,6,1,4,6,1,1,1,1,6,4,4,4,4,1,4,5,6)
animal = c('a','b','c','c','d','a','b','c','b','d','c','a','a','a','a','c','c','c','c','c','a','c','a','c')
sleep = c('y','n','y','y','y','n','n','y','n','y','n','y','y','n','m','y','n','n','n','n',NA, NA, NA, NA)
test = data.frame(group, animal, sleep)

library(dplyr)
library(zoo)
test %>%
  group_by(group, animal) %>%
  mutate(sleep = na.aggregate(sleep , FUN=Mode)) %>%
  ungroup()
#> # A tibble: 24 × 3
#>    group animal sleep
#>    <dbl> <chr>  <chr>
#>  1     1 a      y    
#>  2     1 b      n    
#>  3     4 c      y    
#>  4     4 c      y    
#>  5     4 d      y    
#>  6     5 a      n    
#>  7     5 b      n    
#>  8     6 c      y    
#>  9     1 b      n    
#> 10     4 d      y    
#> # … with 14 more rows
#> # ℹ Use `print(n = ...)` to see more rows

^{Created on 2022-07-26 by the reprex package (v2.0.1)}

Here is tail of output:

> tail(test)
# A tibble: 6 × 3
  group animal sleep
  <dbl> <chr>  <chr>
1     4 c      n    
2     4 c      n    
3     1 a      y    
4     4 c      n    
5     5 a      n    
6     6 c      y

score 1 · Answer 3 · 2022-07-26T18:21:26.030

Try this.

This method essential creates a custom column to coalesce with sleep, it subsets sleep based on the max count values obtained from str_count

library(dplyr)

test |> 
  group_by(group, animal) |> 
  mutate(sleep = coalesce(sleep, sleep[max(stringr::str_count(paste(sleep, collapse = ""), pattern = sleep), na.rm = TRUE)])) |>
  ungroup()

   group animal sleep
1      1      a     y
2      1      b     n
3      4      c     y
4      4      c     y
5      4      d     y
6      5      a     n
7      5      b     n
8      6      c     y
9      1      b     n
10     4      d     y
11     6      c     n
12     1      a     y
13     1      a     y
14     1      a     n
15     1      a     m
16     6      c     y
17     4      c     n
18     4      c     n
19     4      c     n
20     4      c     n
21     1      a     y
22     4      c     n
23     5      a     n
24     6      c     n

R data imputation from group_by table based on count

3 Answers3