I would like to gapfill NA values for each group by sampling non NA values from the same group.
This is the closest to what I'd like to achieve using !is.na()
Ignoring values or NAs in the sample function.
> dput(data)
structure(list(len = c(NA, 45447.4157838775, 161037.71538108,
78147.8550470324, 7193.48815617057, 1571.95459212405, 18191.381972185,
20366.2132412031, 10014.987524596, 1403.72511829297, 5651.17842991513,
6848.03271105711, 8043.32937011393, 8926.65133418451, 5808.44456603825,
2208.14264175252, 1797.4936747033, 5325.76651327694, 2660.66730207955,
5844.07912541444, 3956.40473896271, 959.873314407621, 3294.01472360025,
5221.94864001864, 3781.51913857335, 7811.83819953768, 3387.20323328623,
5514.92099458441, 5792.54371531706, 5643.98385143961, 15478.916809379,
8401.66533205217, 7046.25074819247, 2734.73639821402, NA, 62332.3343404513,
NA, 46563.1214718113, 25590.4020105238, 13015.3682275862, 4984.80432801441,
NA), point = c(NA, 0, 8, 5, 2, 0, 9, 0, 0, 0, 3, 1, 0, 6, 1,
1, 0, 0, 1, 0, 0, 0, 1, 2, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, NA,
10, NA, 19, 6, 5, 0, NA), country = structure(c(1L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 2L,
3L, 2L, 2L, 2L, 2L, 1L), .Label = c("WCY_____ES", "WCY_____FR",
"WCY_____IT"), class = "factor"), group = c(1L, 2L, 2L, 2L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
4L, 4L, 4L, 4L, 4L)), row.names = c(NA, -42L), class = "data.frame")
library(dplyr)
data1 <- data %>%
group_by(group) %>%
mutate(nulen = if_else(country == 'WCY_____FR', len, sample(len[!is.na(len)], 1, TRUE)),
nupoint = if_else(country == 'WCY_____FR', point, sample(point[!is.na(point)], 1, TRUE)))
But instead I get Error in sample.int(length(x), size, replace, prob) :
invalid first argument
There should be no significant difference between the known and gapfilled distributions. If there are no values to sample from the same group (either other values are NA
or there is only one row in the ```group``) then the sample should be taken from the entire dataset. Any package is fine.