0

I have example data as follows:

dat <- structure(list(
zipcode = c(1001, 1002, 1003, 1004, 1101, 1102, 1103, 1104, 1201, 1202, 1203, 1302), 
areacode = c(4, 4, NA, 4, 4, 4, NA, 1, 4, 4, NA, 4), 
type = structure(c(1L, 1L, NA, 1L, 2L, 2L, NA, 1L, 1L, 1L, NA, 1L), 
.Label = c("clay", "sand"), class = "factor"), 
region = c(3, 3, NA, 3, 3, 3, NA, 3, 3, 3, NA, 3), 
do_not_fill = c(1, NA, NA, 1, 1, NA, NA, 1, NA, NA, NA, 1)), 
class = c("data.table", "data.frame"), row.names = c(NA, -4L))

    zipcode areacode type region do_not_fill
 1:    1001        4 clay      3           1
 2:    1002        4 clay      3          NA
 3:    1003       NA <NA>     NA          NA
 4:    1004        4 clay      3           1
 5:    1101        4 sand      3           1
 6:    1102        4 sand      3          NA
 7:    1103       NA <NA>     NA          NA
 8:    1104        1 clay      3           1
 9:    1201        4 clay      3          NA
10:    1202        4 clay      3          NA
11:    1203       NA <NA>     NA          NA
12:    1302        4 clay      3           1

I want to fill ONLY the columns areacode, type and region based on two conditions.

  1. The areacode has to be the same before and after the NA.
  2. The first two digits of the zipcode have to be the same before and after the NA.

Based on this solution, and this solution, I attempted following (however data.table solutions are welcomed and even preferred):

library(dplyr)
dat |> 
  mutate(type = as.character(type)) |> 
  mutate(across(1:4,
                ~ ifelse(is.na(.) & lag(areacode) == lead(areacode) & 
                         lag(as.numeric(substr(zipcode, 1, 2))) == lead(as.numeric(substr(zipcode, 1, 2))),
                         lag(.),
                         .)))

But somewhere I am doing something wrong, because I get:

Error:
! `n` and `row.names` must be consistent.
Run `rlang::last_error()` to see where the error occurred.

Desired output:

    zipcode areacode type region do_not_fill
 1:    1001        4 clay      3           1
 2:    1002        4 clay      3          NA
 3:    1003        4 clay      3          NA
 4:    1004        4 clay      3           1
 5:    1101        4 sand      3           1
 6:    1102        4 sand      3          NA
 7:    1103       NA <NA>     NA          NA
 8:    1104        1 clay      3           1
 9:    1201        4 clay      3          NA
10:    1202        4 clay      3          NA
11:    1203       NA <NA>     NA          NA
12:    1302        4 clay      3           1

EDIT

as_tibble(dat) |>
  mutate(type = as.character(areacode)) |> 
  mutate(across(1:4,
                ~ ifelse(is.na(.) & lag(areacode) == lead(areacode) & 
                           lag(as.numeric(substr(zipcode, 1, 2))) == lead(as.numeric(substr(zipcode, 1, 2))),
                         lag(.),
                         .)))


# A tibble: 12 x 5
   zipcode areacode type  region do_not_fill
     <dbl>    <dbl> <chr>  <dbl>       <dbl>
 1    1001        4 4          3           1
 2    1002        4 4          3          NA
 3    1003        4 4          3          NA
 4    1004        4 4          3           1
 5    1101        4 4          3           1
 6    1102        4 4          3          NA
 7    1103       NA NA        NA          NA
 8    1104        1 1          3           1
 9    1201        4 4          3          NA
10    1202        4 4          3          NA
11    1203       NA NA        NA          NA
12    1302        4 4          3           1
Tom
  • 2,173
  • 1
  • 17
  • 44

2 Answers2

1

You need to convert it to a tibble first. I think this is because data.table has extra attributes

Have a look at the rownames,

rownames(as_tibble(dat))
 [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12"
rownames(dat)
 [1] "1" "2" "3" "4"

as_tibble(dat) |>
  mutate(type = as.character(type)) |> 
  mutate(across(1:4,
                ~ ifelse(is.na(.) & lag(areacode) == lead(areacode) & 
                           lag(as.numeric(substr(zipcode, 1, 2))) == lead(as.numeric(substr(zipcode, 1, 2))),
                         lag(.),
                         .)))

# A tibble: 12 x 5
   zipcode areacode type  region do_not_fill
     <dbl>    <dbl> <chr>  <dbl>       <dbl>
 1    1001        4 clay       3           1
 2    1002        4 clay       3          NA
 3    1003        4 clay       3          NA
 4    1004        4 clay       3           1
 5    1101        4 sand       3           1
 6    1102        4 sand       3          NA
 7    1103       NA NA        NA          NA
 8    1104        1 clay       3           1
 9    1201        4 clay       3          NA
10    1202        4 clay       3          NA
11    1203       NA NA        NA          NA
12    1302        4 clay       3           1
Sotos
  • 51,121
  • 6
  • 32
  • 66
  • 1
    Thank you very much Sotos. I notice that the code converts `type` to a number in my dataset. (i.e. `as.character(dat[1,3])` gives 1. I guess because that column was a factor. How do I prevent the code from doing this? NOTE: I am using `areacode` instead of `type`. – Tom Apr 21 '22 at 10:37
  • 1
    See also my edit now. – Tom Apr 21 '22 at 10:43
  • 1
    Fixed it! I changed the two values around – Tom Apr 21 '22 at 10:45
1

This can be done in data.table using the same code:

dat[, c(lapply(.SD, \(v) {fifelse(
  is.na(areacode) & lag(areacode) == lead(areacode) &
    lag(as.numeric(substr(zipcode, 1, 2))) == lead(as.numeric(substr(zipcode, 1, 2))), lag(v), v)}), 
  .SD[, .(do_not_fill)]), .SDcols = !patterns("do_not_fill")]


    zipcode areacode   type region do_not_fill
      <num>    <num> <fctr>  <num>       <num>
 1:    1001        4   clay      3           1
 2:    1002        4   clay      3          NA
 3:    1004        4   clay      3          NA
 4:    1004        4   clay      3           1
 5:    1101        4   sand      3           1
 6:    1102        4   sand      3          NA
 7:    1103       NA   <NA>     NA          NA
 8:    1104        1   clay      3           1
 9:    1201        4   clay      3          NA
10:    1202        4   clay      3          NA
11:    1203       NA   <NA>     NA          NA
12:    1302        4   clay      3           1
MAR
  • 43
  • 6