0

I have to replace the missing value to maximum (Value) by ID. How to do in R

ID Value
 1    NA
 5    15
 8    16
 6     8
 7    65
 8    NA
 5    25
 1    62
 6    14
 7    NA
 9    11
 8    12
 9    36
 1    26
 4    13
rafa.pereira
  • 13,251
  • 6
  • 71
  • 109
Praveen Kumar
  • 107
  • 1
  • 7
  • See these posts regarding grouping: [R Grouping functions: sapply vs. lapply vs. apply. vs. tapply vs. by vs. aggregate](http://stackoverflow.com/questions/3505701), [Aggregate multiple variables simultaneously](http://stackoverflow.com/questions/9723208), [How to sum a variable by group?](http://stackoverflow.com/q/1660124) – zx8754 Jun 20 '16 at 09:58
  • Welcome to Stack Overflow! Please read the info about [how to ask a good question](http://stackoverflow.com/help/how-to-ask) and how to give a [reproducible example](http://stackoverflow.com/questions/5963269). This will make it much easier for others to help you. – zx8754 Jun 20 '16 at 09:58
  • Possible duplicate of [R Grouping functions: sapply vs. lapply vs. apply. vs. tapply vs. by vs. aggregate](http://stackoverflow.com/questions/3505701/r-grouping-functions-sapply-vs-lapply-vs-apply-vs-tapply-vs-by-vs-aggrega) – zx8754 Jun 20 '16 at 09:59

4 Answers4

2

I would first precompute the max values using a call to aggregate(), and also precompute which rows of the data.frame have an NA value. Then you can match the IDs into the aggregation table to extract the corresponding max value.

maxes <- aggregate(Value~ID,df,max,na.rm=T);
nas <- which(is.na(df$Value));
df$Value[nas] <- maxes$Value[match(df$ID[nas],maxes$ID)];
df;
##    ID Value
## 1   1    62
## 2   5    15
## 3   8    16
## 4   6     8
## 5   7    65
## 6   8    16
## 7   5    25
## 8   1    62
## 9   6    14
## 10  7    65
## 11  9    11
## 12  8    12
## 13  9    36
## 14  1    26
## 15  4    13

Alternative, using ave():

df$Value <- ave(df$Value,df$ID,FUN=function(x) { x[is.na(x)] <- max(x,na.rm=T); x; });
df;
##    ID Value
## 1   1    62
## 2   5    15
## 3   8    16
## 4   6     8
## 5   7    65
## 6   8    16
## 7   5    25
## 8   1    62
## 9   6    14
## 10  7    65
## 11  9    11
## 12  8    12
## 13  9    36
## 14  1    26
## 15  4    13

Data

df <- data.frame(ID=c(1L,5L,8L,6L,7L,8L,5L,1L,6L,7L,9L,8L,9L,1L,4L),Value=c(NA,15L,16L,8L,
65L,NA,25L,62L,14L,NA,11L,12L,36L,26L,13L));

Benchmarking

Notes:

  • I had to modify bgoldst2(), rafa(), and akrun() to guard against the case of zero non-NAs in a group; otherwise, max(...,na.rm=T) returns -Inf which can mess up subsequent operations. I used the same algorithm for all three guards. thierry() and bgoldst1() did not have to be modified.
  • The large scale loop was fairly tricky to write, and I'm not going to attempt to explain all the details; feel free to ask follow-up questions if interested. Basically I ran 12 different benchmarks which varied by the number of groups and the frequency of NAs. The resulting table res shows the two parameters, the mean run-times for all solutions, and the unit chosen by the microbenchmark summarization algorithm.

library(microbenchmark);
library(dplyr);
library(data.table);
library(zoo);

thierry <- function(df) df %>% group_by(ID) %>% mutate(Value = ifelse(is.na(Value), max(Value, na.rm = TRUE), Value));
bgoldst1 <- function(df) { maxes <- aggregate(Value~ID,df,max,na.rm=T); nas <- which(is.na(df$Value)); df$Value[nas] <- maxes$Value[match(df$ID[nas],maxes$ID)]; df; };
bgoldst2 <- function(df) { df$Value <- ave(df$Value,df$ID,FUN=function(x) { nas <- is.na(x); if (any(!nas) && any(nas)) x[nas] <- max(x,na.rm=T); x; }); df; };
rafa <- function(dt) dt[ , Value := { nas <- is.na(Value); if (any(!nas) && any(nas)) ifelse( nas, max(Value, na.rm=T), Value) else Value; }, by = ID];
akrun <- function(dt) dt[, Value := { nas <- is.na(Value); if (any(!nas) && any(nas)) na.aggregate(Value, FUN = max) else Value; }, ID];

## small scale (OP's sample input)
df <- data.frame(ID=c(1L,5L,8L,6L,7L,8L,5L,1L,6L,7L,9L,8L,9L,1L,4L),Value=c(NA,15L,16L,8L,65L,NA,25L,62L,14L,NA,11L,12L,36L,26L,13L));
dt <- as.data.table(df);

ex <- as.data.frame(thierry(copy(df)));
identical(ex,bgoldst1(copy(df)));
identical(ex,bgoldst2(copy(df)));
identical(ex,as.data.frame(rafa(copy(dt))));
identical(ex,as.data.frame(akrun(copy(dt))));

microbenchmark(thierry(copy(df)),bgoldst1(copy(df)),bgoldst2(copy(df)),rafa(copy(dt)),akrun(copy(dt)));
## Unit: microseconds
##                expr      min        lq      mean   median       uq      max neval
##   thierry(copy(df))  955.804  989.1610 1043.2847 1004.984 1044.542 2852.016   100
##  bgoldst1(copy(df))  953.238 1005.1985 1069.6281 1039.410 1075.760 2968.337   100
##  bgoldst2(copy(df))  160.798  181.9665  196.0281  192.872  207.412  246.329   100
##      rafa(copy(dt))  947.679 1006.6945 1056.9396 1033.637 1055.874 2943.105   100
##     akrun(copy(dt)) 1327.862 1384.5255 1496.1259 1415.530 1445.894 3969.899   100

## large scale, 3 group sizes crossed with 4 NA densities
NV <- 1e5L;
NIs <- c(10L,1e3L,3e4L);
probNAs <- c(1e-3,0.05,0.4,0.95);
res <- expand.grid(NI=NIs,probNA=probNAs);
system.time({
    for (ri in seq_len(nrow(res))) {

        NI <- res$NI[ri];
        probNA <- res$probNA[ri];

        df <- data.frame(ID=sample(seq_len(NI),NV,T),Value=sample(c(NA,1:99),NV,T,c(probNA,rep((1-probNA)/99,99L))));
        dt <- as.data.table(df);

        ex <- as.data.frame(thierry(copy(df)));
        if (!all(c(
            identical(ex,bgoldst1(copy(df))),
            identical(ex,bgoldst2(copy(df))),
            identical(ex,as.data.frame(rafa(copy(dt)))),
            identical(ex,as.data.frame(akrun(copy(dt))))
        ))) stop('non-identical failure.');

        bm <- summary(microbenchmark(thierry(copy(df)),bgoldst1(copy(df)),bgoldst2(copy(df)),rafa(copy(dt)),akrun(copy(dt)),times=5L));

        nms <- sub('\\(.*','',as.character(bm$expr));
        for (nm in nms) if (!nm%in%names(res)) res[[nm]] <- NA_real_;
        if (!'unit'%in%names(res)) res$unit <- NA_character_;

        res[ri,nms] <- bm$mean;
        res$unit[ri] <- attr(bm,'unit');

    }; ## end for
});
##    user  system elapsed
##   73.18    0.00   73.37
res;
##       NI probNA    thierry  bgoldst1   bgoldst2       rafa       akrun         unit
## 1     10  0.001   7.850589 138.77128  14.867427   7.071150    8.023874 milliseconds
## 2   1000  0.001  40.318311 177.26223   9.868853   6.389129   18.054122 milliseconds
## 3  30000  0.001 813.204627 619.16166 125.274735  57.301590   74.732023 milliseconds
## 4     10  0.050   9.387743 139.41686  15.032158   8.479837    6.933616 milliseconds
## 5   1000  0.050  43.223697 156.79871  23.377797  20.550586  145.632279 milliseconds
## 6  30000  0.050 822.338773 677.81813 129.268155 114.585475  656.468438 milliseconds
## 7     10  0.400  15.955374 110.20717   9.785802  11.832889   10.511871 milliseconds
## 8   1000  0.400  55.858348 115.93900  14.441228  22.525058  142.740834 milliseconds
## 9  30000  0.400 853.571520 521.19690 147.925864 208.278328 2518.672465 milliseconds
## 10    10  0.950   9.768268  43.98346   5.921021   9.895623    8.571868 milliseconds
## 11  1000  0.950  49.228024  63.72596  13.702929  22.152230  143.606916 milliseconds
## 12 30000  0.950 822.033257 103.91700 113.398739  86.240922  630.982913 milliseconds
bgoldst
  • 34,190
  • 6
  • 38
  • 64
  • would you know why the following code doesn't work ? `library(data.table); setDT(df)[ , Value := ifelse( is.na(Value), max(Value), Value), by = ID]` – rafa.pereira Jun 20 '16 at 10:26
  • @rafa.pereira You have to add the `na.rm=T` argument to the `max()` call, otherwise any NA in the group will cause the return value of `max()` to be NA. – bgoldst Jun 20 '16 at 10:31
  • 1
    Oh! of course ! Thanks so much, again. I've posted an asnwer giving you credit. – rafa.pereira Jun 20 '16 at 10:35
  • Note that the copy() function is not required for the benchmarking. And it distorts the comparing a tiny bit. copy() takes about 3 microseconds. – Thierry Jun 20 '16 at 15:44
  • You can make the dplyr solution a bit faster by not using the pipes. `mutate(group_by(df, ID), Value = ifelse(is.na(Value), max(Value, na.rm = TRUE), Value))` gives me a speedup from 863 microseconds to 729 microseconds. – Thierry Jun 20 '16 at 15:45
  • @Thierry The `copy()` call is necessary for the data.table-based solutions, since they modify the data.table object `dt` in-place (by reference). Without a `copy()` call, the first execution of any data.table-based solution function (either `rafa()` or `akrun()` for this question) would replace all NAs in `dt`, and so none of the subsequent executions of those solution functions would be proper benchmarks of the test case under examination. And because the `copy()` call must be included for data.table-based solutions, for the sake of fairness, it must be included for *all* solutions. – bgoldst Jun 20 '16 at 16:42
1
library(dplyr)
dataset %>%
   group_by(ID) %>%
   mutate(
     Value = ifelse(
       is.na(Value), 
       max(Value, na.rm = TRUE), 
       Value
     )
   )
Thierry
  • 18,049
  • 5
  • 48
  • 66
  • Thanks @Thierry for this solution... I always prefer pipe functions for data cleaning and preparation... When I read the question, exactly same thing came to my mind as your answer... – Manoj Kumar Jun 20 '16 at 10:43
1

A simple and fast solution using data.table. Thanks @bgoldst for the tip of including na.rm=T.

library(data.table)

setDT(df)[ , Value := ifelse( is.na(Value), max(Value, na.rm=T), Value), by = ID]
rafa.pereira
  • 13,251
  • 6
  • 71
  • 109
1

We can use na.aggregate with data.table

library(data.table)
library(zoo)   
setDT(df)[, Value := na.aggregate(Value, FUN = max) , by = ID]
df
#    ID Value
# 1:  1    62
# 2:  5    15
# 3:  8    16
# 4:  6     8
# 5:  7    65
# 6:  8    16
# 7:  5    25
# 8:  1    62
# 9:  6    14
#10:  7    65
#11:  9    11
#12:  8    12
#13:  9    36
#14:  1    26
#15:  4    13
akrun
  • 874,273
  • 37
  • 540
  • 662