1
dx = data.frame(
  Day = c(3, 3, 4, 4, 5, 6, 7, 7, 7),
  Name = c("Amy", "Bob", "Jim", "Amy", "Bob", "Jim", "Amy", "Bob", "Jim"),
  Result = c(11:19)
)

   Day Name Result
   3  Amy     11
   3  Bob     12
   4  Jim     13
   4  Amy     14
   5  Bob     15
   6  Jim     16
   7  Amy     17
   7  Bob     18
   7  Jim     19

Here is a table that shows the trial results of 3 subjects on specific days. How can I create a summary table that shows the average latest-observed result before that day, (as a comparison for the upcoming results)? For example, the average latest-observed result for Day 6 would be Amy's Day 4, Bob's Day 5, Jim's Day 4. For Day 7, it would be Amy/4, Bob/5, Jim/6.

This is the solution I have but a for loop seems inefficient if I were to have thousands of days and subjects.

output = data.frame(Day = unique(dx$Day)) #Extract unique days
for (i in 1:nrow(output)) {
  dfTemp = dx[dx$Day < dx2[i, "Day"],] #Find all results prior to day
  dfTemp = dfTemp[with(dfTemp, order(Name, -Day)),] #Sort descending by day
  dfTemp = dfTemp[match(unique(dfTemp$Name), dfTemp$Name),] # Take only the 1st row for each person (will be latest result due to sorting above)
  output[i, "AvgLatestResult"] = mean((dfTemp[, "Result"])) #Find mean
}

 Day AvgLatestResult 
 3   NaN  
 4   11.5
 5   13.0
 6   14.0
 7   15.0
elai
  • 23
  • 6

3 Answers3

1

You may cast from long to wide (dcast), fill NA with last previous values (na.locf), take row means of all but first row (rowMeans(head(..., -1)..., pad with NA.

library(zoo)
library(data.table)

c(NA, rowMeans(head(na.locf(dcast(dx, Day ~ Name, value.var = "Result")[ , -1]), -1), na.rm = TRUE))
# [1]   NA 11.5 13.0 14.0 15.0

Or expand 'dx' with one row per 'Day' and 'Name' (CJ), fill missing values by 'Name' (na.locf), create a "day index", calculate mean of previous day.

setDT(dx)
d2 <- dx[CJ(Day = unique(dx$Day), Name = unique(dx$Name)), on = .(Day, Name)]
d2[ , Result2 := na.locf(Result, na.rm = FALSE), by = Name]
d2[ , ix := .GRP, by = Day]
d2[ , .(avg = mean(d2[ix == .GRP - 1, Result2], na.rm = TRUE)), by = Day]
#    Day  avg
# 1:   3  NaN
# 2:   4 11.5
# 3:   5 13.0
# 4:   6 14.0
# 5:   7 15.0
Henrik
  • 65,555
  • 14
  • 143
  • 159
1

Read dx creating a zoo object z splitting on Name to create a 5 x 3 wide form object with Days as rows and Names as columns. Then use na.locf to fill in NA values and rollapply with list(-1), meaning use prior value, and mean. This gives a zoo object AvgLatest which we can optionally convert to a data frame using fortify.zoo. (If a zoo object is OK as the result then omit the fortify.zoo line.)

library(zoo)

z <- read.zoo(dx, split = "Name")
z.fill <- na.locf(z, na.rm = FALSE)
AvgLatest <- rollapply(z.fill, list(-1), mean, na.rm = TRUE, by.column = FALSE, fill = NA)
fortify.zoo(AvgLatest, names = "Day")  # optional

giving:

  Day AvgLatest
1   3        NA
2   4      11.5
3   5      13.0
4   6      14.0
5   7      15.0

Variation

This code could be expressed using a magrittr pipeline like this:

library(zoo)
library(magrittr)

dx %>%
   read.zoo(split = "Name") %>%
   na.locf(na.rm = FALSE) %>%
   rollapply(list(-1), mean, na.rm = TRUE, by.column = FALSE, fill = NA) %>%
   fortify.zoo(names = "Day")  # optional
G. Grothendieck
  • 254,981
  • 17
  • 203
  • 341
0

For larger dataframes, this could be a bit faster:

# https://stackoverflow.com/questions/7735647/replacing-nas-with-latest-non-na-value
repeat.before = function(x) { 
  ind = which(!is.na(x))     
  if(is.na(x[1]))            
    ind = c(1,ind)       
  rep(x[ind], times = diff(   
    c(ind, length(x) + 1) )) 
}  

day_seq <- data.frame(Day = seq(min(dx$Day), max(dx$Day)))

out <- c(NA,
  rowMeans(
    do.call(
      cbind, by(dx, dx$Name, function(x) {
        out <- merge(x, day_seq, by.x = "Day", by.y = "Day", all.x = TRUE, all.y = TRUE)
        out$Name <- x$Name[1]
        out$Result <- repeat.before(out$Result)
        })),
    na.rm = TRUE))

out[-length(out)]

NA 11.5 13.0 14.0 15.0
erocoar
  • 5,723
  • 3
  • 23
  • 45