2

I have the following dataframe in R.

 df <- data.frame(
  "DateValue" = c("2016-07-01", "2016-07-02", "2016-07-03", "2016-07-04", "2016-07-05", "2016-07-06","2017-07-01", "2017-07-02", "2017-07-03", "2017-07-04", "2017-07-05", "2017-07-06", "2018-07-01", "2018-07-02", "2018-07-03", "2018-07-04", "2018-07-05", "2018-07-06"),  
  "Age1" = seq(1:18),
  "Age2" = c(seq(14,36,2), rep(NA, 6)),
  "Age3" = c(seq(45,50),rep(NA, 12))
)

DateValue Age1 Age2 Age3
# 1  2016-07-01    1   14   45
# 2  2016-07-02    2   16   46
# 3  2016-07-03    3   18   47
# 4  2016-07-04    4   20   48
# 5  2016-07-05    5   22   49
# 6  2016-07-06    6   24   50
# 7  2017-07-01    7   26   NA
# 8  2017-07-02    8   28   NA
# 9  2017-07-03    9   30   NA
# 10 2017-07-04   10   32   NA
# 11 2017-07-05   11   34   NA
# 12 2017-07-06   12   36   NA
# 13 2018-07-01   13   NA   NA
# 14 2018-07-02   14   NA   NA
# 15 2018-07-03   15   NA   NA
# 16 2018-07-04   16   NA   NA
# 17 2018-07-05   17   NA   NA
# 18 2018-07-06   18   NA   NA

I am trying to come up with a code that aligns the data from the "Age2" and "Age3" columns so that the dates line up. Below is the output I am looking for:

df <- data.frame(
  "DateValue" = c("07-01", "07-02", "07-03", "07-04", "07-05", "07-06"), 
  "Age1" = seq(13:18),
  "Age2" = seq(26,36,2),
  "Age3" = seq(45,50)
) 

#  DateValue Age1 Age2 Age3
# 1     07-01   13   26   45
# 2     07-02   14   28   46
# 3     07-03   15   30   47
# 4     07-04   16   32   48
# 5     07-05   17   34   49
# 6     07-06   18   36   50

I am essentially keeping all the dates and values for my current year (2018) and matching them with the dates for the previous years. Note that I may have more dates in my previous year. But I need to drop all the rows that do not have any data for the current year. I reviewed the following thread on SO on rearranging the dataframe but the context is quite different than my situation. R Data Rearrange

I tried looking at the R reshape package but haven't had any luck. Any suggestions/ pointers would be appreciated.

ds_student
  • 183
  • 1
  • 2
  • 14
  • 2
    Your output looks like the first 6 rows in your original df. What exactly should be done? Please provide some explanation. – tmfmnk Aug 29 '19 at 20:33
  • 1
    I don't understand what you are asking. You seem to be just dropping rows where any value of age is missing. Is that what you intend? – MrFlick Aug 29 '19 at 20:33
  • I am sorry. Let me correct the desired output. – ds_student Aug 29 '19 at 20:37

3 Answers3

2

Here's a non-robust solution:

df$DateValue = format(as.Date(df$DateValue), '%m-%d')
Age3_non_NA <- sum(!is.na(df[['Age3']]))

df <- as.data.frame(lapply(df, function(l) tail(na.omit(l), Age3_non_NA)))
df
  DateValue Age1 Age2 Age3
1     07-01   13   26   45
2     07-02   14   28   46
3     07-03   15   30   47
4     07-04   16   32   48
5     07-05   17   34   49
6     07-06   18   36   50

And here's a more robust solution that includes gather and spread:

library(tidyr)
library(dplyr)
library(lubridate)

df%>%
  mutate(DateValue = as.Date(DateValue),
         Year = year(DateValue),
         Mon_Day = format(DateValue, '%m-%d'))%>%
  select(-DateValue)%>%
  gather(Age, val, -Year, -Mon_Day, na.rm = T)%>%
  group_by(Age, Mon_Day)%>%
  filter(Year == max(Year))%>%
  ungroup()%>%
  select(-Year)%>%
  spread(Age, val)

# A tibble: 6 x 4
  Mon_Day  Age1  Age2  Age3
  <chr>   <dbl> <dbl> <dbl>
1 07-01      13    26    45
2 07-02      14    28    46
3 07-03      15    30    47
4 07-04      16    32    48
5 07-05      17    34    49
6 07-06      18    36    50
Cole
  • 11,130
  • 1
  • 9
  • 24
1

Here's one way to do it. This could definitely be refactored, but it works.

library(dplyr)

# DateValue is a factor; convert to date format
df$DateValue <- as.Date(as.character(df$DateValue), format="%Y-%m-%d")

# grab the month and day from DateValue, sort by Age1
df <- df %>%
    mutate(MonthAndDay = format(DateValue, "%m-%d")) %>%
    arrange(desc(Age1))

# get vector of dates
dates <- df$MonthAndDay[which(!duplicated(df$MonthAndDay))]
# define age columns
agecols <- c("Age1","Age2","Age3")
# initialize empty df to be populated in loop
temp <- data.frame(MonthAndDay = dates)

# for each column, get values that a) are in the target dates, b) aren't NA, and c) only get the first ones (not duplicates--that's why we arranged by Age1 before). Select the values and add them as a new column to the new dataframe.
for (col in agecols) {
    temp_col <- filter(df, MonthAndDay %in% dates & !is.na(df[,col]))
    temp_col <- filter(temp_col[-which(duplicated(df$MonthAndDay)), ]) %>%
        select(col)
    temp[,col] <- temp_col
}
temp %>% arrange(MonthAndDay)

#   MonthAndDay Age1 Age2 Age3
# 1       07-01   13   26   45
# 2       07-02   14   28   46
# 3       07-03   15   30   47
# 4       07-04   16   32   48
# 5       07-05   17   34   49
# 6       07-06   18   36   50

heds1
  • 3,203
  • 2
  • 17
  • 32
1

Using base R, here is one way

#Get age columns    
age_cols <- grep("^Age", names(df))
#Convert date to actual object
df$DateValue <- as.Date(df$DateValue)
#Get year from date 
df$year <- as.integer(format(df$DateValue, "%Y"))
#Get month-date from Date
df$month_date <- format(df$DateValue, "%m-%d")

#Select dates which are present in max year
subset_date <- with(df, month_date[year == max(year)])

#For each age_cols select the non NA values which match subset_date
cbind.data.frame(DateValue = subset_date, 
   sapply(df[age_cols], function(x) {
     x <- x[order(df$year, decreasing = TRUE)]
     x <- x[!is.na(x)]
     x[match(subset_date, df$month_date)]
}))

#  DateValue Age1 Age2 Age3
#1     07-01   13   26   45
#2     07-02   14   28   46
#3     07-03   15   30   47
#4     07-04   16   32   48
#5     07-05   17   34   49
#6     07-06   18   36   50
Ronak Shah
  • 377,200
  • 20
  • 156
  • 213