CustID <- c(seq(1,10,1),seq(1,5,1))
EndDate <- c(Sys.Date(),rep(seq(Sys.Date(),Sys.Date()+6, 1),2))
# Let's assume you're starting with a list
data <- list(CustID, EndDate)
The list looks like this:
[[1]]
[1] 1 2 3 4 5 6 7 8 9 10 1 2 3 4 5
[[2]]
[1] "2016-09-06" "2016-09-06" "2016-09-07" "2016-09-08" "2016-09-09" "2016-09-10" "2016-09-11" "2016-09-12" "2016-09-06"
[10] "2016-09-07" "2016-09-08" "2016-09-09" "2016-09-10" "2016-09-11" "2016-09-12"
# To make matching CustID and EndDate easy let's change it to a DF
df1 <- as.data.frame(data)
colnames(df1) <- c("CustID", "EndDate")
The data.frame looks like this:
CustID EndDate
1 1 2016-09-06
2 2 2016-09-06
3 3 2016-09-07
4 4 2016-09-08
5 5 2016-09-09
6 6 2016-09-10
7 7 2016-09-11
8 8 2016-09-12
9 9 2016-09-06
10 10 2016-09-07
11 1 2016-09-08
12 2 2016-09-09
13 3 2016-09-10
14 4 2016-09-11
15 5 2016-09-12
# Find duplicated CustID
dupID <- duplicated(df1$CustID)
dupdf <- df1[df1$CustID %in% df1$CustID[dupID],]
# Remove the entry with the oldest EndDate for each ID
res <- data.frame(CustID=NA, EndDate = as.Date(NA))
for(i in unique(dupdf$CustID)){
tmp <- dupdf[dupdf$CustID == i, ]
res <- rbind(res,tmp[!tmp$EndDate == min(tmp$EndDate),])
}
res <- res[!is.na(res$EndDate),]
The result (res
) has the repeated customer ID's (custID
) with the oldest EndDate
removed for each ID:
CustID EndDate
11 1 2016-09-08
12 2 2016-09-09
13 3 2016-09-10
14 4 2016-09-11
15 5 2016-09-12
If you want a vectorized solution you might use data.table
:
require(data.table)
dupdf <- data.table(dupdf)
dupdf[,.(
EndDate = max(EndDate)
), by = CustID]
A suggestion from the comments is
data <- as.data.frame(data)
subset(data, as.logical(ave(as.numeric(EndDate), CustID, FUN = function(x) {
length(x) == 1L | x != min(x)
})))