The solution below makes use of combinations from the gtools package as well as count from the plyr package.
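To make those two building blocks concrete, here is a minimal sketch using toy inputs of my own (not part of the benchmark data further down):
## gtools::combinations(n, r) enumerates the r-element subsets of 1:n, one per row
gtools::combinations(3, 2)
#      [,1] [,2]
# [1,]    1    2
# [2,]    1    3
# [3,]    2    3
## plyr::count tallies how many times each distinct value occurs
plyr::count(c("A,B", "A,B", "B,C"))
#     x freq
# 1 A,B    2
# 2 B,C    1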
library(gtools)
library(plyr)
PairWiseCombo <- function(df) {
    ## assumes df has columns ID and Code, the rows for each ID are contiguous,
    ## and every ID has at least two Codes (otherwise the myCombs indexing fails)
    myID <- df$ID
    BreakDown <- rle(myID)
    Unis <- BreakDown$values
    numUnis <- BreakDown$lengths
    Len <- length(Unis)
    e <- cumsum(numUnis)      ## last row of each ID block
    s <- c(1L, e + 1L)        ## first row of each ID block
    ## more efficient to generate the combination templates once, outside of
    ## the "do.call(c, lapply(.." below. This allows me to reference a
    ## particular combination rather than re-generating the same combination
    ## multiple times
    myCombs <- lapply(2:max(numUnis), function(x) combinations(x, 2L))
    tempDF <- plyr::count(do.call(c, lapply(1:Len, function(i) {
        myRange <- s[i]:e[i]
        combs <- myCombs[[numUnis[i] - 1L]]
        ## build one sorted, comma-separated key per pair of Codes within this ID
        vapply(1:nrow(combs), function(j) {
            paste(sort(df$Code[myRange[combs[j, ]]]), collapse = ",")
        }, "A,D")
    })))
    names(tempDF) <- c("Code.Combinations", "Count.of.ID")
    tempDF
}
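As a quick sanity check, here is a toy example of my own (not part of the benchmarks below); each pair of Codes is counted once per ID that contains both, so the output should look roughly like this:
toy <- data.frame(ID = c(1L, 1L, 1L, 2L, 2L),
                  Code = c("A", "B", "C", "A", "B"),
                  stringsAsFactors = FALSE)
PairWiseCombo(toy)
#   Code.Combinations Count.of.ID
# 1               A,B           2
# 2               A,C           1
# 3               B,C           1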
Below are some metrics on a larger test data frame (100 IDs, each with between 2 and 26 Codes). I didn't test the solution by @Carl, as it was giving different results from the other solutions.
set.seed(537)
ID <- do.call(c, lapply(1:100, function(x) rep(x, sample(2:26,1))))
temp <- rle(ID)
Code <- do.call(c, lapply(1:100, function(x) LETTERS[sample(temp$lengths[x])]))
TestDF <- data.frame(ID, Code, stringsAsFactors = FALSE)
system.time(t1 <- Noah(TestDF))
user system elapsed
97.05 0.31 97.42
system.time(t2 <- DTSolution(TestDF))
user system elapsed
0.43 0.00 0.42
system.time(t3 <- PairWiseCombo(TestDF))
user system elapsed
0.42 0.00 0.42
identical(sort(t3[,2]), sort(t2$IdCount))
[1] TRUE
identical(sort(t3[,2]), sort(t1[,2]))
[1] TRUE
Using microbenchmark, we have:
library(microbenchmark)
microbenchmark(Joseph = PairWiseCombo(TestDF), Psidom = DTSolution(TestDF), times = 10L)
Unit: milliseconds
expr min lq mean median uq max neval
Joseph 420.1090 433.9471 442.0133 446.4880 450.4420 452.7852 10
Psidom 396.8444 413.4933 416.3315 418.5573 420.9669 423.6303 10
Overall, the data.table solution provided by @Psidom was the fastest (not surprisingly). My solution and the data.table solution performed similarly on really large examples. However, the solution provided by @Noah is extremely memory-intensive and couldn't be tested on larger data frames.
sessionInfo()
R version 3.3.0 (2016-05-03)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 7 x64 (build 7601) Service Pack 1
Update
After tweaking @Carl's solution, the dplyr approach is by far the fastest. Below is the code (you will see which parts I altered):
library(dplyr)

DPLYRSolution <- function(df) {
    df <- df %>% full_join(df, by = "ID") %>% group_by(Code.x, Code.y) %>%
        summarise(length(unique(ID))) %>% filter(Code.x != Code.y)
    ## These two lines were added by me to remove "duplicate" rows
    ## ("A B" and "B A" describe the same unordered pair)
    df <- mutate(df, Code = ifelse(Code.x < Code.y, paste(Code.x, Code.y), paste(Code.y, Code.x)))
    df[which(!duplicated(df$Code)), ]
}
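To see why those two added lines are needed, here is a short sketch on toy data of my own: the self-join on ID produces every ordered pair of Codes, so each unordered pair initially shows up twice.
library(dplyr)
toy <- data.frame(ID = c(1L, 1L, 2L, 2L), Code = c("A", "B", "A", "B"),
                  stringsAsFactors = FALSE)
toy %>% full_join(toy, by = "ID") %>% group_by(Code.x, Code.y) %>%
    summarise(length(unique(ID))) %>% filter(Code.x != Code.y)
## both (A, B) and (B, A) survive with a count of 2; the mutate()/duplicated()
## step above keeps only one row per unordered pair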
Below are the new metrics:
system.time(t4 <- DPLYRSolution(TestDF))
user system elapsed
0.03 0.00 0.03 ### Wow!!! really fast
microbenchmark(Joseph = PairWiseCombo(TestDF), Psidom = DTSolution(TestDF),
Carl = DPLYRSolution(TestDF), times = 10L)
Unit: milliseconds
expr min lq mean median uq max neval
Joseph 437.87235 442.7348 450.91085 452.77204 457.09465 461.85035 10
Psidom 407.81519 416.9444 422.62793 425.26041 429.02064 434.38881 10
Carl 44.33698 44.8066 48.39051 45.35073 54.06513 59.35653 10
## Equality Check
identical(sort(c(t4[,3])[[1]]), sort(t1[,2]))
[1] TRUE