Below, I have a base R
solution that is much faster than myfun
provided by the OP.
myDistOne <- function(m) {
v1 <- m[,1L]; v2 <- m[,2L]
rs <- rowSums(m)
lapply(seq_along(rs), function(x) {
t1 <- which(abs(rs[x] - rs) == 1)
t2 <- t1[which(abs(v1[x] - v1[t1]) <= 1)]
t2[which(abs(v2[x] - v2[t2]) <= 1)]
})
}
Here are some benchmarks:
library(microbenchmark)
set.seed(9711)
m1 <- matrix(sample(50, 2000, replace = TRUE), ncol = 2) ## 1,000 rows
microbenchmark(myfun(m1), myDistOne(m1))
Unit: milliseconds
expr min lq mean median uq max neval cld
myfun(m1) 78.61637 78.61637 80.47931 80.47931 82.34225 82.34225 2 b
myDistOne(m1) 27.34810 27.34810 28.18758 28.18758 29.02707 29.02707 2 a
identical(myfun(m1), myDistOne(m1))
[1] TRUE
m2 <- matrix(sample(200, 20000, replace = TRUE), ncol = 2) ## 10,000 rows
microbenchmark(myfun(m2), myDistOne(m2))
Unit: seconds
expr min lq mean median uq max neval cld
myfun(m2) 5.219318 5.533835 5.758671 5.714263 5.914672 7.290701 100 b
myDistOne(m2) 1.230721 1.366208 1.433403 1.419413 1.473783 1.879530 100 a
identical(myfun(m2), myDistOne(m2))
[1] TRUE
Here is a very large example:
m3 <- matrix(sample(1000, 100000, replace = TRUE), ncol = 2) ## 50,000 rows
system.time(testJoe <- myDistOne(m3))
user system elapsed
26.963 10.988 37.973
system.time(testUser <- myfun(m3))
user system elapsed
148.444 33.297 182.639
identical(testJoe, testUser)
[1] TRUE
I'm sure there is a faster solution. Maybe by sorting the rowSums upfront and working from there could see an improvement (it could also get very messy).
Update
As I predicted, working from a sorted rowSums is much faster (and uglier!)
myDistOneFast <- function(m) {
v1 <- m[,1L]; v2 <- m[,2L]
origrs <- rowSums(m)
mySort <- order(origrs)
rs <- origrs[mySort]
myDiff <- c(0L, diff(rs))
brks <- which(myDiff > 0L)
lenB <- length(brks)
n <- nrow(m)
myL <- vector("list", length = n)
findRows <- function(v, s, r, u1, u2) {
lapply(v, function(x) {
sx <- s[x]
tv1 <- s[r]
tv2 <- tv1[which(abs(u1[sx] - u1[tv1]) <= 1)]
tv2[which(abs(u2[sx] - u2[tv2]) <= 1)]
})
}
t1 <- brks[1L]; t2 <- brks[2L]
## setting first index in myL
myL[mySort[1L:(t1-1L)]] <- findRows(1L:(t1-1L), mySort, t1:(t2-1L), v1, v2)
k <- t0 <- 1L
while (k < (lenB-1L)) {
t1 <- brks[k]; t2 <- brks[k+1L]; t3 <- brks[k+2L]
vec <- t1:(t2-1L)
if (myDiff[t1] == 1L) {
if (myDiff[t2] == 1L) {
myL[mySort[vec]] <- findRows(vec, mySort, c(t0:(t1-1L), t2:(t3-1L)), v1, v2)
} else {
myL[mySort[vec]] <- findRows(vec, mySort, t0:(t1-1L), v1, v2)
}
} else if (myDiff[t2] == 1L) {
myL[mySort[vec]] <- findRows(vec, mySort, t2:(t3-1L), v1, v2)
}
if (myDiff[t2] > 1L) {
if (myDiff[t3] > 1L) {
k <- k+2L; t0 <- t2
} else {
k <- k+1L; t0 <- t1
}
} else {k <- k+1L; t0 <- t1}
}
## setting second to last index in myL
if (k == lenB-1L) {
t1 <- brks[k]; t2 <- brks[k+1L]; t3 <- n+1L; vec <- t1:(t2-1L)
if (myDiff[t1] == 1L) {
if (myDiff[t2] == 1L) {
myL[mySort[vec]] <- findRows(vec, mySort, c(t0:(t1-1L), t2:(t3-1L)), v1, v2)
} else {
myL[mySort[vec]] <- findRows(vec, mySort, t0:(t1-1L), v1, v2)
}
} else if (myDiff[t2] == 1L) {
myL[mySort[vec]] <- findRows(vec, mySort, t2:(t3-1L), v1, v2)
}
k <- k+1L; t0 <- t1
}
t1 <- brks[k]; vec <- t1:n
if (myDiff[t1] == 1L) {
myL[mySort[vec]] <- findRows(vec, mySort, t0:(t1-1L), v1, v2)
}
myL
}
The results are not even close. myDistOneFast
is over 100x faster than the OP's original myfun
on very large matrices and also scales well. Below are some benchmarks:
microbenchmark(OP = myfun(m1), Joe = myDistOne(m1), JoeFast = myDistOneFast(m1))
Unit: milliseconds
expr min lq mean median uq max neval
OP 57.60683 59.51508 62.91059 60.63064 61.87141 109.39386 100
Joe 22.00127 23.11457 24.35363 23.87073 24.87484 58.98532 100
JoeFast 11.27834 11.99201 12.59896 12.43352 13.08253 15.35676 100
microbenchmark(OP = myfun(m2), Joe = myDistOne(m2), JoeFast = myDistOneFast(m2))
Unit: milliseconds
expr min lq mean median uq max neval
OP 4461.8201 4527.5780 4592.0409 4573.8673 4633.9278 4867.5244 100
Joe 1287.0222 1316.5586 1339.3653 1331.2534 1352.3134 1524.2521 100
JoeFast 128.4243 134.0409 138.7518 136.3929 141.3046 172.2499 100
system.time(testJoeFast <- myDistOneFast(m3))
user system elapsed
0.68 0.00 0.69 ### myfun took over 100s!!!
To test equality, we have to sort each vector of indices. We also can't use identical
for comparison as myL
is initialized as an empty list, thus some of the indices contain NULL
values (these correspond to integer(0)
in the result from myfun
and myDistOne
).
testJoeFast <- lapply(testJoeFast, sort)
all(sapply(1:50000, function(x) all(testJoe[[x]]==testJoeFast[[x]])))
[1] TRUE
unlist(testJoe[which(sapply(testJoeFast, is.null))])
integer(0)
Here is an example with 500,000 rows:
set.seed(42)
m4 <- matrix(sample(2000, 1000000, replace = TRUE), ncol = 2)
system.time(myDistOneFast(m4))
user system elapsed
10.84 0.06 10.94
Here is an overview of how the algorithm works:
- Calculate rowSums
- Order the rowSums (i.e. returns the indices from the original vector of the sorted vector)
- Call diff
- Mark each non-zero instance
- Determine which indices in small range satisfy the OP's request
- Use the ordered vector calculated in
2
to determine original index
This is much faster than comparing one rowSum to all of the rowSum every time.