In case the order of the names does not change .*
could be inserted between the names and then use grep
in both directions (df1$names in df2$names and df2$names in df1$names) and combine them with or |
.
transform(df1, val = c("no", "yes")[1+(sapply(gsub(" +", ".*", df1$name),
\(x) any(grep(x, df2$name)), USE.NAMES = FALSE) |
Reduce(\(y, x) y | grepl(x, df1$name), gsub(" +", ".*", df2$name), FALSE))])
# name val
#1 JUAN GIRON yes
#2 GINA OLEAS no
#3 JUAN FERNANDO ELIZAGA yes
#4 MARCO TORRES no
#5 JUAN PABLO GONZALEZ yes
#6 IRMA GOMEZ yes
In case no partly match of the names are allowed surround the names with \b
.
transform(df1, val = c("no", "yes")[1+(sapply(
gsub(" *(\\b[^ ]+\\b)", ".*\\\\b\\1\\\\b", df1$name),
\(x) any(grep(x, df2$name)), USE.NAMES = FALSE) |
Reduce(\(y, x) y | grepl(x, df1$name),
gsub(" *(\\b[^ ]+\\b)", ".*\\\\b\\1\\\\b", df2$name), FALSE))])
In case the order can chaange make a positive look ahead by placing the name in (?=.*NAMME)
or also surround the name with \b
(?=.*\\bNAME\\b)
.
transform(df1, val = c("no", "yes")[1+(sapply(
gsub(" *(\\b[^ ]+\\b)", "(?=.*\\\\b\\1\\\\b)", df1$name),
\(x) any(grep(x, df2$name, perl=TRUE)), USE.NAMES = FALSE) |
Reduce(\(y, x) y | grepl(x, df1$name, perl=TRUE),
gsub(" *(\\b[^ ]+\\b)", "(?=.*\\\\b\\1\\\\b)", df2$name), FALSE))] )
Its also possible to use agrepl
and allow deletions which will be similar to the version assuming that the order of the names does not change and part matches of the name are allowed.
transform(df1, val = c("no", "yes")[1+(
sapply(df1$name, \(x) any(agrepl(x, df2$name,
list(cost=99, insertions=0, deletions=99, substitutions=0)))) |
Reduce(\(y, x) y | agrepl(x, df1$name, list(cost=99, insertions=0,
deletions=99, substitutions=0)), df2$name, FALSE))])
Another option can be the usage of look up tables:
s1 <- strsplit(df1$name, " ", TRUE)
lup1 <- list2env(split(rep(seq_along(s1), lengths(s1)), unlist(s1)))
s2 <- strsplit(df2$name, " ", TRUE)
lup2 <- list2env(split(rep(seq_along(s2), lengths(s2)), unlist(s2)))
`[<-`(sapply(s1, \(x) any(Reduce(intersect, mget(x, lup2, ifnotfound =
list(NULL))))), unlist(lapply(s2, \(x) Reduce(intersect, mget(x, lup1,
ifnotfound = list(NULL))))), TRUE)
#[1] TRUE FALSE TRUE FALSE TRUE TRUE
Benchmark:
Its also possible to limit the comparisons only to those which didn't have a match (GKi1b) where maybe the usage of indices using which
instead of using the logical vector twice could fuhrer improve and making an exit of the loop in case all have a hit. In case the names are not unique use unique
on the names.
library(dplyr)
bench::mark(
Thomas = df1 %>%
mutate(val = c("no", "yes")[1 + (rowSums(
outer(
strsplit(name, "\\s+"),
strsplit(df2$name, "\\s+"),
Vectorize(function(x, y) all(x %in% y) | all(y %in% x))
)
) > 0)]),
GKi1 = transform(df1, val = c("no", "yes")[1+(sapply(gsub(" +", ".*", df1$name),
\(x) any(grep(x, df2$name)), USE.NAMES = FALSE) |
Reduce(\(y, x) y | grepl(x, df1$name), gsub(" +", ".*", df2$name), FALSE))]),
GKi1b = transform(df1, val = c("no", "yes")[1 +
Reduce(\(i, x) `[<-`(i, !i, grepl(x, df1$name[!i])), gsub(" +", ".*",
df2$name), sapply(gsub(" +", ".*", df1$name), \(x) any(grep(x, df2$name)),
USE.NAMES = FALSE)) ]),
GKi1c = transform(df1, val = c("no", "yes")[1+(sapply(gsub(" +", ".*", df1$name),
\(x) any(grep(x, df2$name)), USE.NAMES = FALSE) |
grepl(paste(gsub(" +", ".*", df2$name), collapse = "|"), df1$name) )]),
GKi2 = transform(df1, val = c("no", "yes")[1+(sapply(
gsub(" *(\\b[^ ]+\\b)", ".*\\\\b\\1\\\\b", df1$name),
\(x) any(grep(x, df2$name)), USE.NAMES = FALSE) |
Reduce(\(y, x) y | grepl(x, df1$name),
gsub(" *(\\b[^ ]+\\b)", ".*\\\\b\\1\\\\b", df2$name), FALSE))]),
GKi3 = transform(df1, val = c("no", "yes")[1+(sapply(
gsub(" *(\\b[^ ]+\\b)", "(?=.*\\\\b\\1\\\\b)", df1$name),
\(x) any(grep(x, df2$name, perl=TRUE)), USE.NAMES = FALSE) |
Reduce(\(y, x) y | grepl(x, df1$name, perl=TRUE),
gsub(" *(\\b[^ ]+\\b)", "(?=.*\\\\b\\1\\\\b)", df2$name), FALSE))] ),
GKi4 = transform(df1, val = c("no", "yes")[1+(
sapply(df1$name, \(x) any(agrepl(x, df2$name,
list(cost=99, insertions=0, deletions=99, substitutions=0)))) |
Reduce(\(y, x) y | agrepl(x, df1$name, list(cost=99, insertions=0,
deletions=99, substitutions=0)), df2$name, FALSE))]),
GKi5 = {
s1 <- strsplit(df1$name, " ", TRUE)
lup1 <- list2env(split(rep(seq_along(s1), lengths(s1)), unlist(s1)))
s2 <- strsplit(df2$name, " ", TRUE)
lup2 <- list2env(split(rep(seq_along(s2), lengths(s2)), unlist(s2)))
transform(df1, val = c("no", "yes")[1+`[<-`(sapply(s1, \(x) any(Reduce(base::intersect, mget(x, lup2, ifnotfound =
list(NULL))))), unlist(lapply(s2, \(x) Reduce(base::intersect, mget(x, lup1,
ifnotfound = list(NULL))))), TRUE)])
}
)
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time
<bch:expr> <bch:tm> <bch:> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm>
1 Thomas 863µs 894µs 919. 4.08KB 21.4 429 10 467ms
2 GKi1 211µs 218µs 3770. 0B 14.6 1803 7 478ms
3 GKi1b 211µs 226µs 3020. 0B 14.6 1448 7 479ms
4 GKi1c 183µs 200µs 3424. 0B 10.3 1667 5 487ms
5 GKi2 262µs 275µs 2755. 0B 12.4 1336 6 485ms
6 GKi3 391µs 409µs 2010. 0B 9.19 875 4 435ms
7 GKi4 374µs 386µs 2295. 0B 16.5 1110 8 484ms
8 GKi5 272µs 285µs 2570. 2.82KB 21.1 1220 10 475ms
All variants are more than 2 times faster than ThomasIsCoding using one CPU-core.