The hint in your last comment that you are looking for a function to find synonyms was helpful to provide an answer. The following answer is certainly not the most elegant and also not the fastest. If you have a large number of synonyms to find, you might have a look at solutions with data.table
, which is the fastest package for such lookup tasks. However, to get you going, I will only use base R. (Please note, I have rewritten your data, since the structure of your df did not make sense to me, I hope that my understanding of your data is correct).
UPDATE: Introduced additional options based on search in collapsed rows, one with 'stringi' package, and a data.table
appraoch. Also added a "string7"
that will not be matched. Furthermore, introduced benchmarks using microbenchmark
. My data.table
approach seems not the best, furthermore, the package comes with certain overhead for small data sets, only with larger data sets its advantage in speed will be evident. You might provide a larger reproducible example, maybe based on the functions used in this question link for a better comparison. However, it should be noted, that data.table may only be faster for exact matching, check ?%like%
. The other functions I tested would have to be adapted for a fair comparison, maybe also using the package fmatch
.
strings <- c("string1", "string2", "string3", "string4", "string5", "string6", "string7")
df <- read.table(text = "col1 col2 col3
string1 x x
string2 string4 x
string3 string5 string6"
,stringsAsFactors = F,header =T)
find_synonyms <- function(df, strings) {
sapply(strings, function(x) {
synonyms <- apply(df, 1, function(y) {
#you could also use match()
#grep() allows partial matching if needed
if(any(grepl(paste0("^",x,"$"), y))) {
y[1]
} else {
NA
}
})
synonyms[!(is.na(synonyms))]
})
}
find_synonyms_collapse_rows_grepl <- function(df, strings) {
synsets <- apply(df, 1, paste, collapse = " ")
names(synsets) <- df[,1]
sapply(strings, function(x) {
names(synsets)[grep(paste0("\\b", x ,"\\b"), synsets, perl=T)]
})
}
library(stringi)
find_synonyms_collapse_rows_stringi <- function(df, strings) {
synsets <- apply(df, 1, paste, collapse = " ")
names(synsets) <- df[,1]
sapply(strings, function(x) {
names(synsets)[stri_detect_regex(synsets, paste0("\\b", x ,"\\b"))]
})
}
library(data.table)
find_synonyms_DT_reshape_like <- function(df, strings) {
df <- as.data.table(df)
df[ , mainsynonym := col1]
df <- melt(df, id.vars = "mainsynonym")
setkey(df, value)
sapply(strings, function(x) {
df[value %like% x, mainsynonym]
})
}
find_synonyms_DT_matchkey <- function(df, strings) {
df <- as.data.table(df)
df[ , mainsynonym := col1]
df <- melt(df, id.vars = "mainsynonym")
setkey(df, value)
sapply(strings, function(x) {
df[value == x , mainsynonym]
})
}
results_list <- list(unlist(find_synonyms(df, strings)),
unlist(find_synonyms_collapse_rows_grepl(df, strings)),
unlist(find_synonyms_collapse_rows_stringi(df, strings)),
unlist(find_synonyms_DT_reshape_like(df, strings)),
unlist(find_synonyms_DT_matchkey(df, strings))
)
sapply(results_list, function(x) {
sapply(results_list, function(y) {
identical(x,y)
})
}
)
# [,1] [,2] [,3] [,4] [,5]
# [1,] TRUE TRUE TRUE TRUE TRUE
# [2,] TRUE TRUE TRUE TRUE TRUE
# [3,] TRUE TRUE TRUE TRUE TRUE
# [4,] TRUE TRUE TRUE TRUE TRUE
# [5,] TRUE TRUE TRUE TRUE TRUE
library(microbenchmark)
microbenchmark(
find_synonyms(df, strings),
find_synonyms_collapse_rows_grepl(df, strings),
find_synonyms_collapse_rows_stringi(df, strings),
find_synonyms_DT_reshape_like(df, strings),
find_synonyms_DT_matchkey(df, strings)
)
# Unit: microseconds
# expr min lq mean median uq max neval
# find_synonyms(df, strings) 719.624 848.5085 1129.4298 987.6565 1058.9080 9290.361 100
# find_synonyms_collapse_rows_grepl(df, strings) 660.017 738.1770 952.7571 794.8230 839.4295 16998.577 100
# find_synonyms_collapse_rows_stringi(df, strings) 223.428 265.8625 364.4979 302.9685 344.4170 5798.433 100
# find_synonyms3_DT_reshep_like(df, strings) 3259.029 3643.9060 3900.1955 3800.8180 4102.7985 5883.303 100
# find_synonyms_DT_matchkey(df, strings) 4710.135 4907.9040 5428.8650 5279.5595 5630.8855 8450.769 100