It is quicker to do vectorised replacement than to apply a function to every row, particularly as the size of the data increases. As you want to match on the first word, you can use setNames()
to create a named vector of patterns and replacements. You can then do vectorised replacement with stringr::str_replace_all()
:
# Replace each name with its "name (degree)" form in one vectorised call.
# The lookup vector's names are the first word of each name_w_degree entry
# (used as regex patterns); its values are the full entries (replacements).
# NOTE(review): the patterns are unanchored regexes, so a first word that
# occurs inside a longer name would also be replaced - confirm that this
# cannot happen in the real data.
df$name <- local({
  lookup <- name_w_degree
  names(lookup) <- gsub("\\s.+", "", name_w_degree)
  stringr::str_replace_all(df$name, lookup)
})
df
# name value
# 1 Julie (Dr) 1
# 2 Helen (MD) 2
# 3 Faye 3
# 4 Faye 4
# 5 Helen (MD) 5
Benchmarks
It doesn't make much difference with small data frames, but as they grow this method becomes much quicker than the non-vectorised approaches. Its performance is similar to the answer by @ThomasIsCoding until around 800k rows, at which point that approach becomes significantly faster than this one. Both are much faster than approaches which are not vectorised (e.g. using sapply()
or map()
), presumably because of the overhead of calling a function many times.

Benchmark code
# Replication factors: for each value, df is stacked n times before timing.
n <- c(1, 10, 100, 1e3, 1e4, 1e5, 1e6)

# Run every candidate approach once per replication factor. Inside the
# braces, `n` is the single value currently being benchmarked.
results <- bench::press(
  n = n,
  {
    # Replicate df n times by repeating its row indices. This yields the same
    # rows in the same order as rbind()-ing n copies, without allocating and
    # binding a list of n separate data frames (very slow for large n).
    big_df <- df[rep(seq_len(nrow(df)), times = n), , drop = FALSE]
    # Row-index replication creates row names such as "1.1"; reset them to
    # the default sequence that rbind() would have produced.
    rownames(big_df) <- NULL

    bench::mark(
      min_iterations = 1,
      max_iterations = 100,
      # The candidates return different shapes (data frames vs. vectors), so
      # result-equality checking is switched off.
      check = FALSE,
      rowwise = {
        big_df %>%
          rowwise() %>%
          mutate(name = name_w_degree[grepl(name, name_w_degree)])
      },
      base_sapply = {
        sapply(big_df$name, function(x) {
          name_w_degree[which(grepl(x, name_w_degree))]
        })
      },
      purrr_map_chr = {
        big_df %>%
          mutate(name = map_chr(name, ~ grep(pattern = ., x = name_w_degree, value = TRUE)))
      },
      stringr_replace_all = {
        stringr::str_replace_all(big_df$name, setNames(name_w_degree, gsub("\\s.+", "", name_w_degree)))
      },
      base_transform = {
        transform(big_df, name = name_w_degree[match(name, sub("\\W+.*", "", name_w_degree))])
      }
    )
  }
)
Code to generate plot
library(ggplot2)

# Plot the median run time of each approach against the number of rows.
results |>
  transmute(
    # bench stores each expression's label in its "description" attribute.
    expression = attr(expression, "description"),
    # Convert the replication factor into a row count; 5 is the number of
    # rows in the example df shown above (adjust if df changes).
    n = n * 5,
    median
  ) |>
  ggplot(aes(x = n, y = median, group = expression)) +
  # `linewidth` replaces `size` for line geoms as of ggplot2 3.4.0; using
  # `size` here triggers a deprecation warning.
  geom_line(aes(color = expression), linewidth = 1) +
  geom_point(aes(color = expression), size = 2) +
  # One axis break per benchmarked replication factor (global vector `n`).
  scale_x_log10(n.breaks = length(n)) +
  theme_bw() +
  theme(
    legend.position = "bottom"
  ) +
  labs(
    title = "Comparison of results",
    x = "Number of rows",
    y = "Median time to run (seconds)"
  )