Benchmarking
The reproducible example was given by @BluVoxe. We compare the output and the running times of @BluVoxe's solution and both of @zx8754's solutions:
library(dplyr)
library(purrr)
library(microbenchmark)
# Seed the random number generator so the sampled pairs below are reproducible
set.seed(1)
# Build a 10-row test tibble of species pairs drawn (with replacement) from
# the letters "a" to "e", then print it.
# `Sp_1` is sampled before `Sp_2`, so the random stream is consumed in a
# fixed order.
data <- tibble(
  Sp_1 = sample(c("a", "b", "c", "d", "e"), size = 10, replace = TRUE),
  Sp_2 = sample(c("a", "b", "c", "d", "e"), size = 10, replace = TRUE)
)
data
# # A tibble: 10 x 2
# Sp_1 Sp_2
# <chr> <chr>
# 1 a e
# 2 d e
# 3 a b
# 4 b b
# 5 e a
# 6 c e
# 7 b e
# 8 c a
# 9 c a
# 10 a e
# First solution: number each unordered pair by order of first appearance
data1 <- data %>%
  # Row by row, sort the two species and glue them together, so that
  # (a, e) and (e, a) both yield the key "ae"
  rowwise() %>%
  mutate(string = paste(sort(c(Sp_1, Sp_2)), collapse = "")) %>%
  ungroup() %>%
  # For every key, look up its position among the distinct keys of the column
  mutate(id = map_int(string, function(key) which(unique(string) == key)))
data1
# # A tibble: 10 x 4
# Sp_1 Sp_2 string id
# <chr> <chr> <chr> <int>
# 1 a e ae 1
# 2 d e de 2
# 3 a b ab 3
# 4 b b bb 4
# 5 e a ae 1
# 6 c e ce 5
# 7 b e be 6
# 8 c a ac 7
# 9 c a ac 7
# 10 a e ae 1
# Second solution: vectorised pair key, numbered through factor codes
data2 <- mutate(
  data,
  # Elementwise min/max puts each pair in a canonical order without a row loop
  id1 = paste0(pmin(Sp_1, Sp_2), pmax(Sp_1, Sp_2)),
  # Integer codes of the factor built from `id1`
  id2 = as.integer(as.factor(id1))
)
data2
# # A tibble: 10 x 4
# Sp_1 Sp_2 id1 id2
# <chr> <chr> <chr> <int>
# 1 a e ae 3
# 2 d e de 7
# 3 a b ab 1
# 4 b b bb 4
# 5 e a ae 3
# 6 c e ce 6
# 7 b e be 5
# 8 c a ac 2
# 9 c a ac 2
# 10 a e ae 3
# Third solution: same pair key and factor codes as the second solution,
# computed in a single expression with base R's transform()
data3 <- transform(
  data,
  id = as.integer(as.factor(paste0(pmin(Sp_1, Sp_2), pmax(Sp_1, Sp_2))))
)
data3
# Sp_1 Sp_2 id
# 1 a e 3
# 2 d e 7
# 3 a b 1
# 4 b b 4
# 5 e a 3
# 6 c e 6
# 7 b e 5
# 8 c a 2
# 9 c a 2
# 10 a e 3
# Compare efficiency of the three solutions on the 10-row dataset.
# Timings are requested as ratios (unit = "relative") rather than absolute times.
microbenchmark::microbenchmark(
# x1: first solution (row-wise sorted key + map_int() lookup)
x1 = {
data%>%
dplyr::rowwise() %>%
dplyr::mutate(string = paste(sort(c(Sp_1, Sp_2)), collapse = "")) %>%
dplyr::ungroup() %>%
dplyr::mutate(id = purrr::map_int(string, ~which(unique(string) == .)))
},
# x2: second solution, written as a single `id` column
# (the intermediate `id1` column is not kept here)
x2 = {
data %>%
dplyr::mutate(id = as.integer(as.factor(
paste0(pmin(Sp_1, Sp_2), pmax(Sp_1, Sp_2)))))
},
# x3: third solution (same expression as x2, via base transform())
x3 = {
transform(data,
id = as.integer(as.factor(
paste0(pmin(Sp_1, Sp_2), pmax(Sp_1, Sp_2)))))
},
unit = "relative")
#Unit: relative
#expr min lq mean median uq max neval
# x1 23.329340 24.151001 23.951911 23.710270 22.996736 28.23673 100
#x2 8.064332 7.785381 8.214726 7.796895 7.741803 19.18936 100
#x3 1.000000 1.000000 1.000000 1.000000 1.000000 1.00000 100
# With bigger data: same seed and same five letters, but 10000 rows instead of 10.
# Note that this overwrites the 10-row `data` defined earlier.
set.seed(1)
data <- tibble(
Sp_1 = sample(letters[1:5], 10000, replace = TRUE),
Sp_2 = sample(letters[1:5], 10000, replace = TRUE)
)
# Re-run the same three expressions on the larger dataset, again with
# timings requested as ratios (unit = "relative").
microbenchmark::microbenchmark(
# x1: first solution (row-wise sorted key + map_int() lookup)
x1 = {
data%>%
dplyr::rowwise() %>%
dplyr::mutate(string = paste(sort(c(Sp_1, Sp_2)), collapse = "")) %>%
dplyr::ungroup() %>%
dplyr::mutate(id = purrr::map_int(string, ~which(unique(string) == .)))
},
# x2: second solution, written as a single `id` column
x2 = {
data %>%
dplyr::mutate(id = as.integer(as.factor(
paste0(pmin(Sp_1, Sp_2), pmax(Sp_1, Sp_2)))))
},
# x3: third solution (same expression as x2, via base transform())
x3 = {
transform(data,
id = as.integer(as.factor(
paste0(pmin(Sp_1, Sp_2), pmax(Sp_1, Sp_2)))))
}, unit = "relative")
#Unit: relative
#expr min lq mean median uq max neval
#x1 524.626924 512.590748 506.051098 515.687843 521.642359 418.635195 100
#x2 1.503782 1.514021 1.577941 1.559449 1.620967 1.648478 100
#x3 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 100
The last solution is the most efficient.
However, there are differences in how the IDs are numbered: the first solution assigns IDs in the order in which the pairs first appear in the dataset, while the second and third solutions assign IDs in the alphabetical order of the pairs (both produce the same IDs, as the outputs above show).