data
has a column named description
of type character()
and a column id
of type integer()
that is set by row_number()
.
data_map
has a column name desc_map
of type character()
and a column id
of type integer()
that is set by row_number()
.
data
and data_map
do have other columns used in further processing after joining.
The idea of the code below is to use data_map$desc_map
as a pattern in str_detect
to match data$description
. On a match it would add a row to another tibble
, using data$id
and data_map$id
. The resulting matches
allows joining together of data
and data_map
.
library(tidyverse)
data = tribble(
~description,
"19ABB123456",
"19BCC123456",
"19CDD123456",
"19DEE123456",
"19EFF456789",
"19FF0056789",
"19A0A123456",
) %>% mutate(id = row_number())
data_map = tribble(
~desc_map,
"AA",
"BB",
"CC",
"DD",
"EE",
"FF",
"00",
) %>% mutate(id = row_number())
seq_along_rows <- function(.data) {
seq_len(nrow(.data))
}
matches <- data %>% (function (tbl) {
m <- tibble(
row_id = integer(),
map_id = integer()
)
for (i in seq_along_rows(tbl)) {
row <- tbl[i, ]
key <- row[["description"]]
found <- FALSE
for (j in seq_along_rows(data_map)) {
map_row <- data_map[j, ]
pattern <- map_row[["desc_map"]]
if (str_detect(key, pattern)) {
m <- add_row(m, row_id = row[["id"]], map_id = map_row[["id"]])
found <- TRUE
# allow for finding more than one match
}
}
if (!found) {
m <- add_row(m, row_id = row[["id"]], map_id = NA)
}
}
return(m)
})
not_unique <- matches %>%
group_by(row_id) %>%
filter(n() > 1) %>%
ungroup() %>%
inner_join(data, by = c("row_id" = "id")) %>%
inner_join(data_map, by = c("map_id" = "id"))
head(not_unique)
#> # A tibble: 2 x 4
#> row_id map_id description desc_map
#> <int> <int> <chr> <chr>
#> 1 6 6 19FF0056789 FF
#> 2 6 7 19FF0056789 00
matches_not_found <- matches %>%
filter(is.na(map_id)) %>%
select(-map_id) %>%
inner_join(data, by = c("row_id" = "id"))
head(matches_not_found)
#> # A tibble: 1 x 2
#> row_id description
#> <int> <chr>
#> 1 7 19A0A123456
matches_found <- matches %>%
filter(!is.na(map_id)) %>%
inner_join(data, by = c("row_id" = "id")) %>%
inner_join(data_map, by = c("map_id" = "id"))
head(matches_found)
#> # A tibble: 6 x 4
#> row_id map_id description desc_map
#> <int> <int> <chr> <chr>
#> 1 1 2 19ABB123456 BB
#> 2 2 3 19BCC123456 CC
#> 3 3 4 19CDD123456 DD
#> 4 4 5 19DEE123456 EE
#> 5 5 6 19EFF456789 FF
#> 6 6 6 19FF0056789 FF
My question is, can this code be written in a more tidy
functional way and what that would look like? If it cannot be done in such a way, what would the reason be?