You can use gregexpr
with ^>.*|.{1,2}
to match either anything starting with >
or split by length 2 and use regmatches
to extract the matches.
# Match either a whole header line (anything starting with ">") or a chunk
# of one or two characters, then pull the matches out with regmatches().
regmatches(df$V1, gregexpr("^>.*|.{1,2}", df$V1)) |> unlist()
# [1] ">A1_[Er]" "aa" "aa" "bb" "bc" "cc"
# [7] "c" ">B2_[Br]" "dd" "dd" "ee" "ee"
#[13] "ef" "f" ">C3_[Gh]" "gg" "gg" "gg"
#[19] "gh" "hh" "hh" "ii" "ii" "ij"
#[25] "jj" "jj" "j"
Or use strsplit
with (?<=..)
to split a string into a fixed number of characters and use [<-
(or replace
) to insert the strings starting with >
.
# Blank out the header lines (those starting with ">") before splitting every
# remaining string after each pair of characters, then put the untouched
# header strings back in their original positions.
i <- which(startsWith(df$V1, ">"))
parts <- strsplit(replace(df$V1, i, ""), "(?<=..)", perl=TRUE)
parts[i] <- df$V1[i]
unlist(parts)
# [1] ">A1_[Er]" "aa" "aa" "bb" "bc" "cc"
# [7] "c" ">B2_[Br]" "dd" "dd" "ee" "ee"
#[13] "ef" "f" ">C3_[Gh]" "gg" "gg" "gg"
#[19] "gh" "hh" "hh" "ii" "ii" "ij"
#[25] "jj" "jj" "j"
Or using lapply
.
# Keep header lines (starting with ">") as-is; split everything else into
# two-character chunks. unlist() flattens the resulting mixed list.
unlist(lapply(df$V1, function(s) {
  if (startsWith(s, ">")) {
    s
  } else {
    strsplit(s, "(?<=..)", perl=TRUE)
  }
}))
# [1] ">A1_[Er]" "aa" "aa" "bb" "bc" "cc"
# [7] "c" ">B2_[Br]" "dd" "dd" "ee" "ee"
#[13] "ef" "f" ">C3_[Gh]" "gg" "gg" "gg"
#[19] "gh" "hh" "hh" "ii" "ii" "ij"
#[25] "jj" "jj" "j"
Splitting on [a-z]
.
# Split after every run of two lowercase letters; the header lines contain no
# two adjacent lowercase letters, so the lookbehind never fires inside them.
strsplit(df$V1, "(?<=[a-z]{2})", perl=TRUE) |> unlist()
# [1] ">A1_[Er]" "aa" "aa" "bb" "bc" "cc"
# [7] "c" ">B2_[Br]" "dd" "dd" "ee" "ee"
#[13] "ef" "f" ">C3_[Gh]" "gg" "gg" "gg"
#[19] "gh" "hh" "hh" "ii" "ii" "ij"
#[25] "jj" "jj" "j"
Benchmark
# Benchmark all candidate approaches on the same data frame `df`.
# bench::mark() checks by default that every expression returns the same
# result, so the tidyverse solutions were adapted to split on [a-z]
# (rather than excluding lines starting with ">") to match the output.
library(dplyr) #For akrun and Chris Ruehlemann
library(tidyr)
library(stringr)
bench::mark(
# Approach 1: single regex alternation, extract matches.
gregexpr = unlist(regmatches(df$V1, gregexpr("^>.*|.{1,2}", df$V1))),
# Approach 2: blank headers, split, re-insert headers via `[<-`.
strsplit = {i <- which(startsWith(df$V1, ">"))
unlist(`[<-`(strsplit(`[<-`(df$V1, i, ""), "(?<=..)", perl=TRUE), i, df$V1[i]))},
# Approach 3: per-element branch between keeping and splitting.
lapply = {unlist(lapply(df$V1, \(x)
if(startsWith(x, ">")) x else strsplit(x, "(?<=..)", perl=TRUE)))},
strsplitAZ = unlist(strsplit(df$V1, "(?<=[a-z]{2})", perl=TRUE)), #Splitting on [a-z] instead of excluding
ThomasIsCoding = {with(df, unlist( lapply(V1, function(x) {
if (startsWith(x, ">")) {x
} else { regmatches(x, gregexpr("\\w{1,2}", x)) } }))) },
"Chris Ruehlemann" = {df %>% #Splitting on [a-z] instead of excluding lines starting with >
mutate(V1 = ifelse(str_detect(V1, "[a-z]{2,}"),
str_extract_all(V1, "..?"),
V1)) %>%
unnest_longer(V1) %>% .$V1},
akrun = {df %>% #Splitting on [a-z] instead of excluding lines starting with >
mutate(V1 = str_replace_all(V1, "([a-z]{2})", "\\1,")) %>%
separate_longer_delim(V1, delim = ",") %>% .$V1}
)
Result
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl>
1 gregexpr 42.38µs 46.06µs 20692. 149.8KB 32.5 9546 15
2 strsplit 12.34µs 13.81µs 71254. 264B 14.3 9998 2
3 lapply 20.39µs 21.83µs 45172. 19KB 13.6 9997 3
4 strsplitAZ 10.16µs 10.86µs 91203. 264B 9.12 9999 1
5 ThomasIsCoding 85.22µs 91.28µs 10651. 29.8KB 21.3 5004 10
6 Chris Ruehlemann 3.16ms 3.22ms 308. 3.6MB 19.9 139 9
7 akrun 2.58ms 2.62ms 378. 254.1KB 19.8 172 9
In this case using strsplit
splitting on [a-z] is the fastest followed by strsplit
excluding those starting with >
. Both also use the lowest amount of memory of the compared methods.