Split string of characters contained in a row of a data frame by a fixed number of characters and store the resultant fragments in subsequent rows

Question

I have the following data frame:

# Example input in FASTA layout: each header line (starting with ">") is
# followed by one sequence line. Everything lives in the single column V1.
df <- data.frame(
  V1 = c(
    ">A1_[Er]", "aaaabbbcccc",
    ">B2_[Br]", "ddddeeeeeff",
    ">C3_[Gh]", "ggggggghhhhhiiiiijjjjjj"
  )
)

I want to split the strings into fragments of a fixed number of characters (two for the purpose of this particular question) and place the fragments in new rows. Rows containing strings that start with the ">" sign should be excluded from the splitting and kept unchanged. The resultant data frame should look like this:

# Expected result: header lines (starting with ">") are kept whole, every
# sequence line is cut into fragments of two characters, and a leftover single
# character forms the final fragment of its sequence.
# The third sequence has 23 characters, so its last fragment is "j".
df1 <- data.frame(V1 = c(">A1_[Er]", "aa", "aa", "bb", "bc", "cc", "c", 
                         ">B2_[Br]", "dd", "dd", "ee", "ee", "ef", "f",
                         ">C3_[Gh]", "gg", "gg", "gg", "gh", "hh", "hh", "ii", "ii", "ij", "jj", "jj", "j"))

I have tried using the separate_longer_position() function on a subsetted df like this:

# Attempt: keep only the rows whose V1 does not match ">" (presumably
# data.table's %like% operator -- confirm), then split V1 into pieces of
# width 2, one piece per row. Because subset() has already dropped the
# ">" header rows, they cannot appear in the result.
separate_longer_position(subset(df, !df$V1 %like% ">"), V1, 2)

My approach did indeed chop up the desired strings, but also left the rows containing the strings starting with ">" out from the resultant data frame.

On a side note, this is indeed the FASTA format, but for educational purposes, I don't want to use dedicated packages like Biostrings to solve this.

Please advise.

See also: [Chopping a string into a vector of fixed width character elements](https://stackoverflow.com/questions/2247045/) — GKi, Mar 28 '23 at 12:09
Is it also fine to split only `[a-z]` instead of excluding those strings starting with `>`? — GKi, Apr 05 '23 at 08:51

score 3 · Accepted Answer · answered Mar 28 '23 at 11:41

You can try regmatches

# Build the long data frame: header lines (starting with ">") are kept whole,
# every other line is cut into consecutive fragments of at most two characters.
df1 <-
  data.frame(V1 = with(
    df,
    unlist(
      lapply(
        V1,
        function(x) {
          if (startsWith(x, ">")) {
            # Header line: pass through unchanged.
            x
          } else {
            # ".{1,2}" takes any two characters (or the single one left at the
            # end). Unlike "\\w{1,2}", it does not drop non-word symbols such
            # as "-" or "*", which would shift all following fragments.
            regmatches(x, gregexpr(".{1,2}", x))[[1]]
          }
        }
      )
    )
  ))

and obtain

> df1
         V1
1  >A1_[Er]
2        aa
3        aa
4        bb
5        bc
6        cc
7         c
8  >B2_[Br]
9        dd
10       dd
11       ee
12       ee
13       ef
14        f
15 >C3_[Gh]
16       gg
17       gg
18       gg
19       gh
20       hh
21       hh
22       ii
23       ii
24       ij
25       jj
26       jj
27        j

GKi · Answer 2 · 2023-04-05T09:23:36.897

You can use gregexpr with the pattern ^>.*|.{1,2} to match either an entire string starting with > or chunks of up to two characters, and use regmatches to extract the matches.

# A header (a string beginning with ">") is matched whole by the first
# alternative; any other string is consumed in chunks of up to two characters.
unlist(
  regmatches(
    x = df$V1,
    m = gregexpr(pattern = "^>.*|.{1,2}", text = df$V1)
  )
)
# [1] ">A1_[Er]" "aa"       "aa"       "bb"       "bc"       "cc"      
# [7] "c"        ">B2_[Br]" "dd"       "dd"       "ee"       "ee"      
#[13] "ef"       "f"        ">C3_[Gh]" "gg"       "gg"       "gg"      
#[19] "gh"       "hh"       "hh"       "ii"       "ii"       "ij"      
#[25] "jj"       "jj"       "j"

Or use strsplit with (?<=..) to split a string into a fixed number of characters and use [<- (or replace) to insert the strings starting with >.

# Positions of the header lines (those starting with ">").
i <- which(startsWith(df$V1, ">"))
# Blank the headers out so that strsplit() has nothing to cut there, split the
# remaining strings after every two characters, then put the original headers
# back at their positions before flattening.
unlist(
  replace(
    strsplit(replace(df$V1, i, ""), "(?<=..)", perl = TRUE),
    i,
    df$V1[i]
  )
)
# [1] ">A1_[Er]" "aa"       "aa"       "bb"       "bc"       "cc"      
# [7] "c"        ">B2_[Br]" "dd"       "dd"       "ee"       "ee"      
#[13] "ef"       "f"        ">C3_[Gh]" "gg"       "gg"       "gg"      
#[19] "gh"       "hh"       "hh"       "ii"       "ii"       "ij"      
#[25] "jj"       "jj"       "j"

Or using lapply.

# Handle one string at a time: headers are returned untouched, everything else
# is split after every two characters; unlist() flattens the pieces.
unlist(
  lapply(df$V1, function(line) {
    if (startsWith(line, ">")) {
      return(line)
    }
    strsplit(line, "(?<=..)", perl = TRUE)
  })
)
# [1] ">A1_[Er]" "aa"       "aa"       "bb"       "bc"       "cc"      
# [7] "c"        ">B2_[Br]" "dd"       "dd"       "ee"       "ee"      
#[13] "ef"       "f"        ">C3_[Gh]" "gg"       "gg"       "gg"      
#[19] "gh"       "hh"       "hh"       "ii"       "ii"       "ij"      
#[25] "jj"       "jj"       "j"

Splitting on [a-z].

# Split only behind two consecutive lower-case letters. The headers in this
# data never contain such a pair, so they come through in one piece.
unlist(
  strsplit(
    x = df[["V1"]],
    split = "(?<=[a-z]{2})",
    perl = TRUE
  )
)
# [1] ">A1_[Er]" "aa"       "aa"       "bb"       "bc"       "cc"      
# [7] "c"        ">B2_[Br]" "dd"       "dd"       "ee"       "ee"      
#[13] "ef"       "f"        ">C3_[Gh]" "gg"       "gg"       "gg"      
#[19] "gh"       "hh"       "hh"       "ii"       "ii"       "ij"      
#[25] "jj"       "jj"       "j"

Benchmark

library(dplyr)  #For akrun and Chris Ruehlemann
library(tidyr)
library(stringr)

# Benchmark every approach from the answers on the same `df`. Each named
# argument is one candidate expression; the name (method or answer author)
# becomes its label in the result table.
bench::mark(
# Base R: one regex that matches whole headers or chunks of up to two chars.
gregexpr = unlist(regmatches(df$V1, gregexpr("^>.*|.{1,2}", df$V1))),
# Base R: blank out headers, split after every two chars, re-insert headers.
strsplit = {i <- which(startsWith(df$V1, ">"))
  unlist(`[<-`(strsplit(`[<-`(df$V1, i, ""), "(?<=..)", perl=TRUE), i, df$V1[i]))},
# Base R: per-element branch on whether the string is a header.
lapply = {unlist(lapply(df$V1, \(x)
  if(startsWith(x, ">")) x else strsplit(x, "(?<=..)", perl=TRUE)))},
strsplitAZ = unlist(strsplit(df$V1, "(?<=[a-z]{2})", perl=TRUE)), #Splitting [a-z] instead of excluding
# Accepted answer, without the data.frame() wrapper.
ThomasIsCoding = {with(df, unlist( lapply(V1, function(x) {
          if (startsWith(x, ">")) {x
          } else { regmatches(x, gregexpr("\\w{1,2}", x)) } }))) },
# Tidyverse answers: `.$V1` extracts the V1 column from the piped result.
"Chris Ruehlemann" = {df %>%  #Splitting [a-z] instead of excluding starting with >
  mutate(V1 = ifelse(str_detect(V1, "[a-z]{2,}"),
                     str_extract_all(V1, "..?"),
                     V1)) %>%
  unnest_longer(V1) %>% .$V1},
akrun = {df %>%  #Splitting [a-z] instead of excluding starting with >
  mutate(V1 = str_replace_all(V1, "([a-z]{2})", "\\1,")) %>% 
  separate_longer_delim(V1, delim = ",")  %>% .$V1}
)

Result

  expression            min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc
  <bch:expr>       <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>
1 gregexpr          42.38µs  46.06µs    20692.   149.8KB    32.5   9546    15
2 strsplit          12.34µs  13.81µs    71254.      264B    14.3   9998     2
3 lapply            20.39µs  21.83µs    45172.      19KB    13.6   9997     3
4 strsplitAZ        10.16µs  10.86µs    91203.      264B     9.12  9999     1
5 ThomasIsCoding    85.22µs  91.28µs    10651.    29.8KB    21.3   5004    10
6 Chris Ruehlemann   3.16ms   3.22ms      308.     3.6MB    19.9    139     9
7 akrun              2.58ms   2.62ms      378.   254.1KB    19.8    172     9

In this case, using strsplit splitting on [a-z] is the fastest, followed by strsplit excluding those starting with >. Both use the lowest amount of memory of the compared methods.

Chris Ruehlemann · Answer 3 · 2023-03-28T15:39:44.307

Although I may be late to the party here's a tidyverse solution that may be worth considering:

library(tidyverse)
df %>%
  # if `V1` contains a run of at least two consecutive lower-case letters
  # (true for the sequence rows here, false for the ">" headers)...
  mutate(V1 = ifelse(str_detect(V1, "[a-z]{2,}"),
                     # ...extract its characters in chunks of two (a single
                     # trailing character is kept as its own chunk),...
                     str_extract_all(V1, "..?"),
                     # ...else, leave `V1` as-is:
                     V1)) %>%
  # `V1` is now a list column; give every list element its own row:
  unnest_longer(V1)
# A tibble: 24 × 1
   V1      
   <chr>   
 1 >A1_[Er]
 2 aa      
 3 bb      
 4 bc      
 5 cc      
 6 c       
 7 >B2_[Br]
 8 dd      
 9 ee      
10 ee      
# … with 14 more rows

score 2 · Answer 4 · answered Mar 28 '23 at 16:55

We can use tidyverse as

library(dplyr)
library(tidyr)
library(stringr)
# Insert a comma after each pair of lower-case letters, then split on the
# commas so that every fragment lands in its own row. Headers contain no such
# pair in this data and therefore stay in one piece.
df %>%
  # The lookahead `(?=.)` skips a pair at the very end of a string; without it
  # an even-length sequence would end in "," and gain an empty extra row.
  mutate(V1 = str_replace_all(V1, "([a-z]{2})(?=.)", "\\1,")) %>%
  separate_longer_delim(V1, delim = ",")

-output

        V1
1  >A1_[Er]
2        aa
3        aa
4        bb
5        bc
6        cc
7         c
8  >B2_[Br]
9        dd
10       dd
11       ee
12       ee
13       ef
14        f
15 >C3_[Gh]
16       gg
17       gg
18       gg
19       gh
20       hh
21       hh
22       ii
23       ii
24       ij
25       jj
26       jj
27        j

Split string of characters contained in a row of a data frame by a fixed number of characters and store the resultant fragments in subsequent rows

4 Answers