Selecting n cells below matched column cell sequence

Question

I need to select n cells below the matched column sequence. Let's assume n = 2 and the specific column sequence is A, B and C. After this sequence, I would like to pick up two more cells below this sequence. My table is:

Table1 <- data.frame(ID=rep(c(1 ,2  ,3  ,4  ,5  ,6  ,7  ,8  ,9  ,10 ,11 ,12 ,13 ,14 ,15 ,16 ,17 ,18 ,19 ,20 ,21 ,22 ,23 ,24 ,25 ,26 ,27 ,28 ,29 ,30 ,31)), Sequence=rep(c("A",  "B",    "D",    "E",    "A",    "B",    "C",    "f",    "n",    "p",    "C",    "D",    "D",    "E",    "A",    "B",    "C",    "z",    "t",    "g",    "A",    "C",    "D",    "A",    "B",    "C",    "p",    "l",    "x",    "v",    "A")))

I like to have this Table:

Table2 <- data.frame(ID=rep(c(1 ,2  ,3  ,4  ,5  ,6  ,7  ,8  ,9  ,10 ,11 ,12 ,13 ,14 ,15 ,16 ,17 ,18 ,19 ,20 ,21 ,22 ,23 ,24 ,25 ,26 ,27 ,28 ,29 ,30 ,31)), Sequence=rep(c("A",  "B",    "D",    "E",    "A",    "B",    "C",    "f",    "n",    "p",    "C",    "D",    "D",    "E",    "A",    "B",    "C",    "z",    "t",    "g",    "A",    "C",    "D",    "A",    "B",    "C",    "p",    "l",    "x",    "v",    "A")), Selected=rep(c("",   "", "", "", "A",    "B",    "C",    "f",    "n",    "", "", "", "", "", "A",    "B",    "C",    "z",    "t",    "", "", "", "", "A",    "B",    "C",    "p",    "l",    "", "", "")))

Can you please help me with this?

No, They are words. Small letters are actually numbers. Is it a problem? — Farshid Owrang, Sep 06 '21 at 12:45

score 3 · Answer 1 · answered Sep 06 '21 at 13:24

Another solution is to subset based on the pattern and merge to create your resulting column, i.e.

library(dplyr)
idx <- which(Table1$Sequence == "A" & lead(Table1$Sequence) == "B" & lead(Table1$Sequence, n = 2) == "C")
idx <- c(sapply(idx, function(i)seq(i, i+4)))
merge(Table1, Table1[idx,], by = 'ID', all = TRUE)

   ID Sequence.x Sequence.y
1   1          A       <NA>
2   2          B       <NA>
3   3          D       <NA>
4   4          E       <NA>
5   5          A          A
6   6          B          B
7   7          C          C
8   8          f          f
9   9          n          n
10 10          p       <NA>
11 11          C       <NA>
12 12          D       <NA>
13 13          D       <NA>
14 14          E       <NA>
15 15          A          A
16 16          B          B
17 17          C          C
18 18          z          z
19 19          t          t
20 20          g       <NA>
21 21          A       <NA>
22 22          C       <NA>
23 23          D       <NA>
24 24          A          A
25 25          B          B
26 26          C          C
27 27          p          p
28 28          l          l
29 29          x       <NA>
30 30          v       <NA>
31 31          A       <NA>

score 3 · Answer 2 · answered Sep 06 '21 at 13:27

Using base split and cumsum:

x = c("A", "B", "C")
do.call(rbind, 
        lapply(split(Table1, cumsum(Table1$Sequence == x[ 1 ])),
               function(i){
                 s <- sum(i$Sequence[1:3] == x)
                 if(is.na(s) | s < 3){
                   cbind(i, res = NA)
                 } else { cbind(i, res = c(i$Sequence[1:5], rep(NA, nrow(i) - 5))) }
               }))

Output:

#      ID Sequence  res
# 1.1   1        A <NA>
# 1.2   2        B <NA>
# 1.3   3        D <NA>
# 1.4   4        E <NA>
# 2.5   5        A    A
# 2.6   6        B    B
# 2.7   7        C    C
# 2.8   8        f    f
# 2.9   9        n    n
# 2.10 10        p <NA>
# 2.11 11        C <NA>
# 2.12 12        D <NA>
# 2.13 13        D <NA>
# 2.14 14        E <NA>
# 3.15 15        A    A
# 3.16 16        B    B
# 3.17 17        C    C
# 3.18 18        z    z
# 3.19 19        t    t
# 3.20 20        g <NA>
# 4.21 21        A <NA>
# 4.22 22        C <NA>
# 4.23 23        D <NA>
# 5.24 24        A    A
# 5.25 25        B    B
# 5.26 26        C    C
# 5.27 27        p    p
# 5.28 28        l    l
# 5.29 29        x <NA>
# 5.30 30        v <NA>
# 6    31        A <NA>

You could as well choose for `""` in place of `NA` just for a better looking output. — Anoushiravan R, Sep 06 '21 at 13:35
@AnoushiravanR Thanks, I prefer NA for missing/no-data. OP can amend as per their need. Solution logic won't change. — zx8754, Sep 06 '21 at 13:37

score 3 · Answer 3 · answered Sep 06 '21 at 13:33

This could be an alternative way in base R:

within(Table1, {
  Res <- rep("", nrow(Table1))
  Res[rep(unlist(gregexpr("ABC.{2}", paste0(Sequence, collapse = ""), perl = TRUE)), each = 5) + 0:4] <- 
    Sequence[rep(unlist(gregexpr("ABC.{2}", paste0(Sequence, collapse = ""), perl = TRUE)), each = 5) + 0:4]
})

   ID Sequence Res
1   1        A    
2   2        B    
3   3        D    
4   4        E    
5   5        A   A
6   6        B   B
7   7        C   C
8   8        f   f
9   9        n   n
10 10        p    
11 11        C    
12 12        D    
13 13        D    
14 14        E    
15 15        A   A
16 16        B   B
17 17        C   C
18 18        z   z
19 19        t   t
20 20        g    
21 21        A    
22 22        C    
23 23        D    
24 24        A   A
25 25        B   B
26 26        C   C
27 27        p   p
28 28        l   l
29 29        x    
30 30        v    
31 31        A

score 2 · Accepted Answer · answered Sep 06 '21 at 12:46

n=2
vec <- rep(NA,nrow(df) )

for (k in 1:(nrow(df)-3)) {
  print(k)
  if (df$Sequence[k]=='A'& df$Sequence[k+1]=='B'&df$Sequence[k+2]=='C') {
    vec[k]='A'
    vec[k+1]='B'
    vec[k+2]='C'
    
    for(m in 0:(n-1)){
      
      vec[k+3+m] <- df$Sequence[k+3+m]
    }
      
  }
  
}
df$check <- vec

df[is.na(df)] <- " "

output

> df
   ID Sequence check
1   1        A      
2   2        B      
3   3        D      
4   4        E      
5   s        A     A
6   6        B     B
7   7        C     C
8   8        f     f
9   9        n     n
10 10        p      
11 11        C      
12 12        D      
13 13        D      
14 14        E      
15 15        A     A
16 16        B     B
17 17        C     C
18 18        z     z
19 19        t     t
20 20        g      
21 21        A      
22 22        C      
23 23        D      
24 24        A     A
25 25        B     B
26 26        C     C
27 27        p     p
28 28        I     I
29 29        x      
30 30        v      
31 31        A

score 2 · Answer 5 · answered Sep 06 '21 at 13:07

Here's a way using rolling operation from zoo. I have kept the intermediary columns so that it is easier to understand what is going on.

library(dplyr)
library(zoo)

n <- 2
pattern <- c('A', 'B', 'C')

Table1 %>%
  mutate(start_pattern = rollapply(Sequence, length(pattern), function(x) all(x %in% pattern), fill = FALSE, align = 'left'), 
         keep_values = rollapplyr(start_pattern, length(pattern) + n, any, fill = FALSE), 
         Selected = replace(Sequence, !keep_values, ''))

This returns -

#   ID Sequence start_pattern keep_values Selected
#1   1        A         FALSE       FALSE         
#2   2        B         FALSE       FALSE         
#3   3        D         FALSE       FALSE         
#4   4        E         FALSE       FALSE         
#5   5        A          TRUE        TRUE        A
#6   6        B         FALSE        TRUE        B
#7   7        C         FALSE        TRUE        C
#8   8        f         FALSE        TRUE        f
#9   9        n         FALSE        TRUE        n
#10 10        p         FALSE       FALSE         
#11 11        C         FALSE       FALSE         
#12 12        D         FALSE       FALSE         
#13 13        D         FALSE       FALSE         
#14 14        E         FALSE       FALSE         
#15 15        A          TRUE        TRUE        A
#16 16        B         FALSE        TRUE        B
#17 17        C         FALSE        TRUE        C
#18 18        z         FALSE        TRUE        z
#19 19        t         FALSE        TRUE        t
#20 20        g         FALSE       FALSE         
#21 21        A         FALSE       FALSE         
#22 22        C         FALSE       FALSE         
#23 23        D         FALSE       FALSE         
#24 24        A          TRUE        TRUE        A
#25 25        B         FALSE        TRUE        B
#26 26        C         FALSE        TRUE        C
#27 27        p         FALSE        TRUE        p
#28 28        l         FALSE        TRUE        l
#29 29        x         FALSE       FALSE         
#30 30        v         FALSE       FALSE         
#31 31        A         FALSE       FALSE

start_pattern return TRUE when it finds the start of the pattern that we are looking for (in this case c('A', 'B', 'C')). keep_values would return TRUE for all the start_pattern values + n positions.

score 2 · Answer 6 · answered Sep 06 '21 at 13:18

A possible tidyverse way could be

library(dplyr)
library(tidyr)

spec_seq <- c("A", "B", "C")

Table1 %>% 
  group_by(grp = cumsum(Sequence == spec_seq[1] & 
                          lead(Sequence) == spec_seq[2] &
                          lead(Sequence, n = 2) == spec_seq[3])) %>% 
  
  filter(grp > 0) %>%
  slice_head(n = 5) %>% 
  ungroup() %>% 
  right_join(Table1, by = "ID", suffix = c(".y", "")) %>% 
  arrange(ID) %>% 
  select(ID, Sequence, Selected = Sequence.y)

returning

# A tibble: 31 x 3
      ID Sequence Selected
   <dbl> <chr>    <chr>   
 1     1 A        NA      
 2     2 B        NA      
 3     3 D        NA      
 4     4 E        NA      
 5     5 A        A       
 6     6 B        B       
 7     7 C        C       
 8     8 f        f       
 9     9 n        n       
10    10 p        NA      
11    11 C        NA      
12    12 D        NA      
13    13 D        NA      
14    14 E        NA      
15    15 A        A       
16    16 B        B       
17    17 C        C       
18    18 z        z       
19    19 t        t       
20    20 g        NA      
21    21 A        NA      
22    22 C        NA      
23    23 D        NA      
24    24 A        A       
25    25 B        B       
26    26 C        C       
27    27 p        p       
28    28 l        l       
29    29 x        NA      
30    30 v        NA      
31    31 A        NA

If you don't want any NA you could pipe an additional replace_na(list(Selected = "")).

score 1 · Answer 7 · answered Sep 06 '21 at 14:41

Using rollapply from zoo, you can compare a sequence of values in Sequence with a given vector vec and see if identical. This will return a vector of positions idx corresponding to the first row of the detected pattern. From this, you can create a vector containing all positions to be selected from Sequence.

library(zoo)

vec <- c("A", "B", "C")
n <- 2
Table1$Selected <- ""

idx <- which(rollapply(Table1$Sequence, length(vec), identical, vec))
idx_4 <- unlist(Map(seq, idx, idx + length(vec) + n - 1))
Table1$Selected[idx_4] <- Table1$Sequence[idx_4]
Table1

Output

   ID Sequence Selected
1   1        A         
2   2        B         
3   3        D         
4   4        E         
5   5        A        A
6   6        B        B
7   7        C        C
8   8        f        f
9   9        n        n
10 10        p         
11 11        C         
12 12        D         
13 13        D         
14 14        E         
15 15        A        A
16 16        B        B
17 17        C        C
18 18        z        z
19 19        t        t
20 20        g         
21 21        A         
22 22        C         
23 23        D         
24 24        A        A
25 25        B        B
26 26        C        C
27 27        p        p
28 28        l        l
29 29        x         
30 30        v         
31 31        A

Selecting n cells below matched column cell sequence

7 Answers7