1

I am trying to extract both a letter (should be K or Y) and all digits between that letter and the pattern (XO44_TMT6) and put extracted values in two separate columns (Mod.residue and Mod.position.in.pep), but failed to get what I want.

Below are my code and data frame. Can anyone explain why my code failed and how to fix it?

Thanks so much!

My data frame:

structure(list(Modifications = c("Y9(XO44_TMT6)", "Y9(XO44_TMT6)", 
"Y9(XO44_TMT6)", "Y9(XO44_TMT6)", "Y9(XO44_TMT6)", "Y9(XO44_TMT6)", 
"Y9(XO44_TMT6)", "Y8(XO44_TMT6)", "Y8(XO44_TMT6)", "Y8(XO44_TMT6)", 
"Y8(XO44_TMT6)", "Y8(XO44_TMT6)", "Y8(XO44_TMT6)", "Y8(XO44_TMT6)", 
"Y8(XO44_TMT6)", "Y8(XO44_TMT6)", "Y8(XO44_TMT6)", "Y8(XO44_TMT6)", 
"Y8(XO44_TMT6)", "Y8(XO44_TMT6)", "Y8(XO44_TMT6)", "Y8(XO44_TMT6)", 
"Y8(XO44_TMT6)", "Y7(XO44_TMT6); M9(Oxidation)", "Y7(XO44_TMT6); M8(Oxidation)", 
"Y7(XO44_TMT6); M8(Oxidation)", "Y7(XO44_TMT6); C9(Carbamidomethyl); C18(Carbamidomethyl)", 
"Y7(XO44_TMT6); C15(Carbamidomethyl)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)", 
"Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)", 
"Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)", 
"Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)", 
"Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)", 
"Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)", 
"Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)", "Y7(XO44_TMT6)", 
"Y6(XO44_TMT6); C23(Carbamidomethyl)", "Y6(XO44_TMT6); C12(Carbamidomethyl)", 
"Y6(XO44_TMT6); C12(Carbamidomethyl)", "Y6(XO44_TMT6)", "Y6(XO44_TMT6)", 
"Y6(XO44_TMT6)", "Y6(XO44_TMT6)", "Y6(XO44_TMT6)", "Y6(XO44_TMT6)", 
"Y6(XO44_TMT6)", "Y6(XO44_TMT6)", "Y6(XO44_TMT6)", "Y6(XO44_TMT6)", 
"Y6(XO44_TMT6)", "Y6(XO44_TMT6)", "Y5(XO44_TMT6)", "Y5(XO44_TMT6)", 
"Y5(XO44_TMT6)", "Y5(XO44_TMT6)", "Y5(XO44_TMT6)", "Y5(XO44_TMT6)", 
"Y5(XO44_TMT6)", "Y5(XO44_TMT6)", "Y5(XO44_TMT6)", "Y5(XO44_TMT6)", 
"Y5(XO44_TMT6)", "Y5(XO44_TMT6)", "Y5(XO44_TMT6)", "Y5(XO44_TMT6)", 
"Y5(XO44_TMT6)", "Y5(XO44_TMT6)", "Y5(XO44_TMT6)", "Y4(XO44_TMT6); C29(Carbamidomethyl)", 
"Y4(XO44_TMT6); C13(Carbamidomethyl)", "Y4(XO44_TMT6)", "Y4(XO44_TMT6)", 
"Y4(XO44_TMT6)", "Y4(XO44_TMT6)", "Y4(XO44_TMT6)", "Y4(XO44_TMT6)", 
"Y4(XO44_TMT6)", "Y4(XO44_TMT6)", "Y4(XO44_TMT6)", "Y4(XO44_TMT6)", 
"Y4(XO44_TMT6)", "Y3(XO44_TMT6); M5(Oxidation)", "Y3(XO44_TMT6); C11(Carbamidomethyl)", 
"Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)", 
"Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)", 
"Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)", 
"Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)", 
"Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)", 
"Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)", 
"Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)", "Y3(XO44_TMT6)", 
"Y29(XO44_TMT6)", "Y25(XO44_TMT6)", "Y25(XO44_TMT6)", "Y25(XO44_TMT6)", 
"Y23(XO44_TMT6)", "Y22(XO44_TMT6)", "Y22(XO44_TMT6)", "Y22(XO44_TMT6)", 
"Y22(XO44_TMT6)", "Y22(XO44_TMT6)", "Y22(XO44_TMT6)", "Y22(XO44_TMT6)", 
"Y21(XO44_TMT6)", "Y20(XO44_TMT6)", "Y20(XO44_TMT6)", "Y20(XO44_TMT6)", 
"Y20(XO44_TMT6)", "Y20(XO44_TMT6)", "Y20(XO44_TMT6)", "Y20(XO44_TMT6)", 
"Y20(XO44_TMT6)", "Y2(XO44_TMT6); C8(Carbamidomethyl)", "Y2(XO44_TMT6); C19(Carbamidomethyl)", 
"Y2(XO44_TMT6); C15(Carbamidomethyl)", "Y2(XO44_TMT6); C15(Carbamidomethyl)", 
"Y2(XO44_TMT6); C15(Carbamidomethyl)", "Y2(XO44_TMT6); C15(Carbamidomethyl)", 
"Y2(XO44_TMT6); C15(Carbamidomethyl)", "Y2(XO44_TMT6)", "Y2(XO44_TMT6)", 
"Y2(XO44_TMT6)", "Y2(XO44_TMT6)", "Y2(XO44_TMT6)", "Y2(XO44_TMT6)", 
"Y2(XO44_TMT6)", "Y2(XO44_TMT6)", "Y2(XO44_TMT6)", "Y2(XO44_TMT6)", 
"Y2(XO44_TMT6)", "Y2(XO44_TMT6)", "Y2(XO44_TMT6)", "Y2(XO44_TMT6)", 
"Y2(XO44_TMT6)", "Y2(XO44_TMT6)", "Y19(XO44_TMT6)", "Y19(XO44_TMT6)", 
"Y19(XO44_TMT6)", "Y19(XO44_TMT6)", "Y19(XO44_TMT6)", "Y18(XO44_TMT6)", 
"Y18(XO44_TMT6)", "Y18(XO44_TMT6)", "Y18(XO44_TMT6)", "Y18(XO44_TMT6)", 
"Y18(XO44_TMT6)", "Y18(XO44_TMT6)", "Y17(XO44_TMT6)", "Y17(XO44_TMT6)", 
"Y17(XO44_TMT6)", "Y17(XO44_TMT6)", "Y17(XO44_TMT6)", "Y16(XO44_TMT6)", 
"Y16(XO44_TMT6)", "Y16(XO44_TMT6)", "Y16(XO44_TMT6)", "Y16(XO44_TMT6)", 
"Y16(XO44_TMT6)", "Y16(XO44_TMT6)", "Y16(XO44_TMT6)", "Y16(XO44_TMT6)", 
"Y16(XO44_TMT6)", "Y15(XO44_TMT6); C16(Carbamidomethyl)", "Y15(XO44_TMT6)", 
"Y15(XO44_TMT6)", "Y15(XO44_TMT6)", "Y15(XO44_TMT6)", "Y15(XO44_TMT6)", 
"Y15(XO44_TMT6)", "Y15(XO44_TMT6)", "Y15(XO44_TMT6)", "Y15(XO44_TMT6)", 
"Y14(XO44_TMT6); C15(Carbamidomethyl)", "Y14(XO44_TMT6); C15(Carbamidomethyl)", 
"Y14(XO44_TMT6)", "Y14(XO44_TMT6)", "Y14(XO44_TMT6)", "Y13(XO44_TMT6)", 
"Y13(XO44_TMT6)", "Y13(XO44_TMT6)", "Y12(XO44_TMT6); C14(Carbamidomethyl)", 
"Y12(XO44_TMT6)", "Y12(XO44_TMT6)", "Y12(XO44_TMT6)", "Y12(XO44_TMT6)", 
"Y12(XO44_TMT6)", "Y12(XO44_TMT6)", "Y12(XO44_TMT6)", "Y12(XO44_TMT6)", 
"Y12(XO44_TMT6)", "Y12(XO44_TMT6)", "Y12(XO44_TMT6)", "Y12(XO44_TMT6)", 
"Y11(XO44_TMT6)", "Y11(XO44_TMT6)", "Y11(XO44_TMT6)", "Y11(XO44_TMT6)", 
"Y11(XO44_TMT6)", "Y11(XO44_TMT6)", "Y11(XO44_TMT6)", "Y10(XO44_TMT6)", 
"Y10(XO44_TMT6)", "Y10(XO44_TMT6)", "Y10(XO44_TMT6)", "Y10(XO44_TMT6)", 
"Y10(XO44_TMT6)", "Y10(XO44_TMT6)", "Y10(XO44_TMT6)", "Y10(XO44_TMT6)", 
"Y10(XO44_TMT6)", "Y10(XO44_TMT6)", "Y10(XO44_TMT6)", "Y1(XO44_TMT6); C9(Carbamidomethyl)", 
"Y1(XO44_TMT6); C15(Carbamidomethyl)", "Y1(XO44_TMT6); C15(Carbamidomethyl)", 
"Y1(XO44_TMT6); C15(Carbamidomethyl)", "Y1(XO44_TMT6); C15(Carbamidomethyl)", 
"Y1(XO44_TMT6); C11(Carbamidomethyl)", "Y1(XO44_TMT6); C11(Carbamidomethyl)", 
"Y1(XO44_TMT6); C11(Carbamidomethyl)", "Y1(XO44_TMT6)", "Y1(XO44_TMT6)", 
"Y1(XO44_TMT6)", "Y1(XO44_TMT6)", "Y1(XO44_TMT6)", "Y1(XO44_TMT6)", 
"Y1(XO44_TMT6)", "Y1(XO44_TMT6)", "Y1(XO44_TMT6)", "Y1(XO44_TMT6)", 
"Y1(XO44_TMT6)", "Y1(XO44_TMT6)", "Y1(XO44_TMT6)", "Y1(XO44_TMT6)", 
"Y1(XO44_TMT6)", "N-Term(Prot)(Met-loss+Acetyl); Y9(XO44_TMT6)", 
"N-Term(Prot)(Met-loss+Acetyl); Y9(XO44_TMT6)", "N-Term(Prot)(Met-loss+Acetyl); Y9(XO44_TMT6)", 
"N-Term(Prot)(Met-loss+Acetyl); Y9(XO44_TMT6)", "N-Term(Prot)(Met-loss+Acetyl); Y8(XO44_TMT6)", 
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)", 
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)", 
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)", 
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)", 
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)", 
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)", 
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)", 
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)", 
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)", 
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)", 
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)", 
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)", 
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)", 
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)", 
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6); C17(Carbamidomethyl)", 
"N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6)", "N-Term(Prot)(Met-loss+Acetyl); Y7(XO44_TMT6)", 
"N-Term(Prot)(Met-loss+Acetyl); K4(XO44_TMT6)", "N-Term(Prot)(Met-loss); Y8(XO44_TMT6)", 
"N-Term(Prot)(Met-loss); Y8(XO44_TMT6)", "N-Term(Prot)(Met-loss); Y8(XO44_TMT6)", 
"N-Term(Prot)(Met-loss); Y8(XO44_TMT6)"), Mod.residue = c("9", 
"9", "9", "9", "9", "9", "9", "8", "8", "8", "8", "8", "8", "8", 
"8", "8", "8", "8", "8", "8", "8", "8", "8", "7", "7", "7", "7", 
"7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7", 
"7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7", 
"7", "6", "6", "6", "6", "6", "6", "6", "6", "6", "6", "6", "6", 
"6", "6", "6", "5", "5", "5", "5", "5", "5", "5", "5", "5", "5", 
"5", "5", "5", "5", "5", "5", "5", "4", "4", "4", "4", "4", "4", 
"4", "4", "4", "4", "4", "4", "4", "3", "3", "3", "3", "3", "3", 
"3", "3", "3", "3", "3", "3", "3", "3", "3", "3", "3", "3", "3", 
"3", "3", "3", "3", "3", "3", "3", "3", "3", "3", "3", "9", "5", 
"5", "5", "3", "2", "2", "2", "2", "2", "2", "2", "1", "0", "0", 
"0", "0", "0", "0", "0", "0", "2", "2", "2", "2", "2", "2", "2", 
"2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", 
"2", "2", "2", "9", "9", "9", "9", "9", "8", "8", "8", "8", "8", 
"8", "8", "7", "7", "7", "7", "7", "6", "6", "6", "6", "6", "6", 
"6", "6", "6", "6", "5", "5", "5", "5", "5", "5", "5", "5", "5", 
"5", "4", "4", "4", "4", "4", "3", "3", "3", "2", "2", "2", "2", 
"2", "2", "2", "2", "2", "2", "2", "2", "2", "1", "1", "1", "1", 
"1", "1", "1", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", 
"0", "0", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", 
"1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "9", 
"9", "9", "9", "8", "7", "7", "7", "7", "7", "7", "7", "7", "7", 
"7", "7", "7", "7", "7", "7", "7", "7", "4", "8", "8", "8", "8"
), Mod.position.in.pep = c("", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "")), row.names = c(NA, -300L), class = "data.frame")

My codes:

# Attempt to split Modifications into the labelled residue letter and its
# position by running two gsub() replacements over the same column.
# NOTE(review): `\\w` matches digits as well as letters, and the leading `.*`
# is greedy. The `.*` therefore swallows the real letter and all digits but
# the last one, so `(\\w{1})` captures the final digit (e.g. "9", or "0" for
# "Y20") and `(\\d*)` is left with nothing to capture -- which is what the
# dput() output of the two columns shows.
# NOTE(review): the ")" after XO44_TMT6 is unescaped in the first pattern but
# escaped in the second.
df <- df.test %>% 
  mutate(Mod.residue = gsub(".*(\\w{1})\\d*\\(XO44_TMT6)\\;*\\s*.*", "\\1", Modifications),
         Mod.position.in.pep = gsub(".*\\w{1}(\\d*)\\(XO44_TMT6\\)\\;*\\s*.*", "\\1", Modifications)
           )
GuedesBF
  • 8,409
  • 5
  • 19
  • 37
  • I just realised I already answered a (different) question of yours regarding separation/concatenation of text across columns. I strongly recommend you get a good grip of the AWESOME `tidyr` package: https://tidyr.tidyverse.org/ For string manipulation, I also think `stringr` is much more consistent and easier to use than base R functions: https://stringr.tidyverse.org/ – GuedesBF Sep 07 '21 at 22:17
  • Thanks so much for the recommendation! Will learn how to use `tidyr` package. – Chemist learns to code Sep 07 '21 at 22:30
  • I just posted a similar follow-up question to see if someone comes up with something better: https://stackoverflow.com/questions/69095697/extract-all-matches-of-a-pattern-and-concatenate-output-with-mutate – GuedesBF Sep 08 '21 at 00:19

2 Answers2

2

I think you are looking for tidyr::extract, which does exactly what you need in a single function call.

library(tidyr)

# Split each Modifications entry into the residue letter (group 1) and the
# digits that follow it (group 2). The lookahead restricts the match to the
# residue sitting directly in front of "(XO44_TMT6)"; remove = FALSE keeps
# the original Modifications column in the result.
output_extract <- extract(
  df,
  col = Modifications,
  into = c("Mod.residue", "Mod.position.in.pep"),
  regex = ".*([A-Z])(\\d+)(?=\\(XO44_TMT6\\)).*",
  remove = FALSE
)

If you want to keep using gsub, you can do it like this (same pattern, two different replacements: \\1 and \\2):

# Both columns use the same regex; only the back-reference differs:
# "\\1" keeps the residue letter, "\\2" keeps the digits that follow it.
output_gsub <- mutate(
  df,
  Mod.residue = gsub(".*([A-Z])(\\d+)(\\(XO44_TMT6\\)).*", "\\1", Modifications),
  Mod.position.in.pep = gsub(".*([A-Z])(\\d+)(\\(XO44_TMT6\\)).*", "\\2", Modifications)
)

These methods give the same outputs:

identical(output_extract, output_gsub)

[1] TRUE

You may want to convert the "Mod.position.in.pep" variable to numeric with as.numeric afterwards.

Unique values for the output columns:

$Mod.residue
[1] "Y" "K"

$Mod.position.in.pep
 [1] "9"  "8"  "7"  "6"  "5"  "4"  "3"  "29" "25" "23" "22" "21" "20" "2"  "19" "18" "17" "16" "15" "14" "13" "12" "11" "10" "1" 

EDIT

This will only work when there is a single match for "Mod.residue" or "Mod.position.in.pep".

If you have multiple "[KY]digit(XO44_TMT6)" per observation, you may have to take a more complex approach, with mutate %>% unnest_wider %>% unite

# Load the packages first: tibble() and mutate() (dplyr), the unnest/unite
# verbs (tidyr) and str_extract_all() (stringr) are all used below.
library(dplyr)
library(tidyr)
library(stringr)

# Example data: one row carrying two XO44_TMT6-labelled residues.
df <- tibble(Modifications = "K4(XO44_TMT6); Y6(XO44_TMT6)")

# Solution: collect every match per row, spread the matches into helper
# columns, then paste them back together with ";" as the separator.
df %>%
  mutate(
    # A residue is a single letter, so match exactly one character; the
    # lookahead requires digits followed by "(XO44_TMT6)" right after it.
    Mod.residue = str_extract_all(
      Modifications, "[A-Z](?=\\d+\\(XO44_TMT6\\))"
    ),
    # The digits that sit directly in front of "(XO44_TMT6)".
    Mod.position.in.pep = str_extract_all(
      Modifications, "\\d+(?=\\(XO44_TMT6\\))"
    )
  ) %>%
  unnest_wider(col = "Mod.residue", names_sep = "_") %>%
  unnest_wider(col = "Mod.position.in.pep", names_sep = "_") %>%
  # na.rm = TRUE drops the padding NAs of rows with fewer matches.
  unite(
    starts_with("Mod.residue"),
    col = "Mod.residue", sep = ";", remove = TRUE, na.rm = TRUE
  ) %>%
  unite(
    starts_with("Mod.position"),
    col = "Mod.position.in.pep", sep = ";", remove = TRUE, na.rm = TRUE
  )

Output

# A tibble: 1 x 3
  Modifications                Mod.residue Mod.position.in.pep
  <chr>                        <chr>       <chr>              
1 K4(XO44_TMT6); Y6(XO44_TMT6) K;Y         4;6 
GuedesBF
  • 8,409
  • 5
  • 19
  • 37
2

I would first create a column in which the whole residue, incl. the position, is extracted. Then you can separate the name and the position based on this column and remove the intermediate result afterwards:

library(dplyr)
library(stringr)

# Extract the XO44_TMT6-labelled residue (K or Y plus its position) into a
# helper column, split it into letter and number, then drop the helper.
df.test %>%
  mutate(
    # The lookahead keeps only a K/Y residue that is immediately followed by
    # "(XO44_TMT6)", so a K/Y carrying a different modification is ignored.
    residue = Modifications %>%
      str_extract("[KY][0-9]+(?=\\(XO44_TMT6\\))"),
    Mod.residue = residue %>% str_extract("^[KY]"),
    # "[0-9]+" (not "[0-9]") keeps multi-digit positions such as 29 intact;
    # a single "[0-9]" would return only the first digit.
    Mod.position.in.pep = residue %>%
      str_extract("[0-9]+") %>%
      as.numeric()
  ) %>%
  select(-residue)

str_extract is designed to work with the other tidyverse functions.

danlooo
  • 10,067
  • 2
  • 8
  • 22