R string_extract_all looped through data frame using plyr

Question

library(plyr)
library(stringr)



###example data
examp<- data.frame(id_info = c("123",   "3464", "7156", "3171", "5299", "4541", "4956", "9926", "8418", "1392", "9080", "6455", "2423", "9101", "7807", "5195", "7827", "365",  "9062", "5558", "239",  "8700", "6995", "9853"),
                   filterme1 = c("ABB123460sadjasd",    "ABB123461asjdjs",  "ABB123462ranogvmg",    "ABB123463dkfohsd", "ABB123464fff///sss",   "ABB123465jfsdf",   "ABB123466 sdf",    "ABB123467 sdf",    "ABB123468 fff///sss",  "ABB123469 ty", "ABB123470 fff///sss",  "ABB123471 dfs",    "ABB123472 ",   "ABB123473 gt", "ABB123474 y",  "ABB123475 f",  "ABB123476 gfgABB123462",   "ABB123477 dsd",    "ABB123478 re", "ABB123479 fgh",    "ABB123480 tu", "ABB123481 yu", "ABB123482 dfg",    "ABB123483 s"),
                    filterme2  = c("sadjasdABB123460",  "asjdjsABB123461",  "ranogvmgABB123462",    "dkfohsdABB123463", "fff///sssABB123464",   "jfsdfABB123465",   "sdfABB123466", "sdfABB123467", "fff///sssABB123468",   "tyABB123469",  "fff///sssABB123470",   "dfsABB123471", "ABB123472",    "gtABB123473",  "yABB123474",   "fABB123475",   "ABB123462gfgABB123476",    "dsdABB123477", "reABB123478",  "fghABB123479", "tuABB123480",  "yuABB123481",  "dfgABB123482", "sABB123483"))

##id_info should be factor even tho it is numeric
str(examp)

I want to extract an element from strings. The element should start with "ABB" and be followed by 6 digits.

# Extract every substring that starts with "ABB" and is followed by 6 digits.
# str_extract_all() returns a list with one character vector of matches per
# input string.
examp_str<-str_extract_all(as.character(examp$filterme1),pattern="ABB[0-9]{6}")
# One list entry per element of examp$filterme1
length(examp_str)
#thanks for the help with the expression

Below is my attempt to use my stringr function throughout the dataframe.

#Is this how I can eventually loop through the whole dataframe? I know I will create a list, but this isn't quite right.
# NOTE(review): dlply() hands each piece to the function as a data frame, so
# str_extract_all() is not being given a plain character vector here -- this is
# presumably why the result is not what is wanted; confirm against plyr docs.
examp_str_big<-dlply(.data=examp, 1,
                   function(x) str_extract_all(x,pattern="ABB[0-9]{6}"))

Once I create a list, I want to try and put it all back in a data frame. I found the link here about putting unknown lengths of lists into dataframes, but I am not sure if I could use this or not.

# Number of elements in each entry of the list
indx <- sapply(examp_str_big, length)
#indx <- lengths(examp_str_big)  # equivalent alternative to the sapply() call
# Pad every entry to the longest length (`length<-` fills the tail with NA),
# stack the padded entries as rows, and convert the result to a data frame
res <- as.data.frame(do.call(rbind,lapply(examp_str_big, `length<-`,
                                          max(indx))))

# Use the names of the longest entry as the column names
colnames(res) <- names(examp_str_big[[which.max(indx)]])
str(res)

So my desired end result would look like this:

id_info filterme1   filterme2   filterme3   filterme4
123     ABB123460               ABB123460   
3464    ABB123461               ABB123461   
7156    ABB123462               ABB123462   
3171    ABB123463               ABB123463   
5299    ABB123464               ABB123464   
4541    ABB123465               ABB123465   
4956    ABB123466               ABB123466   
9926    ABB123467               ABB123467   
8418    ABB123468               ABB123468   
1392    ABB123469               ABB123469   
9080    ABB123470               ABB123470   
6455    ABB123471               ABB123471   
2423    ABB123472               ABB123472   
9101    ABB123473               ABB123473   
7807    ABB123474               ABB123474   
5195    ABB123475               ABB123475   
7827    ABB123476   ABB123462   ABB123462   ABB123476
365      ABB123477              ABB123477   
9062    ABB123478               ABB123478   
5558    ABB123479               ABB123479   
239     ABB123480               ABB123480   
8700    ABB123481               ABB123481   
6995    ABB123482               ABB123482   
9853    ABB123483               ABB123483

My actual dataset is longer and has more "filterme" columns. Any help would be greatly appreciated. If there is another smarter way to accomplish this goal, I would love to hear it.

Thanks.

This pattern will match ABB followed by 7 digits as you mentioned: `'ABB[0-9]{7}'` — Gopala, Apr 28 '16 at 23:02
Yes that expression does work, Thank you. Do you have any ideas for the second part of the question? — Kathleen Brannen, Apr 28 '16 at 23:32
You may want to look at `unnest` from `tidyr` package. It provides the result in long format. — Gopala, Apr 29 '16 at 01:09

Gopala · Accepted Answer · 2016-04-29T21:40:41.520

Here is one approach (based on your original data frame examp, which I assume is read with stringsAsFactors = FALSE):

library(stringr)
# Extract all occurrences of the pattern. (NOTE: 6 digits are used because the
# provided sample data has no 7-digit example; change {6} to {7} if needed.)
# The result is a list column: one character vector of matches per row.
examp$pattern <- str_extract_all(examp$filterme1, 'ABB[0-9]{6}')

# Append one column per found pattern.
# maxlength is the largest number of matches found in any single row.
maxlength <- max(lengths(examp$pattern))
# Pad each row's matches with NA up to maxlength and bind them as matrix rows.
# Padding the character vectors directly (rather than wrapping them in a list
# via strsplit) keeps the new V1, V2, ... columns as plain character columns
# instead of list-columns; missing slots are NA_character_ (printed as <NA>).
examp <- cbind(examp,
               as.data.frame(do.call(rbind,
                                     lapply(examp$pattern,
                                            `length<-`, maxlength)),
                             stringsAsFactors = FALSE))

# This will result in a wider data frame with all found patterns appended
# as new columns

examp

   id_info              filterme1             filterme2              pattern        V1
1      123       ABB123460sadjasd      sadjasdABB123460            ABB123460 ABB123460
2     3464        ABB123461asjdjs       asjdjsABB123461            ABB123461 ABB123461
3     7156      ABB123462ranogvmg     ranogvmgABB123462            ABB123462 ABB123462
4     3171       ABB123463dkfohsd      dkfohsdABB123463            ABB123463 ABB123463
5     5299     ABB123464fff///sss    fff///sssABB123464            ABB123464 ABB123464
6     4541         ABB123465jfsdf        jfsdfABB123465            ABB123465 ABB123465
7     4956          ABB123466 sdf          sdfABB123466            ABB123466 ABB123466
8     9926          ABB123467 sdf          sdfABB123467            ABB123467 ABB123467
9     8418    ABB123468 fff///sss    fff///sssABB123468            ABB123468 ABB123468
10    1392           ABB123469 ty           tyABB123469            ABB123469 ABB123469
11    9080    ABB123470 fff///sss    fff///sssABB123470            ABB123470 ABB123470
12    6455          ABB123471 dfs          dfsABB123471            ABB123471 ABB123471
13    2423             ABB123472              ABB123472            ABB123472 ABB123472
14    9101           ABB123473 gt           gtABB123473            ABB123473 ABB123473
15    7807            ABB123474 y            yABB123474            ABB123474 ABB123474
16    5195            ABB123475 f            fABB123475            ABB123475 ABB123475
17    7827 ABB123476 gfgABB123462 ABB123462gfgABB123476 ABB123476, ABB123462 ABB123476
18     365          ABB123477 dsd          dsdABB123477            ABB123477 ABB123477
19    9062           ABB123478 re           reABB123478            ABB123478 ABB123478
20    5558          ABB123479 fgh          fghABB123479            ABB123479 ABB123479
21     239           ABB123480 tu           tuABB123480            ABB123480 ABB123480
22    8700           ABB123481 yu           yuABB123481            ABB123481 ABB123481
23    6995          ABB123482 dfg          dfgABB123482            ABB123482 ABB123482
24    9853            ABB123483 s            sABB123483            ABB123483 ABB123483
          V2
1         NA
2         NA
3         NA
4         NA
5         NA
6         NA
7         NA
8         NA
9         NA
10        NA
11        NA
12        NA
13        NA
14        NA
15        NA
16        NA
17 ABB123462
18        NA
19        NA
20        NA
21        NA
22        NA
23        NA
24        NA

In this case, only two new columns are added since there are a maximum of two occurrences of the pattern (even modified to 6 above) in the provided sample data.

EDIT: Adding code that matches pattern across multiple columns (in this case filterme1 and filterme2):

library(tidyr)
# unite() pastes filterme1 and filterme2 together into one new column,
# `filterme`; remove = FALSE keeps the original columns in the data frame.
examp <- unite(examp, filterme, filterme1, filterme2, remove = FALSE)
# Extract the pattern from the combined column, so matches from both source
# columns end up in the same list entry for each row.
examp$pattern <- str_extract_all(examp$filterme, 'ABB[0-9]{6}')

At this point, you can run the rest of the code above AFTER the line where examp$pattern is assigned.

Thank you for the quick response. When I run this code, I get an error: `Error in FUN(X[[i]], ...) : object 'maxlength' not found` — Kathleen Brannen, Apr 29 '16 at 20:30
So sorry...one of the lines of code I wrote is missing. Edited. — Gopala, Apr 29 '16 at 20:54
this works great if I wanted the function to be applied to only the first column of my data. I do not see that it is able to apply str_extract_all in the second column, or (n) columns in a data frame. — Kathleen Brannen, Apr 29 '16 at 21:00
You mean, the pattern must be extracted from all (or multiple columns) of a row and then this code run on that extracted `list`? You can look at `unite` in tidyr to make things easier for you. You can unite multiple columns into one (make sure to use `remove = FALSE)` and then run this same code. You can say `examp <- unite(examp, allFilterme, filterme1, filterme2, ...., remove = FALSE)`. Only small modification needed. — Gopala, Apr 29 '16 at 21:26
Ah, so essentially put this dataframe in long format and then apply this function? I will give it a go. — Kathleen Brannen, Apr 29 '16 at 21:28
No need to put in long format. `unite` just adds a column by pasting together multiple columns. — Gopala, Apr 29 '16 at 21:28
I melted examp, and then everything worked perfectly. If you add in the unite or melting I will accept your answer. Thank you! — Kathleen Brannen, Apr 29 '16 at 21:41
Added working code with combined columns using `unite`. Take a look. — Gopala, Apr 29 '16 at 21:41

akrun · Answer 2 · 2016-04-29T03:45:34.203

We can also use the lengths function

 # One character vector of matches per element of examp$filterme1
 lst <- str_extract_all(examp$filterme1, 'ABB[0-9]{6}')
 # Pad each vector with NA up to the longest one and stack them as matrix rows
 m1 <- do.call(rbind, lapply(lst, `length<-`, max(lengths(lst))))
 # Add one column per matrix column, named pattern1, pattern2, ...
 examp[paste0("pattern", seq_len(ncol(m1)))] <- m1
 examp
#   id_info              filterme1             filterme2  pattern1  pattern2
#1      123       ABB123460sadjasd      sadjasdABB123460 ABB123460      <NA>
#2     3464        ABB123461asjdjs       asjdjsABB123461 ABB123461      <NA>
#3     7156      ABB123462ranogvmg     ranogvmgABB123462 ABB123462      <NA>
#4     3171       ABB123463dkfohsd      dkfohsdABB123463 ABB123463      <NA>
#5     5299     ABB123464fff///sss    fff///sssABB123464 ABB123464      <NA>
#6     4541         ABB123465jfsdf        jfsdfABB123465 ABB123465      <NA>
#7     4956          ABB123466 sdf          sdfABB123466 ABB123466      <NA>
#8     9926          ABB123467 sdf          sdfABB123467 ABB123467      <NA>
#9     8418    ABB123468 fff///sss    fff///sssABB123468 ABB123468      <NA>
#10    1392           ABB123469 ty           tyABB123469 ABB123469      <NA>
#11    9080    ABB123470 fff///sss    fff///sssABB123470 ABB123470      <NA>
#12    6455          ABB123471 dfs          dfsABB123471 ABB123471      <NA>
#13    2423             ABB123472              ABB123472 ABB123472      <NA>
#14    9101           ABB123473 gt           gtABB123473 ABB123473      <NA>
#15    7807            ABB123474 y            yABB123474 ABB123474      <NA>
#16    5195            ABB123475 f            fABB123475 ABB123475      <NA>
#17    7827 ABB123476 gfgABB123462 ABB123462gfgABB123476 ABB123476 ABB123462
#18     365          ABB123477 dsd          dsdABB123477 ABB123477      <NA>
#19    9062           ABB123478 re           reABB123478 ABB123478      <NA>
#20    5558          ABB123479 fgh          fghABB123479 ABB123479      <NA>
#21     239           ABB123480 tu           tuABB123480 ABB123480      <NA>
#22    8700           ABB123481 yu           yuABB123481 ABB123481      <NA>
#23    6995          ABB123482 dfg          dfgABB123482 ABB123482      <NA>
#24    9853            ABB123483 s            sABB123483 ABB123483      <NA>

This works, thanks. However, I would like to apply the str_extract_all function to all the other columns in the data frame. The example has only 2 columns, but my larger data set has 16. — Kathleen Brannen, Apr 29 '16 at 20:37
@KathleenBrannen You can loop through the columns and apply the same. i.e `lapply(examp, function(x) {lst <- str_extract_all(x, 'ABB[0-9]{6}');...` — akrun, Apr 30 '16 at 02:50

R string_extract_all looped through data frame using plyr

2 Answers