library(plyr)
library(stringr)
### Example data: one id column (id_info) and two free-text columns
### (filterme1, filterme2), 24 rows each. Every text value embeds at least one
### code of the form "ABB" followed by six digits; row 17 embeds two per column.
examp<- data.frame(id_info = c("123", "3464", "7156", "3171", "5299", "4541", "4956", "9926", "8418", "1392", "9080", "6455", "2423", "9101", "7807", "5195", "7827", "365", "9062", "5558", "239", "8700", "6995", "9853"),
filterme1 = c("ABB123460sadjasd", "ABB123461asjdjs", "ABB123462ranogvmg", "ABB123463dkfohsd", "ABB123464fff///sss", "ABB123465jfsdf", "ABB123466 sdf", "ABB123467 sdf", "ABB123468 fff///sss", "ABB123469 ty", "ABB123470 fff///sss", "ABB123471 dfs", "ABB123472 ", "ABB123473 gt", "ABB123474 y", "ABB123475 f", "ABB123476 gfgABB123462", "ABB123477 dsd", "ABB123478 re", "ABB123479 fgh", "ABB123480 tu", "ABB123481 yu", "ABB123482 dfg", "ABB123483 s"),
filterme2 = c("sadjasdABB123460", "asjdjsABB123461", "ranogvmgABB123462", "dkfohsdABB123463", "fff///sssABB123464", "jfsdfABB123465", "sdfABB123466", "sdfABB123467", "fff///sssABB123468", "tyABB123469", "fff///sssABB123470", "dfsABB123471", "ABB123472", "gtABB123473", "yABB123474", "fABB123475", "ABB123462gfgABB123476", "dsdABB123477", "reABB123478", "fghABB123479", "tuABB123480", "yuABB123481", "dfgABB123482", "sABB123483"))
## id_info is intended to be a factor even though its values look numeric.
## NOTE(review): data.frame() only converts strings to factors when
## stringsAsFactors is TRUE, which is no longer the default since R 4.0 --
## confirm the actual column types in the str() output below.
str(examp)
I want to extract an element from strings. The element should start with "ABB" and be followed by 6 digits.
# Pull every code of the form "ABB" + six digits out of filterme1.
# The result is a list with one character vector of matches per input row.
examp_str <- str_extract_all(
  string = as.character(examp[["filterme1"]]),
  pattern = "ABB[0-9]{6}"
)
length(examp_str)
#thanks for the help with the expression
Below is my attempt to use my stringr function throughout the dataframe.
# Build a named list with one element per id_info value. Each element is a
# character vector holding every "ABB" + six-digit code found in that id's
# non-id columns, in column order, named filterme1, filterme2, ...
#
# str_extract_all() expects a character vector, so it is applied to each
# column separately (after as.character(), in case the column is a factor)
# instead of being handed the whole one-row data frame.
examp_str_big <- dlply(
  .data = examp,
  .variables = 1, # numeric index: split on the first column (id_info)
  .fun = function(x) {
    per_column <- lapply(
      x[-1], # drop the id column that the split was made on
      function(column) {
        str_extract_all(as.character(column), pattern = "ABB[0-9]{6}")
      }
    )
    codes <- unlist(per_column, use.names = FALSE)
    if (is.null(codes)) {
      # No searchable columns at all: keep the element a character vector.
      codes <- character(0)
    }
    # sprintf() yields character(0) for zero matches, so names stay in sync.
    names(codes) <- sprintf("filterme%d", seq_along(codes))
    codes
  }
)
# dlply() returns its pieces ordered by the sorted split values; put them back
# in the order in which the ids first appear in examp.
examp_str_big <- examp_str_big[unique(as.character(examp$id_info))]
Once I create a list, I want to try and put it all back in a data frame. I found the link here about putting unknown lengths of lists into dataframes, but I am not sure if I could use this or not.
# Turn the ragged list examp_str_big into a rectangular data frame:
# every element is padded with NA up to the length of the longest element,
# the padded elements are stacked row-wise, and the columns take their names
# from the longest element.
stopifnot(length(examp_str_big) > 0L) # max()/which.max() need >= 1 element
# lengths() always returns an integer vector, unlike sapply(x, length).
indx <- lengths(examp_str_big)
res <- as.data.frame(
  do.call(rbind, lapply(examp_str_big, `length<-`, max(indx)))
)
colnames(res) <- names(examp_str_big[[which.max(indx)]])
str(res)
So my desired end result would look like this:
id_info filterme1 filterme2 filterme3 filterme4
123 ABB123460 ABB123460
3464 ABB123461 ABB123461
7156 ABB123462 ABB123462
3171 ABB123463 ABB123463
5299 ABB123464 ABB123464
4541 ABB123465 ABB123465
4956 ABB123466 ABB123466
9926 ABB123467 ABB123467
8418 ABB123468 ABB123468
1392 ABB123469 ABB123469
9080 ABB123470 ABB123470
6455 ABB123471 ABB123471
2423 ABB123472 ABB123472
9101 ABB123473 ABB123473
7807 ABB123474 ABB123474
5195 ABB123475 ABB123475
7827 ABB123476 ABB123462 ABB123462 ABB123476
365 ABB123477 ABB123477
9062 ABB123478 ABB123478
5558 ABB123479 ABB123479
239 ABB123480 ABB123480
8700 ABB123481 ABB123481
6995 ABB123482 ABB123482
9853 ABB123483 ABB123483
My actual dataset is longer and has more "filterme" columns. Any help would be greatly appreciated. If there is another smarter way to accomplish this goal, I would love to hear it.
Thanks.