I asked a similar question about this at R - generate dynamic number of columns and substring column values but the details in my question have now changed and so I am reposting as I would require a different solution.
I have attached a picture which illustrates my starting dataset and the end point that I am trying to achieve. I need a solution that uses only base R, because the platform I will be using cannot load additional packages.
The original data set has multiple columns. For some of the columns (i.e. L1, L2, L3), I want to:
1) Generate a dynamic number of columns based on the maximum length of any string in the column e.g. L1 max length = 6, therefore 6 new columns each labelled 'L1_1' to 'L1_6'
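2) Separate the original string into overlapping substrings of up to 3 characters, each starting one position further along from the left (e.g. 'AABBCC' gives 'AAB', 'ABB', 'BBC', 'BCC', 'CC', 'C'). The penultimate column will therefore contain 2 characters and the final column will contain 1 character. (This is different to the original question.)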
3) Perform a calculation on each of these substrings, i.e. (number of 'A' * 1) + (number of 'B' * 3) + (number of 'C' * 7), and return the value of this calculation in the corresponding new column.
Does anybody have any ideas about how to do this?
Thanks in advance.
dput(original_data):
structure(list(ID = 1:5, L1 = structure(c(3L, 2L, 4L, 1L, 5L), .Label = c("", "AAAAAA", "AABBCC", "BBACB", "BCBDAB"), class = "factor"), L2 = structure(c(3L,
4L, 3L, 1L, 2L), .Label = c("", "ACAA", "BACA", "BACBA"), class = "factor"), L3 = structure(c(1L, 3L, 2L, 1L, 4L), .Label = c("", "CABAC", "CACCC", "CBABA"), class = "factor")), .Names = c("ID", "L1",
"L2", "L3"), class = "data.frame", row.names = c(NA, -5L))
dput(interim_data):
structure(list(ID = 1:5, L1 = structure(c(3L, 2L, 4L, 1L, 5L), .Label = c("",
"AAAAAA", "AABBCC", "BBACB", "BCBDAB"), class = "factor"), L2 = structure(c(3L,
4L, 3L, 1L, 2L), .Label = c("", "ACAA", "BACA", "BACBA"), class = "factor"),
L3 = structure(c(1L, 3L, 2L, 1L, 4L), .Label = c("", "CABAC",
"CACCC", "CBABA"), class = "factor"), L1_1 = structure(c(3L,
2L, 4L, 1L, 5L), .Label = c("", "AAA", "AAB", "BBA", "BCB"
), class = "factor"), L1_2 = structure(c(3L, 2L, 4L, 1L,
5L), .Label = c("", "AAA", "ABB", "BAC", "CBD"), class = "factor"),
L1_3 = structure(c(4L, 2L, 3L, 1L, 5L), .Label = c("", "AAA",
"ACB", "BBC", "BDA"), class = "factor"), L1_4 = structure(c(3L,
2L, 4L, 1L, 5L), .Label = c("", "AAA", "BCC", "CB", "DAB"
), class = "factor"), L1_5 = structure(c(5L, 2L, 4L, 1L,
3L), .Label = c("", "AA", "AB", "B", "CC"), class = "factor"),
L1_6 = structure(c(4L, 2L, 1L, 1L, 3L), .Label = c("", "A",
"B", "C"), class = "factor"), L2_1 = structure(c(3L, 3L,
3L, 1L, 2L), .Label = c("", "ACA", "BAC"), class = "factor"),
L2_2 = structure(c(2L, 3L, 2L, 1L, 4L), .Label = c("", "ACA",
"ACB", "CAA"), class = "factor"), L2_3 = structure(c(3L,
4L, 3L, 1L, 2L), .Label = c("", "AA", "AC", "CBA"), class = "factor"),
L2_4 = structure(c(2L, 3L, 2L, 1L, 2L), .Label = c("", "A",
"BA"), class = "factor"), L2_5 = structure(c(1L, 2L, 1L,
1L, 1L), .Label = c("", "A"), class = "factor"), L3_1 = structure(c(1L,
3L, 2L, 1L, 4L), .Label = c("", "CAB", "CAC", "CBA"), class = "factor"),
L3_2 = structure(c(1L, 3L, 2L, 1L, 4L), .Label = c("", "ABA",
"ACC", "BAB"), class = "factor"), L3_3 = structure(c(1L,
4L, 3L, 1L, 2L), .Label = c("", "ABA", "BAC", "CCC"), class = "factor"),
L3_4 = structure(c(1L, 4L, 2L, 1L, 3L), .Label = c("", "AC",
"BA", "CC"), class = "factor"), L3_5 = structure(c(1L, 3L,
3L, 1L, 2L), .Label = c("", "A", "C"), class = "factor")), .Names = c("ID",
"L1", "L2", "L3", "L1_1", "L1_2", "L1_3", "L1_4", "L1_5", "L1_6",
"L2_1", "L2_2", "L2_3", "L2_4", "L2_5", "L3_1", "L3_2", "L3_3",
"L3_4", "L3_5"), class = "data.frame", row.names = c(NA, -5L))
EDIT: Code provided by @Onyambu:
# Build a working copy of `df` in which every column is character, so that the
# scoring step can split the strings (strsplit() rejects factor columns).
# A data frame is used rather than the character matrix sapply() would return:
# a matrix holds a single type, so writing as.numeric() into one of its columns
# would be coerced straight back to character.
# NOTE(review): assumes `df` is the data frame holding the substring columns
# and that its first column is the numeric ID - confirm against the caller.
interim <- as.data.frame(df, stringsAsFactors = FALSE)
interim[] <- lapply(interim, as.character)
interim[[1]] <- as.numeric(interim[[1]])
# Score a string as (count of "A" * 1) + (count of "B" * 3) + (count of "C" * 7).
#
# Args:
#   u: A single value. Numeric input is returned unchanged (so an ID column
#      passes straight through). Anything else is coerced with as.character(),
#      which also lets factor values be scored.
#
# Returns:
#   The numeric score, or NA_real_ when `u` is an empty string or NA.
#   Characters other than "A", "B" and "C" contribute nothing; matching is
#   case-sensitive.
funfun <- function(u) {
  if (is.numeric(u)) {
    return(u)
  }

  # Split into single characters. fixed = TRUE keeps the split literal, and
  # counting is done by name lookup below, so regex metacharacters in the
  # input (e.g. "(" or "[") cannot cause an error.
  chars <- unlist(strsplit(as.character(u), "", fixed = TRUE))
  chars <- chars[!is.na(chars)]

  if (length(chars) == 0L) {
    return(NA_real_)
  }

  weights <- c(A = 1, B = 3, C = 7)
  # Indexing by name yields NA for characters without a weight; na.rm drops
  # them, so unknown characters score zero.
  sum(weights[chars], na.rm = TRUE)
}
# Apply funfun() to each element of `x` in turn and return the simplified
# result that mapply() produces (one score per element).
ADD_char <- function(x) {
  mapply(FUN = funfun, x)
}
# Score every column after the first four (presumably ID, L1, L2, L3 - confirm
# against the data) and bind the scores back onto those four columns.
# Both pieces are coerced to data frames first so that scoring runs column by
# column: iterating over a matrix would visit each cell individually and
# return one flat vector instead of one score column per input column.
key_cols <- as.data.frame(interim[, 1:4, drop = FALSE],
                          stringsAsFactors = FALSE)
sub_cols <- as.data.frame(interim[, -(1:4), drop = FALSE],
                          stringsAsFactors = FALSE)

# unname() drops the per-string names that ADD_char() attaches, which would
# otherwise leak into the row names of the combined result.
scores <- lapply(sub_cols, function(column) unname(ADD_char(column)))

dat1 <- cbind(key_cols, as.data.frame(scores, stringsAsFactors = FALSE))
Results in parse error