I have a data frame that looks like the one below. The names and number of columns will NOT be consistent (sometimes 'C' will not be present; other times 'D', 'E', 'F' may be present, etc.). The only column that is always present is Y, and I want to regress Y on each of the other columns, one at a time.
# Example data. In practice the predictor columns vary in both name and
# number, so whatever consumes this data frame has to be flexible; only the
# response column Y is guaranteed to exist.
Y <- c(
  4, 4, 3, 4, 3, 2, 3, 2, 2, 3,
  4, 4, 3, 4, 8, 6, 5, 4, 3, 6
)
A <- c(
  1, 2, 1, 2, 3, 2, 1, 1, 1, 2,
  1, 4, 3, 1, 2, 2, 1, 2, 4, 8
)
B <- c(
  5, 6, 6, 5, 3, 7, 2, 1, 1, 2,
  7, 4, 7, 8, 5, 7, 6, 6, 4, 7
)
C <- c(
  9, 1, 2, 2, 1, 4, 5, 6, 7, 8,
  89, 9, 7, 6, 5, 6, 8, 9, 67, 6
)

# Response first, then the predictors, each column named after its vector.
YABC <- data.frame(Y = Y, A = A, B = B, C = C)
I want to loop through each predictor column and collect summary statistics from a simple regression of Y on that column.
The code below produces the desired output, but only for this specific set of columns (A, B and C).
# Hand-written version: one simple linear regression of Y per predictor.
# For each fit, keep the R-squared, adjusted R-squared and residual standard
# error (sigma) from the model summary in a one-row data frame tagged with
# the predictor's name.

# --- Y ~ A ---
model_A <- lm(Y ~ A, data = YABC)
smry_A <- summary(model_A)
datA <- data.frame(
  ID = "A",
  rsq = smry_A$r.squared,
  adj_rsq = smry_A$adj.r.squared,
  sig = smry_A$sigma
)

# --- Y ~ B ---
model_B <- lm(Y ~ B, data = YABC)
smry_B <- summary(model_B)
datB <- data.frame(
  ID = "B",
  rsq = smry_B$r.squared,
  adj_rsq = smry_B$adj.r.squared,
  sig = smry_B$sigma
)

# --- Y ~ C ---
model_C <- lm(Y ~ C, data = YABC)
smry_C <- summary(model_C)
datC <- data.frame(
  ID = "C",
  rsq = smry_C$r.squared,
  adj_rsq = smry_C$adj.r.squared,
  sig = smry_C$sigma
)

# Stack the per-predictor rows into the final table.
output <- rbind(datA, datB, datC)
How can I wrap this in a loop (or some other process) that will handle a varying number and naming of columns? Here is my attempt. Yes, I know it's not right; it is just me conceptualizing the kind of capability I'd like.
# NOTE: conceptual sketch only -- this is pseudo-code, not valid R. The `[i]`
# tokens are placeholders for "the current column name" and do not parse.
#
# initialize an empty data frame with the four result columns
# NOTE(review): the frame created here is named `output__`, but the loop below
# appends to `data__`, which is never initialized in this sketch.
output__ <- data.frame(ID__ = as.character(),
rsq__ = as.numeric(),
adj_rsq__ = as.numeric(),
sig__ = as.numeric())
# loop through A, then B, then C
# NOTE(review): `A:C` is presumably meant as "columns A through C"; R's `:`
# operates on the values of A and C, not on column names.
for(i in A:C) {
# fit Y against the current column (placeholder syntax)
model_[i] <- lm(Y ~ [i], YABC)
# intended: the current column's name as a string
ID <- '[i]'
# fit statistics taken from the model summary
rsq <- summary(model_[i])$r.squared
adj_rsq <- summary(model_[i])$adj.r.squared
sig <- summary(model_[i])$sigma
# NOTE(review): the values above are stored in ID/rsq/adj_rsq/sig, but this
# line refers to ID__/rsq__/adj_rsq__/sig__, and the call to data.frame() is
# missing before the parenthesis.
data__temp <- (ID__, rsq__, adj_rsq__, sig__)
# append this iteration's row to the accumulated results
data__ <- rbind(data__, data__temp)
}
Using @BigDataScientist's approach, here is the solution I went with.
# Regress Y on every other column of YABC, one predictor at a time, and
# collect the fit statistics into `data__` (one row per predictor).
#
# Columns of `data__`:
#   ID__      - name of the predictor column
#   rsq__     - R-squared of the fit
#   adj_rsq__ - adjusted R-squared of the fit
#   sig__     - residual standard error (summary()$sigma)

# Zero-row template: fixes the column names and types, and is what `data__`
# ends up as when YABC contains no predictor columns at all.
empty_stats__ <- data.frame(
  ID__ = character(),
  rsq__ = numeric(),
  adj_rsq__ = numeric(),
  sig__ = numeric()
)

# Pick predictors by name rather than by position, so the result is still
# correct when Y is not the first column of the data frame.
predictors__ <- setdiff(names(YABC), "Y")

# One single-row data frame per predictor. Rows are built in a list and bound
# once at the end instead of growing `data__` with rbind() on every iteration.
rows__ <- lapply(predictors__, function(predictor) {
  # Backquote the name so non-syntactic column names (spaces, symbols, ...)
  # still yield a valid formula.
  model_formula <- reformulate(sprintf("`%s`", predictor), response = "Y")
  # summary() is computed once and reused for all three statistics.
  model_summary <- summary(lm(model_formula, data = YABC))
  data.frame(
    ID__ = predictor,
    rsq__ = model_summary$r.squared,
    adj_rsq__ = model_summary$adj.r.squared,
    sig__ = model_summary$sigma
  )
})

# Binding onto the empty template keeps the result a data frame with the
# expected columns even when `rows__` is empty.
data__ <- do.call(rbind, c(list(empty_stats__), rows__))