0

I am trying to take bootstrap samples and conduct stepwise regression in R. I am trying to look at the distribution of the regression coefficients, but the "bhat" matrix I am working on is mostly printing out NAs with the exception of the first row (which is the same number across all columns). How can I fix this?

B <- 100 #number of bootstrap samples
n <- nrow(dat)
d <- ncol(dat) - 1
bhat <- matrix(NA, nrow = B, ncol = ncol(dat) - 1)

for(b in 1:B) {

  s <- sample(1:n, n, replace=TRUE)
  samp <- as.matrix(dat[s, c(1:15)])
  samp <- as.data.frame(samp)

  #stepwise regression
  null <- lm(dat$Y ~ 1, data=samp)
  full <- lm(dat$Y ~ ., data=samp)
  fit <- step(null, scope=formula(full), direction="forward", k=log(n), trace=0)

  bhat[b,] <- coef(fit)

}
  • You may want to add data, to make this reproducible, please read: https://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example/5963610#5963610 – jay.sf Apr 21 '20 at 07:53

1 Answers1

0

Every time you do a step the number of coefficients are different, it's not the same dimension as your matrix, and you cannot "slot" it in, you can do:

dat = data.frame(Y=rnorm(100),matrix(rnorm(100*15),ncol=15))
colnames(dat)[-1] = paste0("Var",1:15)

B <- 100 #number of bootstrap samples
n <- nrow(dat)
d <- ncol(dat) - 1
full_mdl = colnames(model.matrix(Y~.,data=dat))
bhat <- matrix(NA, nrow = B, ncol = length(full_mdl))
colnames(bhat) = full_mdl

for(b in 1:B) {

  samp <- dat[sample(1:n, n, replace=TRUE), c(1:15)]
  null <- lm(dat$Y ~ 1, data=samp)
  full <- lm(dat$Y ~ ., data=samp)
  fit <- step(null, scope=formula(full), direction="forward", k=log(n), trace=0)

  bhat[b,names(coef(fit))] <- coef(fit)

}
StupidWolf
  • 45,075
  • 17
  • 40
  • 72