I am trying to write a loop for repeated k-fold cross validation. Basically trying to perform a 10-fold cross validation and repeat the process 10-times to get the predictions and the resulting 10 AUC values.
I seem to be missing something in the loop that allows moving the calculated predictions to the respective column of the empty dataframe created for the k-fold results. I only get the last k-fold scores in my output ...instead of all 10. I still have to get the auc values for each k-fold validation.
Is there a way to incorporate the auc calculation in the loop to get the values? Would greatly appreciate if someone can guide me with this.
library(cvTools)
library (glmnet)
#library(pROC)
k <- 10 #the number of folds
x <- structure(list(PC1 = c(-2.03456672313651, -1.73707505007147,
-2.03456672313652, -0.255368300655119, -1.73707505007143, -2.03456672313651,
-0.37500359723752, -2.03456672313651, -2.03456672313651, 3.47288460329945,
-0.734187869112349, -0.0134149056651377, 0.0942929078885968,
-2.0345667231365, -2.03456672313651), PC2 = c(0.112471741011579,
0.133858302549922, 0.1124717410116, 2.61374131070885, 0.133858302549994,
0.11247174101158, -0.158995891265301, 0.11247174101159, 0.112471741011592,
-0.260528749768208, -0.503925189558291, 0.194756984230433, 0.318778158034713,
0.112471741011598, 0.11247174101159), PC3 = c(2.44850389170835,
2.3403087394181, 2.44850389170835, -2.46949441441314, 2.34030873941815,
2.44850389170834, 0.123937826076267, 2.44850389170836, 2.44850389170835,
-0.367483430521022, -0.155846438581532, 0.509441984698824, 0.612816030555617,
2.44850389170836, 2.44850389170835), PC4 = c(0.112471741011652,
0.133858302549981, 0.11247174101165, 0.00436673840662417, 0.133858302549995,
0.112471741011656, -0.158995891265306, 0.112471741011666, 0.112471741011661,
-0.260528749768211, -0.290253126970872, -2.28110627358792, 0.318778158034689,
0.11247174101168, 0.11247174101167), PC5 = c(0.112471741011684,
0.13385830255004, 0.112471741011692, 0.00436673840662224, 0.133858302550053,
0.112471741011681, -0.158995891265284, 0.112471741011697, 0.112471741011696,
-0.260528749768212, 1.20999715739728, -1.91404159432553, 0.318778158034758,
0.112471741011709, 0.112471741011692)), .Names = c("PC1", "PC2",
"PC3", "PC4", "PC5"), row.names = c("O35245", "O35286", "O54949",
"O54991", "O88569", "P14733", "P16054", "P21619", "P24369", "P37889",
"P40201", "P57080", "P60843", "P63085", "P99029"), class = "data.frame")
folds <- cvFolds(NROW(x), K=k)
mypreds <- data.frame(matrix(0, nrow(x),ncol = 10)) # create a dataframe to store results of all 10 k-fold repetititions
row.names(mypreds) <- row.names(x) # row names for the dataframe
names(mypreds) <- paste("K", (1:10), sep = "") # column names
set.seed(123)
j <- 1
nsim = 10 # number of repetitions
x$kfoldlpred <- rep(0,nrow(x)) # append a column to original dataframe to temporarily store results of each k-fold
# the loop for repeated cross-validation
repeatcv <- function(){
while (j <= nsim){
for(i in 1:k){
train <- x[folds$subsets[folds$which != i], ] #Set the training set
train_response <- responseY1[folds$subsets[folds$which != i]] # set the training set response
validation <- x[folds$subsets[folds$which == i], ] #Set the validation set
lasso_newglm <- glmnet(as.matrix(train), train_response, alpha = 1,family = "binomial") #Get your new logistic regression model (just fit on the train data)
lasso_cvglm <- cv.glmnet(as.matrix(train), train_response, alpha = 1, family = "binomial",type.measure = "deviance")
lasso_newpred <- predict(lasso_newglm,newx = as.matrix(validation), type = "response", s = c(lasso_cvglm$lambda.min)) #Get the predicitons for the validation set (from the model just fit on the train data)
x[folds$subsets[folds$which == i],]$kfoldlpred <- lasso_newpred
}
mypreds[,i] <- x$kfoldlpred
j <- j+1
}
return(mypreds)
}