
I'm trying to fit a logistic regression with an L1 penalty (lasso) on the training set, and I need 10-fold cross-validation to find an optimal value of the penalty parameter. Can anyone tell me why I get exactly the same results for the model with x1-x3 as for the model with x1-x14?

Data:

GenerateData <- function( seed )
{
  set.seed( seed )

  # generate X
  x1 = runif(1500, -2, 2)
  x2 = rnorm(1500, mean = 4, sd = sqrt(2))
  x3 = rnorm(1500, mean = 0, sd = 1)
  x4 = runif(1500, -2, 2)
  x5 = runif(1500, -2, 2)
  x6 = runif(1500, -2, 2)
  x7 = runif(1500, -2, 2)
  x8 = runif(1500, -2, 2)
  x9 = runif(1500, -2, 2)
  x10 = runif(1500, -2, 2)
  x11 = runif(1500, -2, 2)
  x12 = runif(1500, -2, 2)
  x13 = runif(1500, -2, 2)
  x14 = runif(1500, -2, 2)

  # generate Y from a logistic model: eta is the linear predictor and
  # pi = exp(eta) / (1 + exp(eta)) is the inverse-logit success probability
  eta = -1.5 + (1.5*x1) + (0.85*x1^2) - (0.20*x1^3) + (2.5*I(x2<0)) + I(x2>3) + (0.3*x3)
  pi = exp(eta) / ( 1 + exp(eta) )   # note: this masks base R's constant pi inside the function
  y = rbinom(1500, 1, pi)

  # combine data into one data frame; data.frame() already names the columns y, x1, ..., x14
  full = data.frame(y, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14)

  return(full)
}

source( "GenerateData.R" )   # only needed if GenerateData() is saved in a separate file
FullData = GenerateData( 1035932 )
train = FullData[ 1:750 , ]
test = FullData[ 751:1500 , ]
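
As a quick sanity check on the simulated data (a minimal sketch, using only the objects defined above):

dim(train); dim(test)   # 750 x 15 each
table(train$y)          # rough balance of the binary outcome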

For x1-x3:

library(glmnet)   # needed for glmnet() and cv.glmnet()

x = model.matrix( y ~ .-1 + x1 + x2 + x3, data=FullData )
y = FullData$y
nFullData = nrow(FullData)
###

###
nFolds = 10
set.seed( 500 )
CVfolds = sample( rep( 1:nFolds , length=nFullData ) , nFullData )   # random permutation of balanced fold labels
cv_errors_ridge = matrix( -9999 , 1 , nFolds )
cv_errors_lasso = matrix( -9999 , 1 , nFolds )

nFolds_inner = 10
for( fold in 1:nFolds )
{
  xtrain = x[ CVfolds!=fold , ]
  ytrain = y[ CVfolds!=fold ]
  xtest = x[ CVfolds==fold , ]
  ytest = y[ CVfolds==fold ]

  # inner 10-fold CV to choose lambda for ridge, then refit on this fold's training data
  cv_ridge = cv.glmnet( xtrain , ytrain , alpha=0 , nfolds = nFolds_inner )
  cv_ridge_train = glmnet( xtrain , ytrain , alpha=0 , lambda=cv_ridge$lambda.min )
  predict_ridge_train = predict( cv_ridge_train , xtest )
  cv_errors_ridge[ fold ] = sqrt( apply( ( ytest - predict_ridge_train )^2 , 2 , mean ) )   # held-out RMSE

  # same procedure for the lasso (alpha = 1)
  cv_lasso = cv.glmnet( xtrain , ytrain , alpha=1 , nfolds = nFolds_inner )
  cv_lasso_train = glmnet( xtrain , ytrain , alpha=1 , lambda=cv_lasso$lambda.min )
  predict_lasso_train = predict( cv_lasso_train , xtest )
  cv_errors_lasso[ fold ] = sqrt( apply( ( ytest - predict_lasso_train )^2 , 2 , mean ) )   # held-out RMSE
}

mean( cv_errors_ridge )
mean( cv_errors_lasso )

coef( cv.glmnet( x , y , alpha=1 , nfolds = nFolds ) )   # coef() on a cv.glmnet fit defaults to s = "lambda.1se"
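
To double-check what actually goes into the fit, it is worth inspecting the columns of the design matrix (a minimal sketch using the x defined above):

dim(x)
colnames(x)   # lists exactly which predictors model.matrix() included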

For x1-x14:

x = model.matrix( y ~ .-1 + x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9 + x10 + x11
                  + x12 + x13 + x14, data=FullData )
y = FullData$y

###
nFolds = 10
set.seed( 500 )
CVfolds = sample( rep( 1:nFolds , length=nFullData ) , nFullData ) 
cv_errors_ridge = matrix( -9999 , 1 , nFolds )
cv_errors_lasso = matrix( -9999 , 1 , nFolds )

nFolds_inner = 10
# same outer/inner CV loop as above, now with the 14-predictor design matrix
for( fold in 1:nFolds )
{
  xtrain = x[ CVfolds!=fold , ]
  ytrain = y[ CVfolds!=fold ]
  xtest = x[ CVfolds==fold , ]
  ytest = y[ CVfolds==fold ]

  cv_ridge = cv.glmnet( xtrain , ytrain , alpha=0 , nfolds = nFolds_inner ) 
  cv_ridge_train = glmnet( xtrain , ytrain , alpha=0 , lambda=cv_ridge$lambda.min )
  predict_ridge_train = predict( cv_ridge_train , xtest )
  cv_errors_ridge[ fold ] = sqrt( apply( ( ytest - predict_ridge_train )^2 , 2 , mean ) )

  cv_lasso = cv.glmnet( xtrain , ytrain , alpha=1 , nfolds = nFolds_inner ) 
  cv_lasso_train = glmnet( xtrain , ytrain , alpha=1 , lambda=cv_lasso$lambda.min )
  predict_lasso_train = predict( cv_lasso_train , xtest )
  cv_errors_lasso[ fold ] = sqrt( apply( ( ytest - predict_lasso_train )^2 , 2 , mean ) )    
}

mean( cv_errors_ridge )
mean( cv_errors_lasso )
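
To compare the two specifications directly, the two design matrices can be kept under separate names (x_small and x_full are hypothetical names for the matrices built from the two formulas above):

x_small = model.matrix( y ~ .-1 + x1 + x2 + x3, data=FullData )
x_full  = model.matrix( y ~ .-1 + x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9
                        + x10 + x11 + x12 + x13 + x14, data=FullData )
identical( colnames(x_small) , colnames(x_full) )   # TRUE would explain the identical results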

I then tried another approach, but I still get the same values in the end for x1-x3 and x1-x14:

x <- model.matrix(y ~ .-1 + x1 + x2 + x3, data=FullData)
y <- as.double(as.matrix(FullData[, 1]))   # equivalently: y <- FullData$y

# Fitting the model (lasso: alpha = 1)
set.seed(999)
# note: parallel=TRUE requires a registered foreach backend beforehand, e.g. doParallel::registerDoParallel()
cv.lasso <- cv.glmnet(x, y, family='binomial', alpha=1, parallel=TRUE, standardize=TRUE, type.measure='auc')

# Results
plot(cv.lasso)
plot(cv.lasso$glmnet.fit, xvar="lambda", label=TRUE)
cv.lasso$lambda.min
cv.lasso$lambda.1se
coef(cv.lasso, s=cv.lasso$lambda.min)


############################################################

x <- model.matrix(y ~ .-1 + x1 + x2 + x3 + x4 + x5 + x6 + x7
                  + x8 + x9 + x10 + x11 + x12 + x13 + x14, data=FullData)
y <- as.double(as.matrix(FullData[, 1]))

# Fitting the model (Lasso: Alpha = 1)
set.seed(999)
cv.lasso <- cv.glmnet(x, y, family='binomial', alpha=1, parallel=TRUE, standardize=TRUE, type.measure='auc')

# Results
plot(cv.lasso)
plot(cv.lasso$glmnet.fit, xvar="lambda", label=TRUE)
cv.lasso$lambda.min
cv.lasso$lambda.1se
coef(cv.lasso, s=cv.lasso$lambda.min)
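
Finally, to see whether the lasso is simply shrinking most coefficients to zero, the nonzero entries at lambda.min can be counted (a minimal sketch using the cv.lasso object above):

b = as.matrix( coef(cv.lasso, s = cv.lasso$lambda.min) )
sum( b != 0 )           # number of nonzero coefficients (including the intercept)
rownames(b)[ b != 0 ]   # which terms survive the penalty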
    Can you please include data that will provide us with a [reproducible example](http://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example) ? My guess is that the lasso is discarding (i.e. shrinking to zero) all or most of your variables ... you should be able to check that easily ... (also, I would *strongly* recommend filling your matrices with `NA` rather than -9999 ...) – Ben Bolker Dec 07 '16 at 21:22
  • Added the data @BenBolker – Xi Ling Dec 07 '16 at 21:37
