0

I want to extract not centered term-wise linear predictors for a GLM model on test data that contains factors.

I already tried to write a function GLMlinearpredictors that succeeds to do this in two cases:

  • The data contains no factors

  • The data contains all possible levels of the existing factors (see example linearpredictors2 below)

The function GLMlinearpredictors doesn't work in the following case:

  • The data contains factors, but doesn't contain all possible levels of the existing factors (see example linearpredictors1 below)
library(magrittr)

# Generate example data about subscriptions
traindata = data.frame(
  y = c(2, 2, 2, 1, 1, 1, 1, 1, 1),
  subscriptionyear = c(1, 2, 3, 1, 2, 2, 3, 3, 3),
  subscriptiontype = c(
    "premium",
    "standard",
    "premium",
    "premium",
    "standard",
    "premium",
    "premium",
    "standard",
    "premium"
  )
)

# The train data should contain a factor
traindata$subscriptiontype_asfactor <- as.factor(traindata$subscriptiontype)


# Train the model
model = glm(
  y ~ ifelse(subscriptionyear == 1, 1, 0) + ifelse(subscriptionyear == 2, 1, 0) + subscriptiontype_asfactor,
  family = 'Gamma', # Works also for poisson
  data = traindata
)

# Reduce the model size (for saving models as small files)
reduce_model_size = function(cm) {
  cm$y = c()
  cm$model = cm$model[0,]
  cm$residuals = c()
  cm$fitted.values = c()
  cm$effects = c()
  cm$qr$qr = c()
  #cm$linear.predictors = c()
  cm$weights = c()
  cm$prior.weights = c()
  cm$data = c()
  cm$family$variance = c()
  cm$family$dev.resids = c()
  cm$family$aic = c()
  cm$family$validmu = c()
  cm$family$simulate = c()
  attr(cm$formula,".Environment") = c()
  
  cm
}

model<-reduce_model_size(model)

# Remove the train data
rm(traindata)






# Function that needs to be improved to be able extract not centered term-wise linear predictors for a GLM model on test data that contains factors
GLMlinearpredictors <- function(model, testdata){
  # Extract the linear predictors for a test data set
  transformations <- attr(model$terms, 'variables')
  coefficientvalues <- list()
  transformationnames <- as.list(transformations) %>%
    sapply(toString) %>%
    extract(-1) # Do not remove intercept
  responsevariablename <- with(attributes(terms(model)), as.character(variables[response+1]))
  testdata[responsevariablename] <- 0
  #contrastsfactors <- lapply(testdata[, sapply(testdata, is.factor), drop = FALSE],
  #                           contrasts, contrasts = FALSE)
  #testdatamodelframe <- model.frame(model,data=testdata)
  outofsamplemodelmatrix <- model.matrix(model,data=testdata) # Tried using outofsamplemodelframe and contrasts.arg = contrastsfactors
  beta <- model$coef
  linearpredictors <- t(beta * t(outofsamplemodelmatrix))
  linearpredictors
}

# Working example with data frame that contains multiple observations/rows
testdata2 = data.frame(subscriptionyear = c(1, 1), subscriptiontype = c("premium","standard"))
testdata2$subscriptiontype_asfactor <- as.factor(testdata2$subscriptiontype)
testdata2$prediction <- exp(predict.glm(model, newdata = testdata2))
linearpredictors2 <- GLMlinearpredictors(model, testdata2)



# Not working example with data frame that contains one observation/one row
testdata1 = data.frame(subscriptionyear = c(1), subscriptiontype = c("premium"))
testdata1$subscriptiontype_asfactor <- factor(testdata1$subscriptiontype, levels = c("premium", "standard"))
testdata1$prediction <- exp(predict.glm(model, newdata = testdata1))
linearpredictors1 <- GLMlinearpredictors(model, testdata1)

The correct output of linearpredictors2 is:

> head(linearpredictors2)
  (Intercept) ifelse(subscriptionyear == 1, 1, 0) ifelse(subscriptionyear == 2, 1, 0) subscriptiontype_asfactorstandard
1   0.8091098                          -0.1424431                                   0                        0.00000000
2   0.8091098                          -0.1424431                                   0                       -0.03524849
> dput(linearpredictors2)
structure(c(0.809109769979335, 0.809109769979335, -0.142443103312668, 
-0.142443103312668, 0, 0, 0, -0.0352484912267524), dim = c(2L, 
4L), dimnames = list(c("1", "2"), c("(Intercept)", "ifelse(subscriptionyear == 1, 1, 0)", 
"ifelse(subscriptionyear == 2, 1, 0)", "subscriptiontype_asfactorstandard"
)), assign = 0:3, contrasts = list(subscriptiontype_asfactor = "contr.treatment"))

The expected output of linearpredictors1 is the same as the first row of linearpredictors2, but currently the function GLMlinearpredictors does not work for that case. Instead I get the following error:

Error in `contrasts<-`(`*tmp*`, value = contr.funs[1 + isOF[nn]]) : 
contrasts can be applied only to factors with 2 or more levels

The following questions seem related, but don't seem to solve my question:

  1. Make a model matrix if missing the response variable and where matrix multiplication recreates the predict function
  2. Is there a way to produce predict.gam(..., type="terms") values that are NOT centered
  3. What does predict.glm(, type="terms") actually do?
  4. How to debug "contrasts can be applied only to factors with 2 or more levels" error?
  5. R: apply model coefficients across variables in model, a better way?
  6. How to score uncenterized term-wise prediction using predict.glm()

0 Answers0