I am new to template model builder (TMB), and I am trying to use it to fit a simple linear regression with errors in variables as a learning exercise. My ultimate goal is to apply this to more complex models. I am using the formulation:

Y_i = b0 + b1 * x_i + epsilon_i,    epsilon_i ~ N(0, sigma_Y^2)
X_i = x_i + delta_i,                delta_i  ~ N(0, sigma_X^2)

where x_i is the true (unobserved) predictor value and sigma_X = lambda * sigma_Y, with the ratio lambda assumed known,

and the following code to simulate data and run the analysis:

#
# The following code examines the error in variables case of simple linear
# regression using maximum likelihood.
#
#-------------- Prepare Workspace: clear and load packages --------------------
#
rm(list=ls())
library(MASS)         # for mvrnorm()
library(TMB)          # for Template Model Builder (TMB)
library(tidyverse)

#-------------------- Define model parameters ---------------------------------
#
model_name <- "eiv_regression"
set.seed(42)       # random number seed to fix the sequence for replication
b0 <- 10.0         # regression intercept
b1 <- 2.0          # regression slope
sigma_Y <- 5.0     # observation error on Y
lambda <- 1        # observation error ratio: sigma_X/sigma_Y
                   #   sigma_X = lambda * sigma_Y

N <- 1000     # number of observations (use lots to test)

#-------------------- Create Simulated Data Set -------------------------------
#
# simulate observation errors in X and Y
epsilon <- rnorm(N,0,sigma_Y)          # errors in Y
delta  <- rnorm(N,0,(lambda*sigma_Y))  # errors in X
# Simulate observed values of predictor variable
X_true <- runif(N,20,80)
X_obs <- X_true + delta
# Simulate observations as "true" value + Y error
Y_true <- b0 + b1*X_true
Y_obs <- Y_true + epsilon
# Check that the simulated data look as expected
plot(X_obs,Y_obs,pch=19,col="grey")
# Fit the naive OLS line (ignoring errors in variables)
lm_model <- lm(Y_obs~X_obs)
abline(b0,b1,lwd=3)                                        # true line (solid)
abline(coef(lm_model)[1],coef(lm_model)[2], lwd=3, lty=2)  # naive OLS fit (dashed)
# Print "naive" parameter estimates
estTable <- as.data.frame(summary(lm_model)$coefficients)
rownames(estTable)[2] <- "X"
estTable$True <- c(b0,b1)
estTable <- estTable %>% 
  select(True,Estimate,`Std. Error`)
estTable
# autocorrelation in residuals?
acf(resid(lm_model))

#------------- Define Template Model as .CPP file -----------------------------

cat("
/*---------------------- eiv_regression.cpp ---------------------------------*/
#include <TMB.hpp>
using Eigen::SparseMatrix;
using namespace density;

template<class Type>
Type objective_function<Type>::operator() ()
{
    // Data
    // Note: Transformations of raw data are performed in R
    DATA_VECTOR(Y);      // Response
    DATA_VECTOR(X);      // Predictor
    DATA_SCALAR(lambda); // ratio sigma_X/sigma_Y, assumed known

    // Bookkeeping
    size_t N = Y.size();       // number of observations
                               // NOTE: use size_t for array indices

    // Parameters
    PARAMETER_VECTOR(beta);    // coefficients (intercept, slope)
    PARAMETER(log_sigma_Y);    // log sd for measurement errors of Y
                               //   logged to force positive values
    PARAMETER_VECTOR(del);     // vector of random observation errors for 
                               //   predictor sigma_X

    // Transform parameters to desired values for use and reporting
    Type sigma_Y = exp(log_sigma_Y);  // inverse log
    Type sigma_X = sigma_Y * lambda;

    // Initialize negative log likelihood (to minimize as optimization goal)
    Type nll = 0.;
    
    // X error: likelihood contribution of observation error in X
    nll -= sum(dnorm(del, Type(0), sigma_X, true));
    
    // PROCESS: predictions calculated as model (with predictor error)
    // Note: TMB array indices start at 0, R arrays start at 1
    vector<Type> Y_hat(N);
    for (size_t n=0; n<N; n++){                   // loop through all obs
      Y_hat(n) = beta(0) + beta(1)*(X(n) - del(n));
    }            
    
    // OBSERVATION: likelihood contribution of observed data calculated as
    //   predictions + observation errors. e_i ~ N(0,sigma_Y)
    for (size_t n=0; n<N; n++){                     // loop through all obs
      nll -= dnorm(Y(n), Y_hat(n), sigma_Y, true);
    }
    
    // Parameters to report to R, including Std. Error estimates based
    // on the Hessian and Delta Method
    ADREPORT(beta);        // regression coefficients
    ADREPORT(sigma_Y);     // sd estimate Y
    ADREPORT(sigma_X);     // sd estimate X
    ADREPORT(Y_hat);       // predicted values and their st.dev.
    
    return nll;            // return the negative log likelihood for the
                           // specific set of parameter values.
}
/*---------------------------------------------------------------------------*/
",file="eiv_regression.cpp")

#------------- Estimate model parameters via TMB ------------------------------
#
# Remove previous object file and DLL if present
if (file.exists(paste0(model_name, ".o"))) {
  file.remove(paste0(model_name,".o"))
}
if (file.exists(paste0(model_name, ".dll"))) {
  file.remove(paste0(model_name,".dll"))
}
#
# Compile Model
compile(paste0(model_name, ".cpp"))
#
# Load compiled model
dyn.load(dynlib(model_name))
#
# Define data list
Data <- list(Y=as.vector(Y_obs),       # Observed Response
             X=as.vector(X_obs),       # Observed Predictor
             lambda=lambda)            # sigma_X/sigma_Y assumption

# Define parameter list with initial values
Params <- list(beta=c(0,0),        # Regression coefficients
               log_sigma_Y=0,      # Y Observation Error (log)
               del=rep(0,N))       # errors in X

# Construct TMB function for optimization
Obj <- MakeADFun(data=Data, 
                 parameters=Params, 
                 DLL=model_name,
                 silent=TRUE)
Obj$env$tracemgc <- FALSE
Obj$env$inner.control$trace <- FALSE

# Perform Optimization
Opt <- nlminb(start=Obj$par, objective=Obj$fn, gradient=Obj$gr,
              control=list(iter.max=1000, eval.max=500))
if(Opt$convergence != 0){
  print("Model had partial or non_convergence!")
}
# 
# Likelihood profiles to assess estimation performance
par(mfrow=c(3,1))
nEstPar <- 3
for (i in 1:nEstPar) {
  plot(tmbprofile(Obj,i, trace=0))
}
#
# Produce Report
Report <- sdreport(Obj)
ReportTable <- as.data.frame(summary(Report,"report")[1:4,])
rownames(ReportTable)[1:2] <- c("intercept","slope")
# Add CIs
ReportTable$CI95low <- ReportTable[,1] - ReportTable[,2]*1.96
ReportTable$CI95high <- ReportTable[,1] + ReportTable[,2]*1.96
# Add true values
ReportTable$True <- c(b0,b1,sigma_Y,lambda*sigma_Y)
# Add OLS values
OLS <- summary(lm(Y_obs~X_obs))
ReportTable$OLS <- c(OLS$coefficients[1:2,1],OLS$sigma,NA)
options(digits=3)
ReportTable
#
# Housekeeping: Unload model *.dll when done
dyn.unload(dynlib(model_name))

The "true"slope is 2. When I run this code I get an improvement fro the OLS slope estimate of 1.81, but it is still less than 2 (C.I. 95% = 1.92 to 1.98). I am not sure why this bias remains, and thought I would try to ask the community. I have placed my question in Stack Overflow as I suspect it is a coding error. Any suggestions are appreciated.
