2

I've received a great answer of how to substitute a formula. I need it for matching a list of data.frames.

When matching with MatchIt::matchit(), first, I have to save the result as a matchit.full / matchit class. Second, with match.data() data.frames w/ only matched observations are to be created.

The issue appears in the second step which is working fine when I use a formula as usual. For substitution now it seems that match.data() needs to recognize the formula somehow but it won't.

Consider this as an example (the warnings can be ignored):

# example list
library(car)
WeightLoss1 <- WeightLoss
WeightLoss1$group <- as.integer(ifelse(WeightLoss1$group == "Control", 0, 1))

WL = list(WeightLoss1, WeightLoss1, WeightLoss1)  # doesn't make much sense, but suffices for example

# substitute formula
wl.cov <- c("wl1", "se1")
WL.FM <- reformulate(wl.cov, response = "group")

# matching w/o substitution
m.match.0 <- lapply(1:length(WL), function(mark) {
  require(MatchIt)
  matchit(group ~ wl1 + se1, data = WL[[mark]])
})  

# matching w/ substitution
m.match.1 <- lapply(1:length(WL), function(mark) {
  require(MatchIt)
  matchit(WL.FM, data = WL[[mark]])
})

# now compare both attempts to create list of data.frames
# w/o
match <- lapply(1:length(m.match.0), function(i){
  require(MatchIt)
  match.data(m.match.0[[i]])
})

# w/
match <- lapply(1:length(m.match.1), function(i){
  require(MatchIt)
  match.data(m.match.1[[i]])
})

As can be seen attempt w/o substitution is working fine, attempt w/ substitution produces an error Error in eval(object$call$data, envir = env) : object 'mark' not found.

How could this be patched?

--

Notes:

> match.data
function (object, group = "all", distance = "distance", weights = "weights", 
    subclass = "subclass") 
{
    if (!is.null(object$model)) {
        env <- attributes(terms(object$model))$.Environment
    }
    else {
        env <- parent.frame()
    }
    data <- eval(object$call$data, envir = env)
    treat <- object$treat
    wt <- object$weights
    vars <- names(data)
    if (distance %in% vars) 
        stop("invalid input for distance. choose a different name.")
    else if (!is.null(object$distance)) {
        dta <- data.frame(cbind(data, object$distance))
        names(dta) <- c(names(data), distance)
        data <- dta
    }
    if (weights %in% vars) 
        stop("invalid input for weights. choose a different name.")
    else if (!is.null(object$weights)) {
        dta <- data.frame(cbind(data, object$weights))
        names(dta) <- c(names(data), weights)
        data <- dta
    }
    if (subclass %in% vars) 
        stop("invalid input for subclass. choose a different name.")
    else if (!is.null(object$subclass)) {
        dta <- data.frame(cbind(data, object$subclass))
        names(dta) <- c(names(data), subclass)
        data <- dta
    }
    if (group == "all") 
        return(data[wt > 0, ])
    else if (group == "treat") 
        return(data[wt > 0 & treat == 1, ])
    else if (group == "control") 
        return(data[wt > 0 & treat == 0, ])
    else stop("error: invalid input for group.")
}
<bytecode: 0x00000000866125e0>
<environment: namespace:MatchIt>

 

> matchit
function (formula, data, method = "nearest", distance = "logit", 
    distance.options = list(), discard = "none", reestimate = FALSE, 
    ...) 
{
    mcall <- match.call()
    if (is.null(data)) 
        stop("Dataframe must be specified", call. = FALSE)
    if (!is.data.frame(data)) {
        stop("Data must be a dataframe", call. = FALSE)
    }
    if (sum(is.na(data)) > 0) 
        stop("Missing values exist in the data")
    ischar <- rep(0, dim(data)[2])
    for (i in 1:dim(data)[2]) if (is.character(data[, i])) 
        data[, i] <- as.factor(data[, i])
    if (!is.numeric(distance)) {
        fn1 <- paste("distance2", distance, sep = "")
        if (!exists(fn1)) 
            stop(distance, "not supported.")
    }
    if (is.numeric(distance)) {
        fn1 <- "distance2user"
    }
    fn2 <- paste("matchit2", method, sep = "")
    if (!exists(fn2)) 
        stop(method, "not supported.")
    tryerror <- try(model.frame(formula), TRUE)
    if (distance %in% c("GAMlogit", "GAMprobit", "GAMcloglog", 
        "GAMlog", "GAMcauchit")) {
        requireNamespace("mgcv")
        tt <- terms(mgcv::interpret.gam(formula)$fake.formula)
    }
    else {
        tt <- terms(formula)
    }
    attr(tt, "intercept") <- 0
    mf <- model.frame(tt, data)
    treat <- model.response(mf)
    X <- model.matrix(tt, data = mf)
    if (method == "exact") {
        distance <- out1 <- discarded <- NULL
        if (!is.null(distance)) 
            warning("distance is set to `NULL' when exact matching is used.")
    }
    else if (is.numeric(distance)) {
        out1 <- NULL
        discarded <- discard(treat, distance, discard, X)
    }
    else {
        if (is.null(distance.options$formula)) 
            distance.options$formula <- formula
        if (is.null(distance.options$data)) 
            distance.options$data <- data
        out1 <- do.call(fn1, distance.options)
        discarded <- discard(treat, out1$distance, discard, X)
        if (reestimate) {
            distance.options$data <- data[!discarded, ]
            distance.options$weights <- distance.options$weights[!discarded]
            tmp <- out1
            out1 <- do.call(fn1, distance.options)
            tmp$distance[!discarded] <- out1$distance
            out1$distance <- tmp$distance
        }
        distance <- out1$distance
    }
    if (fn1 == "distance2mahalanobis") {
        is.full.mahalanobis <- TRUE
    }
    else {
        is.full.mahalanobis <- FALSE
    }
    out2 <- do.call(fn2, list(treat, X, data, distance = distance, 
        discarded, is.full.mahalanobis = is.full.mahalanobis, 
        ...))
    if (fn1 == "distance2mahalanobis") {
        distance[1:length(distance)] <- NA
        class(out2) <- c("matchit.mahalanobis", "matchit")
    }
    out2$call <- mcall
    out2$model <- out1$model
    out2$formula <- formula
    out2$treat <- treat
    if (is.null(out2$X)) {
        out2$X <- X
    }
    out2$distance <- distance
    out2$discarded <- discarded
    nn <- matrix(0, ncol = 2, nrow = 4)
    nn[1, ] <- c(sum(out2$treat == 0), sum(out2$treat == 1))
    nn[2, ] <- c(sum(out2$treat == 0 & out2$weights > 0), sum(out2$treat == 
        1 & out2$weights > 0))
    nn[3, ] <- c(sum(out2$treat == 0 & out2$weights == 0 & out2$discarded == 
        0), sum(out2$treat == 1 & out2$weights == 0 & out2$discarded == 
        0))
    nn[4, ] <- c(sum(out2$treat == 0 & out2$weights == 0 & out2$discarded == 
        1), sum(out2$treat == 1 & out2$weights == 0 & out2$discarded == 
        1))
    dimnames(nn) <- list(c("All", "Matched", "Unmatched", "Discarded"), 
        c("Control", "Treated"))
    out2$nn <- nn
    return(out2)
}
<bytecode: 0x0000000086d6e158>
<environment: namespace:MatchIt>
jay.sf
  • 60,139
  • 8
  • 53
  • 110

1 Answers1

3

Firstly note that it is not the subsittution that is the key difference between the two scenarios but the fact that in the non-substitution case the code defines the formula in function that calls matchit whereas in the substitution case it defines the formula outside that function. In both cases it would fail if the formula were defined outside the function and in both case it would work if the formula were defined inside the function.

The problem is that because the formula was defined outside the function the environment of the formula in the example is the global environment

environment(WL.FM)
## <environment: R_GlobalEnv>

whereas we want it to be the local environment in the anonymous function where it is used.

1) Try this:

m.match.1 <- lapply(WL, function(x) {
     WL.FM <- reformulate(wl.cov, response = "group")
     matchit(WL.FM, data = x)
})
match <- lapply(m.match.1, match.data)

2) or if you don't want to define the formula in the function try this alternative:

WL.FM <- reformulate(wl.cov, response = "group")
m.match.1 <- lapply(WL, function(x) {
     environment(WL.FM) <- environment()
     matchit(WL.FM, data = x)
})
match <- lapply(m.match.1, match.data)

2a) Another way to reset the environment is to convert the formula to character and then back to formula:

WL.FM <- reformulate(wl.cov, response = "group")
m.match.1 <- lapply(WL, function(x) {
     WL.FM <- formula(format(WL.FM))
     matchit(WL.FM, data = x)
})
match <- lapply(m.match.1, match.data)

3) Yet another appraoch is to define the WL.FM as a character string rather than as a formula object. Then it has no environment. Convert it to a formula in the anonymous function in which case its enviornment will default there:

WL.FM <- format(reformulate(wl.cov, response = "group")) # character   
m.match.1 <- lapply(WL, function(x) matchit(formula(WL.FM), data = x))
match <- lapply(m.match.1, match.data)

Note: Although not related to the key problem, from a style point of view, in the above we have removed the require statements. Use a single library statement at the top of your code and do not use require unless it is within an if statement. -- if (require(...)) ... You want the code to fail at the earliest possible point if the package to load is not available.

Also we have changed the lapply code to iterate over WL and m.match.1 rather than iterate over a subscript in each case.

G. Grothendieck
  • 254,981
  • 17
  • 203
  • 341