Multiple linear models on each column of dataframe

Question

I have a generic csv file with 8 columns (y,x1,...,x7), with y representing a response variable and the x's representing potential predictors, each with 100 observations.

My goals are to use R to...

1 - create 7 scatterplots of y vs each one of the x's.

2 - create a linear model of y with each of the x's.

Currently I just have it all typed out repetitively...

# One scatterplot of y against each predictor x1..x7, each titled "y vs <predictor>".
p9 <- ggplot(generic, aes(x1,y)) + geom_point() + labs(title = "y vs x1")
p10 <- ggplot(generic, aes(x2,y)) + geom_point() + labs(title = "y vs x2")
p11 <- ggplot(generic, aes(x3,y)) + geom_point() + labs(title = "y vs x3")
p12 <- ggplot(generic, aes(x4,y)) + geom_point() + labs(title = "y vs x4")
p13 <- ggplot(generic, aes(x5,y)) + geom_point() + labs(title = "y vs x5")
p14 <- ggplot(generic, aes(x6,y)) + geom_point() + labs(title = "y vs x6")
p15 <- ggplot(generic, aes(x7,y)) + geom_point() + labs(title = "y vs x7")
# Arrange the seven plots in a grid with three columns.
# NOTE(review): grid.arrange() is presumably gridExtra::grid.arrange(); the library() call is not shown.
grid.arrange(p9,p10,p11,p12,p13,p14,p15,ncol = 3)

and

# Fit one simple linear regression of y on each predictor x1..x7.
x1mod <- lm(generic$y~generic$x1)
x2mod <- lm(generic$y~generic$x2)
x3mod <- lm(generic$y~generic$x3)
x4mod <- lm(generic$y~generic$x4)
x5mod <- lm(generic$y~generic$x5)
x6mod <- lm(generic$y~generic$x6)
x7mod <- lm(generic$y~generic$x7)
# Print the summary of each fitted model in turn.
summary(x1mod)
summary(x2mod)
summary(x3mod)
summary(x4mod)
summary(x5mod)
summary(x6mod)
summary(x7mod)

I would like to make this less repetitive. I attempted to accomplish this with a for loop but it turned into a bit of a mess. I also read up on mapping functions to data with purrr but I couldn't quite figure out how to make it work for my situation as I am not trying to divide the data by any factor. I am relatively new to R so I apologize if my issue is laughably simple.

`pairs` will plot all possible pairs of columns, e.g. `pairs(anscombe)` using that builtin data set. — G. Grothendieck, Dec 01 '20 at 15:15

score 0 · Answer 1 · answered Dec 01 '20 at 14:15

You can use apply() (or lapply()) at column level to build the models, and ggplot2 combined with some tidyverse functions to produce the plots. Here is some code using dummy data:

library(ggplot2)
library(tidyverse)
# Randomness: fix the seed so the dummy data is reproducible
set.seed(123)
# Data: a Poisson response y and seven normal predictors x1..x7 (15 rows)
df <- data.frame(
  y = rpois(15, 0.8),
  x1 = rnorm(15, 0, 1),
  x2 = rnorm(15, 1, 1),
  x3 = rnorm(15, 3, 1),
  x4 = rnorm(15, 4, 1),
  x5 = rnorm(15, 5, 1),
  x6 = rnorm(15, 6, 1),
  x7 = rnorm(15, 7, 1)
)
# Models: one simple regression of y on each predictor column.
# lapply() walks over the columns of the data frame directly, whereas apply()
# would first coerce the data frame to a matrix. The result is a list of lm
# fits named after the predictor columns.
Lmods <- lapply(df[-1], function(x) lm(y ~ x, data = df))
# Summary of every fitted model
lapply(Lmods, summary)
# Plot: reshape to long format so that each predictor gets its own facet,
# labelled "y vs. <predictor>"; the colour legend is hidden.
df %>%
  pivot_longer(cols = -y) %>%
  mutate(name = paste0("y vs. ", name)) %>%
  ggplot(aes(x = value, y = y, color = name)) +
  geom_point() +
  facet_wrap(~name, scales = "free") +
  theme(legend.position = "none")

Output for plots:

score 0 · Answer 2 · answered Dec 01 '20 at 14:21

Here is an example of how to create the linear models.

Start by defining which column is your y and which are your x's.

# library() stops with an error if the package is missing; require() would
# only return FALSE and let the script fail later.
library(tidyverse)

# Name of the response variable
y <- "mpg"

# Candidate predictors: every column of mtcars other than the response
cols <- setdiff(names(mtcars), y)

Here you define a function that receives as input a string naming an x, the name of your y, and your dataset.

# Fit a simple linear model of `y` on a single predictor.
#
# col:     name of the predictor column (string)
# y:       name of the response column (string)
# dataset: data frame containing both columns
#
# Returns the fitted lm object for the formula `y ~ col`.
make_lm <- function(col, y, dataset) {
  # Build an explicit formula object instead of handing lm() a bare string,
  # and avoid naming the local `formula`, which would shadow stats::formula().
  model_formula <- as.formula(paste(y, "~", col))
  lm(model_formula, data = dataset)
}

# Fit one model per predictor; the result is a list in the same order as cols
l_model <- map(cols, function(col) make_lm(col, y, mtcars))

# Summary of each fitted model
map(l_model, summary)

The output will be a list with a model for each col in cols.

I kindly suggest using tidyverse for this kind of stuff.

score 0 · Answer 3 · answered Dec 01 '20 at 14:33

Given Data:

# Simulate 100 observations of eight standard-normal columns: y and x1..x7
set.seed(1)
df <- setNames(
  as.data.frame(replicate(8, rnorm(100))),
  c("y", paste0("x", 1:7))
)

You can do everything pretty neatly inside a tidyverse workflow, which is pretty cool.

# libraries
library(dplyr)
library(tidyr)
library(ggplot2)

# One row per predictor, holding its scatterplot and its lm summary as
# list columns.
df %>%
  tibble::rowid_to_column() %>%
  pivot_longer(cols = -c(rowid, y)) %>%
  nest_by(name) %>%
  summarise(
    plot = list(
      ggplot(data, aes(x = value, y = y)) +
        geom_point() +
        labs(title = paste("y vs", name))
    ),
    lm = list(summary(lm(y ~ value, data = data)))
  )

#> `summarise()` regrouping output by 'name' (override with `.groups` argument)
#> # A tibble: 7 x 3
#> # Groups:   name [7]
#>   name  plot   lm        
#>   <chr> <list> <list>    
#> 1 x1    <gg>   <smmry.lm>
#> 2 x2    <gg>   <smmry.lm>
#> 3 x3    <gg>   <smmry.lm>
#> 4 x4    <gg>   <smmry.lm>
#> 5 x5    <gg>   <smmry.lm>
#> 6 x6    <gg>   <smmry.lm>
#> 7 x7    <gg>   <smmry.lm>

Now you have plots and summaries all together in a dataframe and you can handle them the way you want.

Or a more traditional solution:

# libraries
library(dplyr)
library(tidyr)
library(ggplot2)

# Long format: one row per (observation, predictor) pair
df_lng <- pivot_longer(
  tibble::rowid_to_column(df),
  cols = -c(rowid, y)
)

# Coefficient table of y ~ value for each predictor (two rows per predictor).
# NOTE(review): newer dplyr releases recommend reframe() for multi-row results;
# summarise() is kept here so the printed output below still matches.
df_lng %>%
  nest_by(name) %>%
  summarise(broom::tidy(lm(y ~ value, data = data)))

#> `summarise()` regrouping output by 'name' (override with `.groups` argument)
#> # A tibble: 14 x 6
#> # Groups:   name [7]
#>    name  term         estimate std.error statistic p.value
#>    <chr> <chr>           <dbl>     <dbl>     <dbl>   <dbl>
#>  1 x1    (Intercept)  0.109       0.0903   1.20      0.231
#>  2 x1    value       -0.000932    0.0947  -0.00984   0.992
#>  3 x2    (Intercept)  0.108       0.0903   1.20      0.233
#>  4 x2    value        0.0160      0.0877   0.182     0.856
#>  5 x3    (Intercept)  0.111       0.0903   1.23      0.221
#>  6 x3    value       -0.0457      0.0914  -0.500     0.618
#>  7 x4    (Intercept)  0.113       0.0893   1.27      0.208
#>  8 x4    value        0.113       0.0767   1.47      0.144
#>  9 x5    (Intercept)  0.104       0.0898   1.16      0.248
#> 10 x5    value       -0.102       0.0933  -1.10      0.276
#> 11 x6    (Intercept)  0.123       0.0915   1.35      0.181
#> 12 x6    value        0.0717      0.0836   0.857     0.393
#> 13 x7    (Intercept)  0.109       0.0902   1.21      0.230
#> 14 x7    value        0.0293      0.0826   0.355     0.723


# Scatterplots of y against each predictor, one facet per predictor
ggplot(data = df_lng, mapping = aes(x = value, y = y, colour = name)) +
  geom_point() +
  labs(title = "y vs x") +
  facet_wrap(vars(name))

score 0 · Answer 4 · answered Dec 02 '20 at 13:22

reformulate() is an essential function, included in base R (the stats package), that you should know. You can use it to easily create a formula for model fitting, which is especially useful when fitting several models with lapply:

## x variable names: every column except the response, wherever it sits
xes <- setdiff(names(dat), "y")

## fit all possible bivariate models and store them in a list named by predictor
res <- setNames(lapply(xes, function(x) {
  fo <- reformulate(x, response = "y")  ## dynamically create formula y ~ <x>
  ## do.call() with quote(dat) records a readable call in the fitted model,
  ## e.g. lm(formula = y ~ X1, data = dat)
  do.call("lm", list(fo, quote(dat)))
}), xes)

Credits to @G.Grothendieck for this neat do.call approach, which assigns clean call attributes to the result!

## summary of the model fitted for predictor X1
summary(res[["X1"]])
# Call:
# lm(formula = y ~ X1, data = dat)
# 
# Residuals:
#      Min       1Q   Median       3Q      Max 
# -1.52761 -0.43187 -0.06834  0.49291  1.55059 
# 
# Coefficients:
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)   2.8910     0.1321  21.889  < 2e-16 ***
# X1            1.0301     0.2185   4.714 8.04e-06 ***
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# 
# Residual standard error: 0.6565 on 98 degrees of freedom
# Multiple R-squared:  0.1848,  Adjusted R-squared:  0.1765 
# F-statistic: 22.22 on 1 and 98 DF,  p-value: 8.042e-06

You may access the formulas stored in the model calls and directly use them for plotting.

op <- par(mfrow = c(2, 4))  ## 2 x 4 panel grid; previous settings are kept in op
## invisible() stops the top-level lapply() from printing its list of NULLs;
## the calls are made only for their plotting side effects
invisible(lapply(xes, function(x) {
  plot(res[[x]]$call$formula, dat, main = paste("y vs", x))
  abline(res[[x]])  ## optional for regression line
}))
par(op)  ## restores pars

Note: If the formulas get more complicated, we can use as.formula() instead (e.g. for models with random effects or instrumental variables that include another term after a |).

## same models as before, but with the formula built from a string via as.formula()
res2 <- setNames(lapply(xes, function(x) {
  fo <- as.formula(paste("y ~ ", x))
  do.call("lm", list(fo, quote(dat)))
}), xes)

## check against the reformulate() results; that list is called `res`
## (`res1` is never defined)
stopifnot(all.equal(res, res2))

Data:

## simulated data: n uniform predictors X1..X7 with m rows, and their sum as response y
m <- 100
n <- 7
set.seed(42)
dat <- data.frame(matrix(runif(m * n), nrow = m, ncol = n))
dat <- transform(dat, y = X1 + X2 + X3 + X4 + X5 + X6 + X7)

Multiple linear models on each column of dataframe

4 Answers4