-1

This is the top of my dataset:

state start_date   end_date    created_at cycle party answer candidate_name  pct survey_length
1      Florida 2020-11-02 2020-11-02 6/14/21 15:36  2020   REP  Trump   Donald Trump 48.0        0 days
2         Iowa 2020-11-01 2020-11-02 11/2/20 09:02  2020   REP  Trump   Donald Trump 48.0        1 days
3 Pennsylvania 2020-11-01 2020-11-02 11/2/20 12:49  2020   REP  Trump   Donald Trump 49.2        1 days
4      Florida 2020-11-01 2020-11-02 11/2/20 19:02  2020   REP  Trump   Donald Trump 48.2        1 days
5      Florida 2020-10-31 2020-11-02 11/4/20 09:17  2020   REP  Trump   Donald Trump 49.4        2 days
6       Nevada 2020-10-31 2020-11-02 11/4/20 10:38  2020   REP  Trump   Donald Trump 49.1        2 days

I want to take the average of the 'pct' column, for each month, for each state.

I can filter the data individually and use aggregate() like this:

Alabama <- filter(prep2020, prep2020$state == 'Alabama')
  Alabama$end_date <- format(Alabama$end_date, '%m')
  AL <- aggregate(Alabama$pct, by=list(Alabama$end_date), mean)

I think the best way would be to write a function that does this for all states, and then use the function in lapply() but I can't seem to figure out how to do that. Any suggestions?

easyier
  • 63
  • 2

2 Answers2

1

The aggregate() function can do this automatically. To illustrate, I use the mtcars dataset and take the mean of mpg for each combination of the am and cyl variables:

head(mtcars)
    ##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
    ## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
    ## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
    ## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
    ## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
    ## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
    ## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1

aggregate(mpg ~ am + cyl, data = mtcars, FUN = mean)
    ##   am cyl      mpg
    ## 1  0   4 22.90000
    ## 2  1   4 28.07500
    ## 3  0   6 19.12500
    ## 4  1   6 20.56667
    ## 5  0   8 15.05000
    ## 6  1   8 15.40000

data.table solution

library(data.table)
dat = data.table(mtcars)
dat[, .(mpg = mean(mpg)), by = c("am", "cyl")]
    ##    am cyl      mpg
    ## 1:  1   6 20.56667
    ## 2:  1   4 28.07500
    ## 3:  0   6 19.12500
    ## 4:  0   8 15.05000
    ## 5:  0   4 22.90000
    ## 6:  1   8 15.40000

dplyr solution

library(dplyr)
mtcars |>
    group_by(am, cyl) |>
    summarize(mpg = mean(mpg))

    ## # A tibble: 6 × 3
    ## # Groups:   am [2]
    ##      am   cyl   mpg
    ##   <dbl> <dbl> <dbl>
    ## 1     0     4  22.9
    ## 2     0     6  19.1
    ## 3     0     8  15.0
    ## 4     1     4  28.1
    ## 5     1     6  20.6
    ## 6     1     8  15.4
Vincent
  • 15,809
  • 7
  • 37
  • 39
1

You can do this:

library(data.table)

setDT(prep2020)[, .(mean_pct = mean(pct, na.rm=T)), by=.(state, month=month(end_date))]

However, if you want to write a function using the approach you started with, you can do something like this

f = function(s) {
  sdf <- filter(prep2020, prep2020$state == s)
  sdf$end_date <- format(sdf$end_date, '%m')
  aggregate(sdf$pct, by=list(sdf$end_date), mean)
}

lapply(unique(prep2020$state), f)
langtang
  • 22,248
  • 1
  • 12
  • 27