16

I have a data frame with 2 columns and 3659 row df

I am trying to reduce the data set by averaging every 10 or 13 rows in this data frame, so I tried the following :

# number of rows per group
n=13
# number of groups
n_grp=nrow(df)/n
round(n_grp,0)
# row indices (one vector per group)
idx_grp <- split(seq(df), rep(seq(n_grp), each = n))

# calculate the col means for all groups
res <- lapply(idx_grp, function(i) {
  # subset of the data frame
  tmp <- dat[i]
  # calculate row means
  colMeans(tmp, na.rm = TRUE)
})
# transform list into a data frame
dat2 <- as.data.frame(res)

However, I can't divide my number of rows by 10 or 13 because data length is not a multiple of split variable. So I am not sure what should do then (I just want may be to calculate the mean of the last group -even with less than 10 elements)

I also tried this one, but the results are the same:

df1=split(df, sample(rep(1:301, 10)))
Ronak Shah
  • 377,200
  • 20
  • 156
  • 213
yuliaUU
  • 1,581
  • 2
  • 12
  • 33
  • Someone asked a similar question today. Combine `split` and `cut`. See if this helps http://stackoverflow.com/questions/30356275/r-divide-data-into-groups – Pierre L May 20 '15 at 20:16
  • the problem that i cant split by the number of groups i want. cause i have odd number of columns and i want to have 10 groups for instance – yuliaUU May 20 '15 at 20:23

4 Answers4

17

Here's a solution using aggregate() and rep().

df <- data.frame(a=1:12, b=13:24 );
df;
##     a  b
## 1   1 13
## 2   2 14
## 3   3 15
## 4   4 16
## 5   5 17
## 6   6 18
## 7   7 19
## 8   8 20
## 9   9 21
## 10 10 22
## 11 11 23
## 12 12 24
n <- 5;
aggregate(df, list(rep(1:(nrow(df) %/% n + 1), each = n, len = nrow(df))), mean)[-1];
##      a    b
## 1  3.0 15.0
## 2  8.0 20.0
## 3 11.5 23.5

The important part of this solution that handles the issue of non-divisibility of nrow(df) by n is specifying the len parameter (actually the full parameter name is length.out) of rep(), which automatically caps the group vector to the appropriate length.

Henrik
  • 65,555
  • 14
  • 143
  • 159
bgoldst
  • 34,190
  • 6
  • 38
  • 64
  • What does the [-1] do at the end? Remove the extra col added? This is weirdly not working for me, am using a for loop with a list of DFs. – Garglesoap Jul 09 '18 at 06:01
10

If df is a data.table, you can use %/% to group as in

library(data.table)
setDT(df)
n <- 13 # every 13 rows

df[, mean(z), by= (seq(nrow(df)) - 1) %/% n]

if instead you want every nTH row, use %% instead of %/%

df[, mean(z), by= (seq(nrow(df)) - 1) %% n]
Ricardo Saporta
  • 54,400
  • 17
  • 144
  • 178
  • I love the programmatic by in data.table! Just a note, `(seq(nrow(df))` can be replaced by `(1:.N)` for a more data.table way, and that would also work if the whole thing was wrapped in a larger grouping, eg `df[,by = my.group, .SD[, by = (1:.N - 1) %/% n, mean (z)] ]` – JVP Mar 09 '19 at 14:32
7

This should work. Use n = 13 for clumping 13 rows together. If you have 27 rows, you'll get groups of size 13, 13, 1.

n.colmeans = function(df, n = 10){
    aggregate(x = df,
              by = list(gl(ceiling(nrow(df)/n), n)[1:nrow(df)]),
              FUN = mean)
}

n.colmeans(state.x77, 10)

  Group.1 Population Income Illiteracy Life Exp Murder HS Grad Frost     Area
1       1     4892.8 4690.8       1.44   70.508   9.53   53.63  75.1 116163.6
2       2     3570.5 4419.4       1.12   71.110   7.07   53.35  99.8  44406.6
3       3     3697.9 4505.5       0.93   70.855   6.64   55.25 131.7  60873.0
4       4     5663.9 4331.2       1.33   70.752   7.12   49.59 103.6  56949.5
5       5     3407.0 4232.1       1.03   71.168   6.53   53.72 112.1  75286.7
Vlo
  • 3,168
  • 13
  • 27
2

dplyr way

n1 <- 10
iris %>% group_by(mean = (row_number() -1) %/% n1) %>%
  mutate(mean = mean(Sepal.Length))

# A tibble: 150 x 6
# Groups:   mean [15]
   Sepal.Length Sepal.Width Petal.Length Petal.Width Species  mean
          <dbl>       <dbl>        <dbl>       <dbl> <fct>   <dbl>
 1          5.1         3.5          1.4         0.2 setosa   4.86
 2          4.9         3            1.4         0.2 setosa   4.86
 3          4.7         3.2          1.3         0.2 setosa   4.86
 4          4.6         3.1          1.5         0.2 setosa   4.86
 5          5           3.6          1.4         0.2 setosa   4.86
 6          5.4         3.9          1.7         0.4 setosa   4.86
 7          4.6         3.4          1.4         0.3 setosa   4.86
 8          5           3.4          1.5         0.2 setosa   4.86
 9          4.4         2.9          1.4         0.2 setosa   4.86
10          4.9         3.1          1.5         0.1 setosa   4.86
# ... with 140 more rows

or if n1 is not a divisor or nrow(df) then also

n1 <- 7

# A tibble: 150 x 6
# Groups:   mean [21]
   Sepal.Length Sepal.Width Petal.Length Petal.Width Species  mean
          <dbl>       <dbl>        <dbl>       <dbl> <fct>   <dbl>
 1          5.1         3.5          1.4         0.2 setosa    4.9
 2          4.9         3            1.4         0.2 setosa    4.9
 3          4.7         3.2          1.3         0.2 setosa    4.9
 4          4.6         3.1          1.5         0.2 setosa    4.9
 5          5           3.6          1.4         0.2 setosa    4.9
 6          5.4         3.9          1.7         0.4 setosa    4.9
 7          4.6         3.4          1.4         0.3 setosa    4.9
 8          5           3.4          1.5         0.2 setosa    4.8
 9          4.4         2.9          1.4         0.2 setosa    4.8
10          4.9         3.1          1.5         0.1 setosa    4.8
# ... with 140 more rows

You can also mutate across a number of columns

mydf <- iris[-5]

mydf %>% group_by(n = (row_number() -1) %/% n1) %>%
  mutate(across(everything(), ~ mean(.), .names = "{.col}_mean"))

# A tibble: 150 x 9
# Groups:   n [22]
   Sepal.Length Sepal.Width Petal.Length Petal.Width     n Sepal.Length_me~ Sepal.Width_mean Petal.Length_me~
          <dbl>       <dbl>        <dbl>       <dbl> <dbl>            <dbl>            <dbl>            <dbl>
 1          5.1         3.5          1.4         0.2     0              4.9             3.39             1.44
 2          4.9         3            1.4         0.2     0              4.9             3.39             1.44
 3          4.7         3.2          1.3         0.2     0              4.9             3.39             1.44
 4          4.6         3.1          1.5         0.2     0              4.9             3.39             1.44
 5          5           3.6          1.4         0.2     0              4.9             3.39             1.44
 6          5.4         3.9          1.7         0.4     0              4.9             3.39             1.44
 7          4.6         3.4          1.4         0.3     0              4.9             3.39             1.44
 8          5           3.4          1.5         0.2     1              4.8             3.21             1.43
 9          4.4         2.9          1.4         0.2     1              4.8             3.21             1.43
10          4.9         3.1          1.5         0.1     1              4.8             3.21             1.43
# ... with 140 more rows, and 1 more variable: Petal.Width_mean <dbl>
AnilGoyal
  • 25,297
  • 4
  • 27
  • 45