Take mean of certain values based on groups in R

Question

I have a dataset contaning return values of cryptocurrency ret.daily, small part of it looks like this

            Bitcoin     Ethereum    XRP      Bitcoin.Cash    Bitcoin.SV    ...
2018-01-01  -0.04       0.02        0.04     -0.04           NA
2018-01-02  0.09        0.13        0.04     0.11            NA
2018-01-03  0.01        0.08        0.23     -0.04           NA
   ...

I have then given each coin into one of 5 groups for each day, based on ceratin values price.groups.daily (these are just the biggest coins i included, the are many other coins, so there are coins in each of the 5 groups)

            Bitcoin     Ethereum    XRP      Bitcoin.Cash    Bitcoin.SV    ...
2018-01-01  5           5           4        5               NA
2018-01-02  5           5           4        5               NA
2018-01-03  5           5           4        5               NA
   ...

What I then want to do is to take the mean of each group for each day, and make a new matrix, looking like this

            1                         2                       3    4    5                        
2018-01-01  Mean(groups 1 numbers)    Mean(groups 2 numbers)  ...  ...  mean(-0.04, 0.02,-0.04,...)
2018-01-02  Mean(groups 1 numbers)    Mean(groups 2 numbers)  
2018-01-03  Mean(groups 1 numbers)    Mean(groups 2 numbers)  
   ...

When i made the grouping, I did the following (where price.daily is daily price data, which is what i used to sort the data into groups)

col.daily <- seq(1,length(price.daily$Bitcoin))
quantile.daily = sapply(col.daily, function(y) {quantile(x = unlist(price.daily[y,] ), seq(0,1, length=6),na.rm = TRUE )})
quantile.daily.t = t(quantile.daily)
rownames(quantile.daily.t) = rownames(price.daily)

combined.daily = cbind(price.daily, quantile.daily.t)
price.groups.daily = as.data.frame(t(apply(combined.daily, 1, function(x) findInterval(x[1:ncol(price.daily)], x[(1 + ncol(price.daily)):ncol(combined.daily)]))))
colnames(price.groups.daily) = colnames(price.daily)
price.groups.daily[price.groups.daily == 6] = 5

I added the last line like that, since i didnt know how to get around if the biggest values was equal to the end interval in the last group, but this works just fine. I imagine this could also be done using some apply function, i am just not certain how, since before i could use function such as Quantile, and findInterval which did exactly what i wanted to do. Not sure if there is a function that could work in this scenario?

EDIT : Added some of my data using dput(head(price.groups.daily[1:5])) (my data starts in 2014, but i started from 2018 in my example, since most coins didnt exist at that time)

structure(list(Bitcoin = c(5, 5, 5, 5, 5, 5), Ethereum = c(NA_integer_, 
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_
), XRP = c(1L, 1L, 1L, 1L, 1L, 2L), Bitcoin.Cash = c(NA_integer_, 
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_
), Bitcoin.SV = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
NA_integer_, NA_integer_)), row.names = c("2014-01-01", "2014-01-02", 
"2014-01-03", "2014-01-04", "2014-01-05", "2014-01-06"), class = "data.frame")

and for > dput(head(ret.daily[1:5]))

structure(list(Bitcoin = c(0.0201473710988784, 0.048620314369761, 
0.0826106401572204, 0.0209460599834816, -0.17281055170073, 0.0495261478685647
), Ethereum = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_), XRP = c(-0.0390090806022911, 0.0180075172268452, -0.108767309981219, 
0.0184572292482077, -0.111605656954607, 0.0104300601469132), 
    Bitcoin.Cash = c(NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_), Bitcoin.SV = c(NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_)), row.names = c("2014-01-03", 
"2014-01-04", "2014-01-05", "2014-01-06", "2014-01-07", "2014-01-08"
), class = "data.frame")

score 0 · Answer 1 · answered Apr 16 '20 at 02:34

You could have a look at data.table or various tidyverse functions to accomplish that. Below is an example using data.table:

library(data.table)
library(Hmisc)

## prepare example data
set.seed(1)
dts <- seq.Date(
        from = as.Date("2019/1/1"),
        to = as.Date("2020/1/1"),
        by = 1
    )

ret.daily <- matrix(
    rnorm(length(dts) * 50, 0, 6),
    ncol = 50,
    dimnames = list(
        dts,
        c("Bitcoin", "Ethereum", "XRP", "Bitcoin.Cash", "Bitcoin.SV", 
          paste0("coin_", sprintf("%02d", seq_len(45))))
    ))

ret.daily[sample(seq_len(length(ret.daily)), 200 )] <- NA # add some NA's
ret.daily <- data.frame(ret.daily)

## start of summarizations
ret.daily <- melt(data.table(date = as.character(dts), ret.daily), id.vars = "date")
setkey(ret.daily, date, variable)

cuts <- ret.daily[, .(as.list(
    Hmisc::cut2(value, g = 6)
)), by = .(date)]
setkey(cuts, date)

# grouping based on daily percentiles (in long format)
ret.daily[, group := unlist(lapply(cuts$V1, as.numeric))][]
#>              date     variable      value group
#>     1: 2019-01-01      Bitcoin -3.7587229     2
#>     2: 2019-01-01     Ethereum  4.0700411     5
#>     3: 2019-01-01          XRP -6.3744503     1
#>     4: 2019-01-01 Bitcoin.Cash -4.5996998     2
#>     5: 2019-01-01   Bitcoin.SV -4.9012655     2
#>    ---                                         
#> 18296: 2020-01-01      coin_41 -4.1377852     2
#> 18297: 2020-01-01      coin_42 -0.7649347     3
#> 18298: 2020-01-01      coin_43  0.7698973     4
#> 18299: 2020-01-01      coin_44 -4.6674720     2
#> 18300: 2020-01-01      coin_45 -3.6291231     2

# summarize mean by group and date, and casting the data into wide format
dcast(ret.daily[, .(mean = mean(value, na.rm = TRUE)), by = .(date, group)],
      date ~ group, value.var = "mean")
#>            date  NA         1         2          3         4        5         6
#>   1: 2019-01-01  NA -8.284783 -4.173707 -0.9096477 1.3175870 4.501497 11.123123
#>   2: 2019-01-02  NA -7.379199 -4.502193 -2.1457718 1.1179902 4.207471  8.069149
#>   3: 2019-01-03 NaN -9.070030 -4.708133 -1.8032877 0.9011769 2.699407  7.673678
#>   4: 2019-01-04  NA -7.019294 -2.995686 -0.9035496 1.6644289 4.565588  9.178561
#>   5: 2019-01-05  NA -9.457924 -3.957598 -1.9535285 0.3493898 3.265330  7.396461
#>  ---                                                                           
#> 362: 2019-12-28  NA -9.866193 -4.481655 -2.2775438 1.0612454 3.863716  9.159870
#> 363: 2019-12-29  NA -8.555226 -3.319358 -0.6815004 1.5801415 4.379455  9.354069
#> 364: 2019-12-30  NA -7.430636 -4.011801 -1.3067570 2.2528401 4.805392 10.595387
#> 365: 2019-12-31  NA -7.316091 -2.784448 -0.8047659 0.7121429 3.508579  7.714213
#> 366: 2020-01-01 NaN -8.502224 -4.369027 -1.7029667 0.5042703 3.959396  9.084915

^{Created on 2020-04-15 by the reprex package (v0.3.0)}

I am not sure what you make your groups on, i already have the groups i need in price.groups.daily shown above. but i tried this, but i get an error when trying to do the "cuts" line : Error in loadNamespace(j <- i[[1L]], c(lib.loc, .libPaths()), versionCheck = vI[[j]]) : there is no package called ‘latticeExtra’ — Lasse.Jen, Apr 16 '20 at 06:36
Regarding Hmisc see `https://stackoverflow.com/questions/59419647/hmisc-package-or-namespace-failed-to-load-no-package-called-latticeextra` for the cause of the problem and possible solutions. The groups are 5 quantile groups for each date, across the variables (currencies). — user12728748, Apr 16 '20 at 11:34

Take mean of certain values based on groups in R

1 Answers1