0

I have 2 dataset

data1=structure(list(x1 = c(7L, 7L, 7L, 7L, 7L, 6L, 6L, 6L, 6L), x2 = c(2L, 
2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L), x3 = c(1L, 1L, 1L, 2L, 1L, 2L, 
1L, 1L, 2L), x4 = c(156L, 156L, 238L, 156L, 238L, 238L, 156L, 
156L, 156L), x5 = c(0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L), x5.1 = c(31L, 
1L, 9L, 8L, 6L, 11L, 3L, 3L, 2L), x6 = structure(c(1L, 1L, 1L, 
1L, 1L, 2L, 2L, 2L, 2L), .Label = c("En", "RU"), class = "factor"), 
    x7 = structure(c(2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L), .Label = c("13", 
    "other"), class = "factor"), x8 = c(1L, 1L, 2L, 2L, 1L, 1L, 
    1L, 1L, 1L), x9 = c(0L, 1L, 0L, 0L, 2L, 3L, 2L, 0L, 0L), 
    x10 = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L), x11 = c(745L, 
    120L, 140L, 200L, 130L, 410L, 460L, 460L, 675L), x12 = c(440L, 
    120L, 140L, 200L, 130L, 410L, 460L, 460L, 445L), x13 = c(2L, 
    0L, 0L, 0L, 0L, 0L, 2L, 1L, 1L), prediction = structure(c(1L, 
    5L, 6L, 8L, 7L, 2L, 3L, 3L, 4L), .Label = c("0.0646", "0.0713", 
    "0.1319", "0.2629", "0.3479", "0.3693", "0.4037", "0.4123"
    ), class = "factor")), class = "data.frame", row.names = c(NA, 
-9L))

and second dataset

data2=structure(list(pred = structure(c(4L, 2L, 1L, 3L, 1L, 5L, 7L, 
6L, 6L, 1L), .Label = c("0.0226902365684509", "0.0326902365684509", 
"0.0826902365684509", "0.1211001253128052", "0.411001253128052", 
"0.611001253128052", "0.64564001253128052"), class = "factor")), class = "data.frame", row.names = c(NA, 
-10L))

How for each variable x1-x13 create pivot table with data1$prediction and data2$pred aggerated by means

For example desired output for variable x1

x1  mean_prediction mean_pred
7   0.31            0.056
6   0.14            0.569

I.E in output must 12 tables in one dataframe. and not separated data frames. How to do it?

#update

structure(list(x1 = c(7L, 7L, 7L, 7L, 7L, 6L, 6L, 6L, 6L), x2 = c(2L, 
2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L), x3 = c(1L, 1L, 1L, 2L, 1L, 2L, 
1L, 1L, 2L), x4 = c(156L, 156L, 238L, 156L, 238L, 238L, 156L, 
156L, 156L), x5 = c(0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L), x5.1 = c(31L, 
1L, 9L, 8L, 6L, 11L, 3L, 3L, 2L), x6 = structure(c(1L, 1L, 1L, 
1L, 1L, 2L, 2L, 2L, 2L), .Label = c("En", "RU"), class = "factor"), 
    x7 = structure(c(2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L), .Label = c("13", 
    "other"), class = "factor"), x8 = c(1L, 1L, 2L, 2L, 1L, 1L, 
    1L, 1L, 1L), x9 = c(0L, 1L, 0L, 0L, 2L, 3L, 2L, 0L, 0L), 
    x10 = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L), x11 = c(745L, 
    120L, 140L, 200L, 130L, 410L, 460L, 460L, 675L), x12 = c(440L, 
    120L, 140L, 200L, 130L, 410L, 460L, 460L, 445L), x13 = c(2L, 
    0L, 0L, 0L, 0L, 0L, 2L, 1L, 1L), prediction = structure(c(1L, 
    5L, 6L, 8L, 7L, 2L, 3L, 3L, 4L), .Label = c("0.0646", "0.0713", 
    "0.1319", "0.2629", "0.3479", "0.3693", "0.4037", "0.4123"
    ), class = "factor"), pred = c(0.121100125, 0.032690237, 
    0.022690237, 0.082690237, 0.022690237, 0.411001253, 0.645640013, 
    0.611001253, 0.611001253)), class = "data.frame", row.names = c(NA, 
-9L))
psysky
  • 3,037
  • 5
  • 28
  • 64

2 Answers2

2

You can get the data in long format , group for each column name and value and take mean for prediction and pred columns.

library(dplyr)

df %>%
  mutate(across(x1:x13, as.character), 
         prediction = as.numeric(as.character(prediction))) %>%
  tidyr::pivot_longer(cols = x1:x13) %>%
  group_by(name, value) %>%
  summarise(across(c(prediction, pred), mean, na.rm = TRUE))

#   name  value prediction   pred
#   <chr> <chr>      <dbl>  <dbl>
# 1 x1    6         0.150  0.570 
# 2 x1    7         0.320  0.0564
# 3 x10   0         0.257  0.186 
# 4 x10   1         0.197  0.628 
# 5 x11   120       0.348  0.0327
# 6 x11   130       0.404  0.0227
# 7 x11   140       0.369  0.0227
# 8 x11   200       0.412  0.0827
# 9 x11   410       0.0713 0.411 
#10 x11   460       0.132  0.628 
# … with 39 more rows
Ronak Shah
  • 377,200
  • 20
  • 156
  • 213
1

Taking your dput. First convert the factors into characters for ease of dealing with.

data1$x6<-as.character(data1$x6)
data1$x7<-as.character(data1$x7)

Then, convert the value columns to numeric from character factors as they are in your code.

data1$prediction<-as.numeric(as.character.numeric_version(data1$prediction))
data2$pred<-as.numeric(as.character.numeric_version(data2$pred))

Due to the additional column in the data2 code, remove the last row of the data to use.

data1$pred<-data2$pred[1:nrow(data1)]

Here, is a function, which take the string name of the column then, creates a tibble with the needed data for the output, converts the grouping columns value to characters (as some of the columns contain numbers and some contain strings, this eliminates the error of not getting all the grouping values), then outputs the grouped means for the values in the column.

get_column_break_down<-function(colname, df=data1) {
  res_df<-tibble(group = as.character(df[,grep(paste0('^',colname,'$'), names(df))]),
  prediction=df$prediction,
  pred=df$pred)
  return(res_df %>%
       group_by(group) %>%
       summarize(mean_prediction = mean(prediction),
                 mean_pred = mean(pred)) %>%
       mutate(predictor = colname) %>%
       ungroup() %>%
       select(predictor, group, mean_prediction, mean_pred))
}

Get a vector of the columns names from data1

colname_vec<-names(data1[,1:14])

Create the initial data.frame or in this case tibble with the first column name.

df<-get_column_break_down(colname_vec[1])

Loop through the remaining columns, binding the rows to the df variable, or stacking the rows on top of each other.

for(n in colname_vec[2:length(colname_vec)]) {
    df<-bind_rows(df, get_column_break_down(n))
}

Finally here is the output.

Note, as it is a tibble, it is rounding the numbers, if you inspect the df variable in your enviroment or write it to a csv, the numbers will closely match you numbers.

df
# A tibble: 49 x 4
 predictor group mean_prediction mean_pred
 <chr>     <chr>           <dbl>     <dbl>
 1 x1        6               0.150    0.570 
 2 x1        7               0.320    0.0564
 3 x2        1               0.197    0.611 
 4 x2        2               0.257    0.191 
 5 x3        1               0.242    0.243 
 6 x3        2               0.249    0.368 
 7 x4        156             0.225    0.351 
 8 x4        238             0.281    0.152 
 9 x5        0               0.205    0.349 
10 x5        1               0.380    0.0577
# ... with 39 more rows
Michael Vine
  • 335
  • 1
  • 9