I am still a bit confused with the use of enquo and toString. In the example below I basically just try to filter a data frame and sum the rows in the end. I don't really understand why is enquo and toString doing the same for the first thing I want to do (filter --> option 1 and 2 gives the same result) but not for the second thing I wanna do (sum --> option 1 works but option 2 gives me an error). Is it just because I use it within a dplyr pipe?
library(dplyr)
library(tidyverse)
### define dataframe
dataframe_test <- data.frame(
column_test = c(100,99,99,90,89,50),
month_test = c("2020-09-01", "2020-09-01","2020-09-01", "2020-09-01","2020-10-01","2020-10-01")
)
test_function <- function(df, df_col_indicator, df_col_month, char_month) {
### define variables for enquo, ensym, toString
df_col_indicator_enquo <- enquo(df_col_indicator)
df_col_indicator_ensym <- ensym(df_col_indicator)
df_col_indicator_toString <- toString(df_col_indicator)
df_col_month_ensym <- ensym(df_col_month)
dataframe2 <- df %>%
filter(!!df_col_month_ensym == char_month) %>% # filter for month
slice_max(!!df_col_indicator_ensym, n = 3) %>% # slice top 3 observations
## two options for filter
# option 1
filter(!!df_col_indicator_ensym == df[2, df_col_indicator_toString]) # filter for observations with same observation as second row
# option 2
#filter(!!df_col_indicator_ensym == df[2, !!df_col_indicator_enquo])
## two options for sum
# option 1
bb <- sum(dataframe2[ , df_col_indicator_toString]) # sum up observations
# option 2
#bb <- sum(dataframe2[ , !!df_col_indicator_enquo])
return(bb)
}
test_function(df = dataframe_test, df_col_indicator = "column_test", df_col_month = "month_test" , char_month = "2020-09-01")
EDIT:
Thank you all for your answers. Hehe, ok I have to admitt that the example is a bit stupid, but I tried to keep it as simple as possible here. My initial problem is actually this one (see below). I basically try to select top 5 numbers of a column. There are three different outcomes. 1) If more than 5 are ==100, then I wanna randomly store 5 oberservation in list(indicator) the other observations in list(asterisk). 2) If not all observations are ==100 but there are ties (5th, 6th place have the same value), I wanna randomly pick those with ties and again put some in list(indicator)the other observations in list(asterisk). 3) If thre are no ties, just pick top 5 observations. My main problem now is if I want to run my function over a loop (with all the columns) at the very bottom. Somehow I always just get the first row as an outcome... I think I somehow don't understand how to propperly set variable names for the function within a loop...?
library(dplyr)
library(tidyverse)
remove(list = ls())
dataframe_test <- data.frame(
county_name = c("a", "b","c", "d","e", "f", "g", "h"),
column_test1 = c(100,100,100,100,100,100,50,50),
column_test2 = c(40,90,50,40,40,100,13,14),
column_test3 = c(100,90,50,40,30,40,100,50),
month = c("2020-09-01", "2020-09-01" ,"2020-09-01" ,"2020-09-01" ,"2020-09-01" ,"2020-09-01" ,"2020-08-01","2020-08-01"))
choose_top_5 <- function(df, df_col_indicator, df_col_month, char_month, numb_top, df_col_county) {
### enquo / ensym / deparse
df_col_indicator_enquo <- enquo(df_col_indicator)
df_col_indicator_ensym <- ensym(df_col_indicator)
df_col_month_ensym <- ensym(df_col_month)
df_col_month_enquo = enquo(df_col_month)
### filter month and top 5 observations
df_top <- df %>%
filter(!!df_col_month_ensym == char_month) %>%
slice_max(!!df_col_indicator_ensym, n = numb_top) %>%
select(!!df_col_county, !!df_col_month_ensym, !!df_col_indicator_ensym)
### if there are more than "numb_top" values and all equals to 100 --> randomly pick "numb_top"
if (nrow(df_top) > numb_top &
sum(df_top[ , df_col_indicator ]) == 100*nrow(df_top) ) {
## randomly pick "numb_top" out of all
random_shuffle <- df_top[sample(nrow(df_top)),]
indicator <- random_shuffle[1:numb_top,]
asterisk <- random_shuffle[(numb_top+1):nrow(random_shuffle),]
## return "numb_top" and put names of others in asterisk
return_list <- list(indicator, asterisk)
### if there are more than "numb_top" values but not all 100 (e.g. 100, 100, 100, 99, 99, 99)
## --> pick randomly 99 values
} else if (nrow(df_top) > numb_top) {
### filter for all observations that have the same value as "numb_top"
df_treshold <- df_top %>%
filter(!!df_col_indicator_ensym == df_top[numb_top, df_col_indicator])
## randomly shuffle the observations
random_shuffle <- df_treshold[sample(nrow(df_treshold)),]
## combine observations again an pick "numb_top"
combine <- rbind(df_top[1:(nrow(df_top)-nrow(df_treshold)), ], random_shuffle)
indicator <- combine[1:numb_top,]
asterisk <- combine[(numb_top+1):nrow(combine),]
## return "numb_top" and put names of others in asterisk
return_list <- list(indicator, asterisk)
### if there are not more than "numb_top" values
} else {
indicator <- df_top
asterisk <- NA
## return "numb_top", asterisk is NA
return_list <- list(indicator, asterisk)
}
return(return_list)
}
### function for 1 column
a=choose_top_5(df = dataframe_test, df_col_indicator = "column_test3",
df_col_month = "month", char_month = "2020-09-01", numb_top = 5,
df_col_county = "county_name")
a
### function over all columns and store in list
all_indicators <- c("column_test1","column_test2","column_test3")
my_list <- list()
for (i in all_indicators) {
my_list[[i]] <- choose_top_5(df = dataframe_test, df_col_indicator = i,
df_col_month = "month", char_month = "2020-09-01", numb_top = 5,
df_col_county = "county_name")
}
my_list