I'm reading in a large data frame and want to compute summary statistics (mean, standard deviation, etc.) on subsets of the original data. Right now I have code that builds a new data frame with the columns of interest, as shown:
# Build the Male/Saline subset and add, for every row:
#   avg_presses - mean of total1..total3 for that row (NAs ignored)
#   cohort_avgs - mean of avg_presses within the row's cohort
#   cohort_sd   - standard deviation of avg_presses within the row's cohort
#   z_score     - avg_presses standardised against its own cohort
#
# ave() returns one value per input row, in the original row order, so the
# cohort statistics stay attached to the right rows even when the data are not
# sorted by cohort or the cohort ids are not numbered 1..max.
df1 <- data_clean %>%
  filter(sex == "Male" & experiment_group == "Saline") %>%
  mutate(
    avg_presses = rowMeans(
      select(., c("total1", "total2", "total3")),
      na.rm = TRUE
    ),
    cohort_avgs = ave(
      avg_presses, cohort,
      FUN = function(x) mean(x, na.rm = TRUE)
    ),
    cohort_sd = ave(
      avg_presses, cohort,
      FUN = function(x) sd(x, na.rm = TRUE)
    ),
    # A cohort with a single row has an NA standard deviation, hence NA here.
    z_score = (avg_presses - cohort_avgs) / cohort_sd
  )
This works fine, but I have four data frames to run this exact analysis on, and writing it out four times seems clunky. Is there a way to put the four data frames in a list and then iterate over each one? I tried:
# Collect the four data frames in a list and, for each one, rebuild the
# per-cohort mean/sd vectors, append them as columns and derive a z-score.
CI_list <- list(df1, df2, df3, df4)
# Outer loop: `i` is meant to be the position of the data frame in CI_list.
for (i in 1:length(CI_list)) {
# Accumulators for the per-row cohort mean and sd of the current data frame.
cohort_avg <- c()
cohort_std <- c()
for (cohort_num in 1:max(CI_list[[i]]$cohort)) {
# Fallback value used when no row matches cohort_num.
data_build <- CI_list[[i]]
# NOTE(review): this inner loop reuses `i`, the outer loop's index. Inside it,
# CI_list[[i]] is therefore indexed by the row number rather than by the
# data-frame position: rows 2..4 are compared against other data frames, and
# once the row number exceeds length(CI_list) the lookup stops with
# "subscript out of bounds". A distinct index name is needed for the rows.
for (i in 1:nrow(CI_list[[i]])) {
if (CI_list[[i]]$cohort[i] == cohort_num) {
# Rows of this cohort, each carrying the cohort-wide mean and sd.
data_build <- CI_list[[i]] %>%
filter(cohort==cohort_num) %>%
mutate(avgs = mean(avg_presses, na.rm=TRUE),
std = sd(avg_presses, na.rm=TRUE))
}
}
# Append this cohort's repeated mean/sd values to the running vectors.
cohort_avg<- c(cohort_avg, data_build$avgs)
cohort_std <- c(cohort_std, data_build$std)
}
# NOTE(review): if execution reaches this point, `i` still holds the last value
# assigned by the inner row loop, not the data-frame position.
CI_list[[i]] <- CI_list[[i]] %>%
add_column(cohort_avgs = cohort_avg, cohort_sd=cohort_std )
# NOTE(review): the summary above reads avg_presses but the z-score reads
# avg_Infusions - confirm which column these data frames actually carry.
CI_list[[i]] <- CI_list[[i]] %>%
mutate(z_score = (avg_Infusions - cohort_avgs)/cohort_sd)
}
but I get a "subscript out of bounds" error. Is there a better way to do this?
Edit: in trying to use a function with lapply, I wrote:
#' Add per-cohort mean, standard deviation and z-score columns
#'
#' For every row, computes the mean and standard deviation of `avg_Infusions`
#' over all rows sharing that row's `cohort`, then standardises the row's
#' `avg_Infusions` against those cohort statistics.
#'
#' @param df A data frame with a `cohort` column and a numeric
#'   `avg_Infusions` column.
#' @return `df` with three added columns: `cohort_avgs`, `cohort_sd` and
#'   `z_score`. Row order is unchanged. A cohort with a single non-missing
#'   value has an `NA` standard deviation and therefore an `NA` z-score.
find_CI_zscore <- function(df) {
  cohort <- df$cohort
  infusions <- df$avg_Infusions
  if (is.null(cohort) || !is.numeric(infusions)) {
    stop(
      "`df` needs a `cohort` column and a numeric `avg_Infusions` column",
      call. = FALSE
    )
  }

  # ave() applies FUN within each cohort and returns one value per input row,
  # aligned with the input, so no sorting or manual bookkeeping is needed.
  df$cohort_avgs <- ave(
    infusions, cohort,
    FUN = function(x) mean(x, na.rm = TRUE)
  )
  df$cohort_sd <- ave(
    infusions, cohort,
    FUN = function(x) sd(x, na.rm = TRUE)
  )
  df$z_score <- (infusions - df$cohort_avgs) / df$cohort_sd

  # Return the augmented data frame so callers (e.g. lapply) can collect it.
  df
}
# Intended to run find_CI_zscore on every data frame stored in CI_list.
# NOTE(review): CI_list[[i]] is a single data frame, so lapply() here walks
# over its columns and hands each column (an atomic vector) to find_CI_zscore.
# To pass whole data frames, call lapply() on the list itself and keep the
# result, e.g. CI_list <- lapply(CI_list, find_CI_zscore).
# The value returned by lapply() is discarded, and the loop's closing brace is
# missing from this snippet.
for (i in 1:length(CI_list)) {
lapply(CI_list[[i]], find_CI_zscore)
And I get this error: "Error in df$cohort : $ operator is invalid for atomic vectors". How do I use lapply so that it passes data frames to the function, not vectors?
New edit: this is the dput output; let me know if that worked:
> dput(list(df1[1:7, ], df2[1:7, ]))
list(structure(list(cohort = c(1L, 1L, 1L, 1L, 1L, 1L, 1L), avg_Infusions = c(31.3333333333333,
32.6666666666667, 4, 20, 7, 22.6666666666667, 11.3333333333333
)), row.names = c(NA, 7L), class = "data.frame"), structure(list(
cohort = c(1L, 1L, 1L, 1L, 1L, 1L, 1L), avg_Infusions = c(6.66666666666667,
17.6666666666667, 17.3333333333333, 0.333333333333333, 10,
8.66666666666667, 20)), row.names = c(NA, 7L), class = "data.frame"))