0

I'm taking in a large data frame and want to perform analyses on subsets of the original data (mean, standard deviation, etc.). Right now I have code for making new data frames with the columns of interest, as shown:

# Male / Saline subset of the cleaned data. For every row this adds:
#   avg_presses - row mean of total1..total3 (NAs ignored)
#   cohort_avgs - mean of avg_presses within the row's cohort
#   cohort_sd   - standard deviation of avg_presses within the row's cohort
#   z_score     - avg_presses standardised against its own cohort
#
# The cohort statistics are computed with a grouped mutate(), so each row is
# matched to its cohort by value. Rows therefore do not need to be sorted by
# cohort, and cohort numbers with no rows are simply absent.
df1 <- data_clean %>%
  filter(sex == "Male" & experiment_group == "Saline") %>%
  mutate(
    avg_presses = rowMeans(
      select(., c("total1", "total2", "total3")),
      na.rm = TRUE
    )
  ) %>%
  group_by(cohort) %>%
  mutate(
    cohort_avgs = mean(avg_presses, na.rm = TRUE),
    cohort_sd = sd(avg_presses, na.rm = TRUE)
  ) %>%
  # Drop the grouping so later verbs operate on the whole table again.
  ungroup() %>%
  mutate(z_score = (avg_presses - cohort_avgs) / cohort_sd)

This works fine but I have four data frames to perform this exact analysis on, and writing it 4 times seems clunky. Is there a way to add the four data frames to a list and then iterate over each data frame? I tried :

# Attempt to run the per-cohort z-score analysis on every data frame held in
# a list, writing the augmented data frame back into the same list slot.
CI_list <- list(df1, df2, df3, df4)

# NOTE(review): the outer loop (over list elements) and the innermost loop
# (over rows) both use `i` as their index. Inside the row loop `i` is a row
# number, so `CI_list[[i]]` asks for list element number `i`; as soon as `i`
# exceeds length(CI_list) this fails with "subscript out of bounds". After
# the row loop `i` still holds the last row number, so the write-back at the
# bottom would target the wrong element as well.
for (i in 1:length(CI_list)) {
  # Per-cohort results are appended here and attached by position later on.
  cohort_avg <- c()         
  cohort_std <- c()
  for (cohort_num in 1:max(CI_list[[i]]$cohort)) {     
    # Starts as the whole data frame; only replaced if some row matches.
    data_build <- CI_list[[i]]                        
    # NOTE(review): this loop re-uses `i`, clobbering the outer list index.
    for (i in 1:nrow(CI_list[[i]])) {                 
      if (CI_list[[i]]$cohort[i] == cohort_num) {      
        # Rows of this cohort, with the cohort mean and SD repeated per row.
        data_build <- CI_list[[i]] %>%
          filter(cohort==cohort_num) %>%
          mutate(avgs = mean(avg_presses, na.rm=TRUE),    
                 std = sd(avg_presses, na.rm=TRUE))   
      }
    }
    cohort_avg<- c(cohort_avg, data_build$avgs)     
    cohort_std <- c(cohort_std, data_build$std)
  }
  CI_list[[i]] <- CI_list[[i]] %>%                    
    add_column(cohort_avgs = cohort_avg, cohort_sd=cohort_std ) 
  
  # NOTE(review): this uses `avg_Infusions` while the cohort statistics above
  # use `avg_presses`; presumably one of the two names is a typo - confirm
  # against the columns actually present in the data frames.
  CI_list[[i]] <- CI_list[[i]] %>%
    mutate(z_score = (avg_Infusions - cohort_avgs)/cohort_sd)
  
  
  
}

but I get a "subscript out of bounds" error. Is there a better way to do this?

Edit: in trying to use a function and lapply I did:

#' Add per-cohort mean, SD and z-score of `avg_Infusions` to a data frame
#'
#' Every row receives the mean and standard deviation of `avg_Infusions`
#' within its own `cohort`, plus the row's z-score relative to those two
#' values. Rows are matched to their cohort by value, so the input does not
#' have to be sorted by cohort and cohort numbers need not be contiguous.
#'
#' The function works on the data frame it is given and returns the result,
#' so it can be applied to a list of data frames with
#' `lapply(list_of_dfs, find_CI_zscore)`.
#'
#' @param df A data frame with columns `cohort` and `avg_Infusions`.
#' @return `df` with three added columns: `cohort_avgs`, `cohort_sd` and
#'   `z_score`. `cohort_sd` and `z_score` are `NA` for cohorts with fewer
#'   than two non-missing values; all three are `NA` where `cohort` is `NA`.
find_CI_zscore <- function(df) {
  stopifnot(
    is.data.frame(df),
    all(c("cohort", "avg_Infusions") %in% names(df))
  )

  infusions <- as.numeric(df[["avg_Infusions"]])
  cohort <- df[["cohort"]]

  # ave() applies the summary within each cohort and returns a vector that is
  # already aligned with the input rows, so no positional bookkeeping is
  # needed. It leaves rows with a missing cohort untouched, hence the
  # explicit NA afterwards.
  within_cohort <- function(fun) {
    out <- ave(infusions, cohort, FUN = function(v) fun(v, na.rm = TRUE))
    out[is.na(cohort)] <- NA_real_
    out
  }

  df$cohort_avgs <- within_cohort(mean)
  df$cohort_sd <- within_cohort(sd)
  df$z_score <- (infusions - df$cohort_avgs) / df$cohort_sd
  df
}

# NOTE(review): CI_list[[i]] is a single data frame, and lapply() over a data
# frame iterates over its columns. find_CI_zscore() is therefore handed one
# column vector at a time rather than a data frame, and `df$cohort` fails on
# that atomic vector. Passing the list itself - lapply(CI_list, find_CI_zscore) -
# hands each data frame to the function; no surrounding for loop is needed.
# NOTE(review): the value returned by lapply() is discarded here, and the
# closing brace of the loop is missing from the snippet.
for (i in 1:length(CI_list)) {
  lapply(CI_list[[i]], find_CI_zscore)

And I get this error: `Error in df$cohort : $ operator is invalid for atomic vectors`. How do I use `lapply` to take in data frames, not vectors?

New edit: this is the `dput` output; let me know if that worked:

> dput(list(df1[1:7, ], df2[1:7, ]))
list(structure(list(cohort = c(1L, 1L, 1L, 1L, 1L, 1L, 1L), avg_Infusions = c(31.3333333333333, 
32.6666666666667, 4, 20, 7, 22.6666666666667, 11.3333333333333
)), row.names = c(NA, 7L), class = "data.frame"), structure(list(
    cohort = c(1L, 1L, 1L, 1L, 1L, 1L, 1L), avg_Infusions = c(6.66666666666667, 
    17.6666666666667, 17.3333333333333, 0.333333333333333, 10, 
    8.66666666666667, 20)), row.names = c(NA, 7L), class = "data.frame"))
  • 2
    You can wrap the code in a function and apply the code on the list by looping with `lapply` – akrun Feb 16 '23 at 19:53
  • 2
    This line looks suspicious `if (CI_list[[i]]$cohort[i] == cohort_num)`. The `$cohort[i]` could be problematic. Hard to tell without seeing any data though. – Gregor Thomas Feb 16 '23 at 19:58
  • 2
    Please consider showing a small reproducible example with `dput` – akrun Feb 16 '23 at 19:59
  • 2
    If your dataframes share the same structure, it might be much more concise and faster to collate them into one frame which you then subject to groupwise manipulations with the package of your choice, be it {base}, {dplyr} or {data.table}: either will probably beat looping. – I_O Feb 16 '23 at 20:32
  • 2
    Does this answer your question? [Using lapply to apply a function over list of data frames and saving output to files with different names](https://stackoverflow.com/questions/17018138/using-lapply-to-apply-a-function-over-list-of-data-frames-and-saving-output-to-f) – divibisan Feb 16 '23 at 20:32
  • can you please clarify how to use dput? I am new to stack overflow. – detective-captain42 Feb 16 '23 at 20:44
  • 1
    `dput()` makes a copy/pasteable version of an R object. So if you had a single data frame `dput(your_data[1:10, ])` would make a copy/pasteable version of the first 10 rows of `your_data`, including all class and structure information. In your case, you could maybe share something like `dput(list(df1[1:10, ], df2[1:10, ]))` to give us 10 rows of two data frames. But the point is to give us some sample data that we can run your code on to try to debug it. You could also give us code to simulate sample data, or use built-in data. Just try to both make it **minimal** and **reproducible**. – Gregor Thomas Feb 16 '23 at 20:48
  • For more detail, have a look at our FAQ [How to make a great reproducible example in R](https://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example). – Gregor Thomas Feb 16 '23 at 20:48

1 Answer

0

If you want to use the for loop method, `get()` is your friend. In the example below, `dfname` can be used to access columns and rows by index, such as `dfname[,2]`, or by column name, such as `dfname$cohort`:

# Look up df1..df4 by name, one per iteration, and work on the result as
# `dfname`. The `(do something)` lines are placeholders, not runnable R.
# NOTE(review): `dfname` is a copy of the data frame returned by get();
# changes made to it are not written back to df1..df4 unless they are stored
# explicitly, e.g. with assign(paste0("df", i), dfname) at the end of the body.
for(i in 1:4) {
  dfname <- get(paste0("df", i))
  dfname$cohort <- (do something)
  dfname[,2] <- (do something else)
}

However, akrun's suggestion to create a function and I_O's suggestion to combine your data are both much faster.

L Tyrone
  • 1,268
  • 3
  • 15
  • 24