How to create a loop over different datasets?

Question

I need your help. I want to run the same code over several datasets, called dfQB, dfRB, dfOL, dfFB, dfWR, dfTE, dfST, dfDB, dfLB, dfDL and dfDE. I have the code below. Is it possible to add a second loop that runs this code for each of the datasets mentioned above? Help is much appreciated!

# Fill df$RAS row by row: each of the six results in columns 5:10 is turned
# into its empirical percentile within `df`, and the available percentiles
# are averaged and scaled to 0-10.
# NOTE(review): columns 5:10 are paired by position with Forty, Vertical,
# BenchReps, BroadJump, Cone and Shuttle -- confirm the column order.

# Share of the values in `x` that are <= `perc` (empirical CDF of `x` at
# `perc`). Defined once, outside the loop, because it does not depend on i.
ecdf_fun <- function(x, perc) ecdf(x)(perc)

# seq_len() gives an empty sequence for a zero-row `df`; 1:nrow(df) would
# iterate over c(1, 0) and fail inside ecdf().
for (i in seq_len(nrow(df))) {
  de <- df[i, 5:10]
  # Forty, Cone and Shuttle are inverted (1 - percentile), so smaller values
  # score higher; the other three use the percentile directly.
  a <- 1 - ecdf_fun(df$Forty, de[1])
  b <- ecdf_fun(df$Vertical, de[2])
  c <- ecdf_fun(df$BenchReps, de[3])
  d <- ecdf_fun(df$BroadJump, de[4])
  e <- 1 - ecdf_fun(df$Cone, de[5])
  f <- 1 - ecdf_fun(df$Shuttle, de[6])
  # Denominator: number of results that are present for this row.
  nenner <- 6 - sum(is.na(a), is.na(b), is.na(c), is.na(d), is.na(e), is.na(f))
  # Missing results contribute nothing to the sum.
  if (is.na(a)) {a <- 0}
  if (is.na(b)) {b <- 0}
  if (is.na(c)) {c <- 0}
  if (is.na(d)) {d <- 0}
  if (is.na(e)) {e <- 0}
  if (is.na(f)) {f <- 0}
  df$RAS[i] <- ((a + b + c + d + e + f) / nenner) * 10
}

Getting the appropriate help!

Well df is the whole dataset, but i just used it as an example in here. The dataframes the code shall run over are "dfQB, dfRB, dfOL, dfFB, dfWR, dfTE, dfST, dfDB, dfLB, dfDL and dfDE" — manofthousandnames, Apr 09 '23 at 21:28

score 3 · Answer 1 · answered Apr 09 '23 at 21:35

First and foremost, avoid maintaining many similarly structured data frames as separate objects in your global environment. Instead, store such similar data frames in a list. From there, you can run lapply on the list to uniformly apply any built-in or user-defined function.

# NAMED LIST OF DFS
# One entry per position, keyed by the position code.
fb_position_dfs <- list(
  QB = dfQB,
  RB = dfRB,
  OL = dfOL,
  FB = dfFB,
  WR = dfWR,
  TE = dfTE,
  ST = dfST,
  DB = dfDB,
  LB = dfLB,
  DL = dfDL,
  DE = dfDE
)

# REMOVE SEPARATE OBJECTS
# Each stand-alone object is called "df" + its key in the list above.
rm(list = paste0("df", names(fb_position_dfs)))

# USER-DEFINED METHOD
# Add a `RAS` column (0-10 scale) that scores every row of `df` against the
# rows of the same data frame.
#
# Each of the six results of a row is converted to its empirical percentile
# within `df`. Forty, Cone and Shuttle are inverted (1 - percentile), so
# smaller values score higher. The score is the mean of the available
# percentiles times 10; missing results are left out of the mean.
#
# @param df Data frame with columns Forty, Vertical, BenchReps, BroadJump,
#   Cone and Shuttle. The per-row results are read by position from columns
#   5:10, which are assumed to be these six columns in this order.
# @return `df` with the numeric column `RAS` added or overwritten. A row with
#   no result at all gets NaN (0 / 0). A zero-row `df` is returned with an
#   empty `RAS` column.
df_proc <- function(df) {
  # A zero-row data frame has nothing to rank, and ecdf() rejects empty input.
  if (nrow(df) == 0L) {
    df$RAS <- numeric(0)
    return(df)
  }

  drills <- c("Forty", "Vertical", "BenchReps", "BroadJump", "Cone", "Shuttle")
  inverted <- c(TRUE, FALSE, FALSE, FALSE, TRUE, TRUE)

  # One column of percentiles per drill. Each ECDF is built once and evaluated
  # for all rows at once instead of being rebuilt for every row.
  percentiles <- do.call(cbind, lapply(seq_along(drills), function(k) {
    pct <- ecdf(df[[drills[k]]])(df[[4L + k]])
    if (inverted[k]) 1 - pct else pct
  }))

  # Denominator: number of results present in each row.
  n_available <- rowSums(!is.na(percentiles))

  df$RAS <- rowSums(percentiles, na.rm = TRUE) / n_available * 10
  df
}

# ITERATE THROUGH DFS APPLYING METHOD
# lapply() returns a list with the same names as its input.
new_fb_position_dfs <- lapply(X = fb_position_dfs, FUN = df_proc)

If the data frames happen to originate from a master data frame split by position, use by (an object-oriented wrapper for tapply) to apply the user-defined method to the splits:

# ITERATE THROUGH SPLITS OF DF AND APPLY METHOD
# The master data frame is split on its `position` column and df_proc is
# called on each piece.
new_fb_position_dfs <- by(
  data = fb_master_df,
  INDICES = fb_master_df$position,
  FUN = df_proc
)

score 0 · Answer 2 · answered Apr 09 '23 at 21:32

You can put your data.frames in a list and use purrr::map to iterate over each element.

# install.packages("purrr")

library(purrr)

# Collect the separate data frames in one (unnamed) list.
datasets <- list(
  dfQB,
  dfRB,
  dfOL,
  dfFB,
  dfWR,
  dfTE,
  dfST,
  dfDB,
  dfLB,
  dfDL,
  dfDE
)

# `map` returns a list with the result of calling the function (second argument)
# with each element of the first argument (`datasets`).
datasets_transformed <- map(
  datasets,
  \(df) {
    # A zero-row data frame has nothing to rank, and ecdf() rejects empty
    # input; the former 1:nrow(df) loop would still have run for it.
    if (nrow(df) == 0L) {
      df$RAS <- numeric(0)
      return(df)
    }

    # Per-row results are read by position from columns 5:10 and ranked
    # against the named columns below (assumed to be the same six columns).
    drills <- c("Forty", "Vertical", "BenchReps", "BroadJump", "Cone", "Shuttle")
    # Inverted drills use 1 - percentile, so smaller values score higher.
    inverted <- c(TRUE, FALSE, FALSE, FALSE, TRUE, TRUE)

    # One column of percentiles per drill; each ECDF is built once and
    # evaluated for all rows instead of being rebuilt for every row.
    percentiles <- do.call(cbind, lapply(seq_along(drills), function(k) {
      pct <- ecdf(df[[drills[k]]])(df[[4L + k]])
      if (inverted[k]) 1 - pct else pct
    }))

    # Mean of the available percentiles, scaled to 0-10. Missing results are
    # excluded from both the sum and the denominator.
    n_available <- rowSums(!is.na(percentiles))
    df$RAS <- rowSums(percentiles, na.rm = TRUE) / n_available * 10

    df
  }
)

Len Greski · Answer 3 · 2023-04-10T00:44:17.543

If you're not going to assign the data frames to a list, another approach is to simply list the data frame names in a vector, and use get() within an lapply() function to access the object from the parent environment.

Absent a minimal reproducible example, code to do this looks like the following:

dfList <- c("dfQB","dfRB","dfOL") # a subset of the data frames

# For each name, fetch the data frame, add its RAS column and return it.
# The result is a list of data frames in the order of `dfList`.
updatedData <- lapply(dfList,function(x){
     df <- get(x) # get the actual data from parent environment

     # A zero-row data frame has nothing to rank, and ecdf() rejects empty
     # input; the former 1:nrow(df) loop would still have run for it.
     if (nrow(df) == 0L) {
          df$RAS <- numeric(0)
          return(df)
     }

     # Per-row results are read by position from columns 5:10 and ranked
     # against the named columns below (assumed to be the same six columns).
     drills <- c("Forty", "Vertical", "BenchReps", "BroadJump", "Cone", "Shuttle")
     # Inverted drills use 1 - percentile, so smaller values score higher.
     inverted <- c(TRUE, FALSE, FALSE, FALSE, TRUE, TRUE)

     # One column of percentiles per drill; each ECDF is built once and
     # evaluated for all rows instead of being rebuilt for every row.
     percentiles <- do.call(cbind, lapply(seq_along(drills), function(k) {
          pct <- ecdf(df[[drills[k]]])(df[[4L + k]])
          if (inverted[k]) 1 - pct else pct
     }))

     # Mean of the available percentiles, scaled to 0-10. Missing results are
     # excluded from both the sum and the denominator.
     n_available <- rowSums(!is.na(percentiles))
     df$RAS <- rowSums(percentiles, na.rm = TRUE) / n_available * 10

     df # return to parent environment
})

At this point, the object updatedData is a list of data frames.

As a minimal reproducible example, we'll download and update one column from the nine generations of Pokémon stats.

First, we'll download the data and unzip it to a subdirectory of the current working directory.

   # Fetch the zip archive in binary mode, then extract it into ./pokemonData.
   download.file(
     url = "https://raw.githubusercontent.com/lgreski/pokemonData/master/PokemonData.zip",
     destfile = "pokemonData.zip",
     method = "curl",
     mode = "wb"
   )
   unzip(zipfile = "pokemonData.zip", exdir = "./pokemonData")

Next, we'll create a vector containing the .csv files we downloaded.

# Full paths of the extracted .csv files. `pattern` is a regular expression,
# so the dot is escaped and the match is anchored to the end of the name;
# the unanchored ".csv" would also match names such as "mycsv.txt".
thePokemonFiles <- list.files("./pokemonData",pattern = "\\.csv$",
                              full.names=TRUE)

Next, we use the vector of file names to read the data into data frames and assign them as data frames in the global environment via the assign() function. Yes, I am aware that it's easier to work with the data frames in a list, but this replicates the "current state" of the original post.

# Read every file and also publish it as a global object named after the
# file (e.g. "./pokemonData/gen01.csv" becomes `gen01`).
pokemonDataFiles <- lapply(thePokemonFiles,function(x) {
     df <- read.csv(x,stringsAsFactors=FALSE)
     # Derive the object name from the file name itself instead of fixed
     # character positions, so it does not depend on the directory length.
     dfName <- tools::file_path_sans_ext(basename(x))
     assign(dfName,df,envir = .GlobalEnv) # assign to global environment
})

Next, we'll create a vector to represent the names of the data frames we created.

Finally, we multiply the HP stat by ten in each data frame, return the data frame to a list, and compare the original and updated data for one data frame.

# Object names gen01 ... gen09.
theNames <- sprintf("gen%02d", 1:9)
# Look up each data frame by name and return a copy with HP multiplied by ten.
updatedData <- lapply(theNames, function(frame_name) {
     scaled <- get(frame_name)
     scaled$HP <- 10 * scaled$HP
     scaled
})
# compare the first few rows of gen01 Pokemon HP
head(
     data.frame(
          original.HP = gen01$HP,
          updatedHP = updatedData[[1]][["HP"]]
     )
)

...and the output:

> head(data.frame(original.HP = gen01$HP,updatedHP = updatedData[[1]][["HP"]]))
  original.HP updatedHP
1          45       450
2          60       600
3          80       800
4          39       390
5          58       580
6          78       780

How to create a loop over different datasets?

3 Answers