Replace multiple for loops with something more efficient in R

Question

My question is about for loops and their alternatives:

I have multiple consecutive for loops for a big data frame. In every for loop there are new variables/dataframes defined which are necessary for computing the following for loop. Is there a way (maybe lapply or similar) which I can use in this case so that the process will be faster?

# Example input tables for the loop below.
# They are built with data.frame() so that the dplyr verbs used later
# (filter/pull/select/left_join) operate on real data frames; a bare
# structure(list(...)) without class/row.names attributes is only a list.

# One row per Class/Places/Names/d_stage combination; `Parent` is NA when the
# row has no parent name.
databackend2 = data.frame(
  Class = c("T1", "T1", "T2", "T3", "T5", "Q12"),
  Places = c("Orlando", "Orlando", "Boston", "LA", "New York", "New York"),
  Names = c("Planist", "Plantist", "Engi", "Engi", "Shifter", "Automatist"),
  Final.Work = c(0.08, 0.05, 0.06, 0.05, 0.055, 0.043),
  Parent = c(NA, NA, NA, "Planist", "Engi", "Engi"),
  d_stage = c(1, 2, 2, 3, 5, 2),
  stringsAsFactors = FALSE
)

# Per-row stage and count information; `Type = NA` is recycled to all six rows.
databackend3 = data.frame(
  Type = NA,
  Places = c("Orlando", "Colorado", "Boston", "LA", "New York", "Florida"),
  D.Stage = c(1.4, 1.5, 2.3, 3.4, 5.1, 2.5),
  X4 = c(3, 3, 4, 5, 5, 4),
  X5 = c(4, 5, 5, 6, 6, 6),
  Names = c("Sum", "Plantist", "Engi", "Fieldor", "Shifter", "Automatist"),
  Cu.No.Of.Emp = c(32, 7, 8, 9, 2, 6),
  Sh.fact = c(NA, 1, 1, 3, 3, 4),
  Cu.Stage = c(1, 1, 2, 3, 5, 2),
  One.Target.Stage = c(3, 3, 4, 5, 5, 4),
  Two.Target.Stage = c(4, 5, 5, 6, 6, 6),
  stringsAsFactors = FALSE
)

# For every entry of `all_names`, fills the `n_cu`, `n_target_one` and
# `n_target_two` columns of `n_emp_df`, using `databackend2` (work per
# Class/Places/Names/d_stage) and `databackend3` (counts and stages).
# Assumes dplyr is attached and that `all_names` and `n_emp_df` are defined
# earlier in the script -- neither definition is visible here.

# Overwrites `work_norm` for every class of `target_df` whose `Parent` is not
# NA: the class's `Final.Work` is rescaled with the parent's `Cu.No.Of.Emp`,
# `Sh.fact` and summed `Final.Work` at stage `ds_cu`, relative to
# `n_cu_norm_fact / sh_fact_names`. Classes without a parent keep the
# `work_norm` they already have. Returns the modified data frame.
rescale_by_parent = function(target_df, ds_cu, n_cu_norm_fact, sh_fact_names,
                             backend2, backend3) {
  for (cls in unique(target_df$Class)) {
    class_rows = which(target_df$Class == cls)
    source_names = target_df$Parent[class_rows[1]]
    if (is.na(source_names)) next

    source_row = backend3 %>% filter(Names == source_names)
    n_source_names = source_row %>% pull(Cu.No.Of.Emp)
    # A parent count of 0 is treated as 1; which() keeps this NA-safe.
    n_source_names[which(n_source_names == 0)] = 1
    sh_fact_source = source_row %>% pull(Sh.fact)

    work_old = target_df$Final.Work[class_rows]
    sum_of_cu_work_source = backend2 %>%
      filter(Names == source_names, d_stage == ds_cu) %>%
      pull(Final.Work) %>%
      sum()

    target_df[class_rows, "work_norm"] =
      (n_source_names / sh_fact_source) *
      (work_old / sum_of_cu_work_source) /
      (n_cu_norm_fact / sh_fact_names)
  }
  target_df
}

for (names in all_names) {
  names_row = databackend3 %>% filter(Names == names)
  # `[1]` turns "no row for this name" into NA, which the guard below maps to
  # 0 instead of failing on a zero-length `if` condition.
  n_cu = (names_row %>% pull(Cu.No.Of.Emp))[1]
  sh_fact_names = (names_row %>% pull(Sh.fact))[1]
  n_cu_norm_fact = n_cu
  if (is.na(n_cu) || n_cu == 0) {
    n_cu = 0
    n_cu_norm_fact = 1
  }
  n_emp_df[n_emp_df$names == names, "n_cu"] = n_cu

  all_places_names = databackend2 %>%
    filter(Names == names) %>%
    pull(Places) %>%
    unique()

  # Pass 1: total Final.Work of this name at the current stage of each place.
  # NOTE(review): the original filtered on `Places == place` twice; the first
  # condition is taken to be a typo for `Names == names` (matching the
  # per-name filters used everywhere else) -- confirm.
  sum_of_cu_work_names = 0
  for (place in all_places_names) {
    ds_cu = databackend3 %>% filter(Places == place) %>% pull(Cu.Stage)
    work_cu = databackend2 %>%
      filter(Names == names, Places == place, d_stage == ds_cu) %>%
      pull(Final.Work)
    sum_of_cu_work_names = sum_of_cu_work_names + sum(work_cu)
  }
  if (n_cu == 0) {
    sum_of_cu_work_names = 1
  }

  # Pass 2: per place, build the rows at both target stages and normalise them.
  # Results are collected in preallocated lists and bound once after the loop.
  parts_one = vector("list", length(all_places_names))
  parts_two = vector("list", length(all_places_names))
  for (i in seq_along(all_places_names)) {
    place = all_places_names[[i]]

    # Look the place up once instead of once per stage column.
    place_row = databackend3 %>% filter(Places == place)
    ds_cu = place_row %>% pull(Cu.Stage)
    ds_target_one = place_row %>% pull(One.Target.Stage)
    ds_target_two = place_row %>% pull(Two.Target.Stage)

    names_place = databackend2 %>% filter(Names == names, Places == place)
    df_names_place_cu = names_place %>%
      filter(d_stage == ds_cu) %>%
      select(Class, Cu.Work = Final.Work)

    df_names_place_target_one = names_place %>%
      filter(d_stage == ds_target_one) %>%
      left_join(df_names_place_cu, by = "Class")
    # Built from the stage-two rows (the original joined the stage-one table
    # a second time here).
    df_names_place_target_two = names_place %>%
      filter(d_stage == ds_target_two) %>%
      left_join(df_names_place_cu, by = "Class")

    # Default normalisation; classes with a parent are overwritten below.
    df_names_place_target_one$work_norm =
      df_names_place_target_one$Final.Work / sum_of_cu_work_names
    df_names_place_target_two$work_norm =
      df_names_place_target_two$Final.Work / sum_of_cu_work_names

    parts_one[[i]] = rescale_by_parent(
      df_names_place_target_one, ds_cu, n_cu_norm_fact, sh_fact_names,
      databackend2, databackend3
    )
    parts_two[[i]] = rescale_by_parent(
      df_names_place_target_two, ds_cu, n_cu_norm_fact, sh_fact_names,
      databackend2, databackend3
    )
  }
  df_names_one = bind_rows(parts_one)
  df_names_two = bind_rows(parts_two)

  # Debug dumps; file.path() keeps the paths portable. The second directory is
  # named "bl" in the original and is left unchanged.
  write.csv2(
    df_names_one,
    file = file.path("debugging", "one", paste0(names, ".csv")),
    row.names = FALSE
  )
  write.csv2(
    df_names_two,
    file = file.path("debugging", "bl", paste0(names, ".csv")),
    row.names = FALSE
  )

  # `[[` yields NULL (sum 0) when a name has no places and therefore no
  # `work_norm` column at all.
  sum_work_norm_one = sum(df_names_one[["work_norm"]], na.rm = TRUE)
  sum_work_norm_two = sum(df_names_two[["work_norm"]], na.rm = TRUE)
  n_scale = if (n_cu > 0) n_cu else 1
  n_emp_df[n_emp_df$names == names, "n_target_one"] = sum_work_norm_one * n_scale
  n_emp_df[n_emp_df$names == names, "n_target_two"] = sum_work_norm_two * n_scale
}

*I have a code that uses multiple while loops, is there a way i can just use vectorization?* do you see something wrong with that question? Once you answer my question then we will answer your question. If you feel my question lacks information, then that information that you feel it lacks include it in your question — Onyambu, May 19 '22 at 12:41
The code is quite long and not very neat, which is why I didn't include it in the question. I was rather asking whether there is a general idea/technique used for these cases. — math_ist, May 19 '22 at 12:59
@math_ist, the answer to your question is "yes! (probably)". But to get more specific than that, you'll have to give us more information (i.e. the code you're using and your data). Clearly, we're interested in your question and eager to help. Just share a little more detail about your case for a more detailed answer :) — Skaqqs, May 19 '22 at 13:20
Probably best to share a specific loop with reproducible input data. — s_baldur, May 19 '22 at 16:01
@math_ist, I see your code; thanks for sharing it. It looks like there might be some things missing (for example, where is `all_names` defined?). It would also be very helpful for you to include comments in your code that describe why you are doing what you are doing in each chunk, and a description of the overall goal of the script. As an aside, consider that there are stylistic and functional differences between assigning values with `<-` and `=` https://stackoverflow.com/questions/1741820/what-are-the-differences-between-and-assignment-operators-in-r — Skaqqs, May 20 '22 at 12:05
`lapply` won't really give you a performance increase over a for loop. — user438383, Jun 14 '22 at 09:56

Replace multiple for loops with something more efficient in R

0 Answers