0

My question is about `for` loops and their alternatives:

I have multiple consecutive for loops that run over a big data frame. Each loop defines new variables/data frames that are needed to compute the next loop. Is there a way (maybe lapply or similar) to restructure this so that the process runs faster?

# Sample work table: one row per (Class, Places, Names, d_stage) entry with
# its Final.Work value and an optional Parent name (NA when there is none).
# Built directly with data.frame() so the result really is a data frame; a
# bare structure(list(...)) without class/row.names attributes is only a list.
databackend2 = data.frame(
  Class = c("T1", "T1", "T2", "T3", "T5", "Q12"),
  Places = c("Orlando", "Orlando", "Boston", "LA", "New York", "New York"),
  Names = c("Planist", "Plantist", "Engi", "Engi", "Shifter", "Automatist"),
  Final.Work = c(0.08, 0.05, 0.06, 0.05, 0.055, 0.043),
  Parent = c(NA, NA, NA, "Planist", "Engi", "Engi"),
  d_stage = c(1, 2, 2, 3, 5, 2)
)

# Sample lookup table: one row per place/name with the current number of
# employees (Cu.No.Of.Emp), Sh.fact, the current stage (Cu.Stage) and the two
# target stages. `Type = NA` is recycled to every row.
databackend3 = data.frame(
  Type = NA,
  Places = c("Orlando", "Colorado", "Boston", "LA", "New York", "Florida"),
  D.Stage = c(1.4, 1.5, 2.3, 3.4, 5.1, 2.5),
  X4 = c(3, 3, 4, 5, 5, 4),
  X5 = c(4, 5, 5, 6, 6, 6),
  Names = c("Sum", "Plantist", "Engi", "Fieldor", "Shifter", "Automatist"),
  Cu.No.Of.Emp = c(32, 7, 8, 9, 2, 6),
  Sh.fact = c(NA, 1, 1, 3, 3, 4),
  Cu.Stage = c(1, 1, 2, 3, 5, 2),
  One.Target.Stage = c(3, 3, 4, 5, 5, 4),
  Two.Target.Stage = c(4, 5, 5, 6, 6, 6)
)

# Re-normalise `work_norm` for every Class in `target_df` that has a parent.
#
# For each distinct Class, the Parent of the first matching row is used as the
# source name. Classes whose Parent is NA keep the work_norm they already
# have. Otherwise work_norm is replaced by
#   (n_source / sh_fact_source) * (Final.Work / sum_cu_work_source) /
#     (n_cu_norm_fact / sh_fact_names)
# where n_source and sh_fact_source come from the parent's row in `emp_df`
# (a head count of 0 is treated as 1) and sum_cu_work_source is the parent's
# total Final.Work in `work_df` at stage `ds_cu`.
#
# target_df       rows for one name/place/target stage; needs the columns
#                 Class, Parent, Final.Work and work_norm
# ds_cu           current stage of the place being processed
# n_cu_norm_fact  head count of the current name (1 when it is 0 or unknown)
# sh_fact_names   Sh.fact of the current name
# work_df         work table (columns Names, d_stage, Final.Work)
# emp_df          lookup table (columns Names, Cu.No.Of.Emp, Sh.fact)
#
# Returns `target_df` with the updated work_norm column.
renormalise_by_parent = function(target_df, ds_cu, n_cu_norm_fact,
                                 sh_fact_names, work_df, emp_df) {
  # unique(): repeated classes would only redo the identical assignment,
  # because the new value depends on Final.Work and never on work_norm.
  for (class in unique(target_df$Class)) {
    class_rows = which(target_df$Class == class)
    source_names = target_df$Parent[class_rows][1]
    if (is.na(source_names)) next

    source_emp = emp_df %>% filter(Names == source_names)
    if (nrow(source_emp) == 0) {
      # Without a lookup row there is nothing to normalise against; keep the
      # default normalisation instead of failing on a zero-length assignment.
      warning("Parent '", source_names, "' not found in the lookup table; ",
              "keeping the default work_norm for class '", class, "'",
              call. = FALSE)
      next
    }

    n_source_names = source_emp$Cu.No.Of.Emp
    n_source_names[which(n_source_names == 0)] = 1
    sh_fact_source = source_emp$Sh.fact

    work_old = target_df$Final.Work[class_rows]
    sum_of_cu_work_source = work_df %>%
      filter(Names == source_names, d_stage == ds_cu) %>%
      pull(Final.Work) %>%
      sum()

    target_df[class_rows, "work_norm"] =
      (n_source_names / sh_fact_source) *
      (work_old / sum_of_cu_work_source) /
      (n_cu_norm_fact / sh_fact_names)
  }
  target_df
}

# Build the normalised rows of one name/place for a single target stage.
#
# Keeps the rows of `names_place_df` at stage `ds_target`, attaches the
# current-stage work per Class (`cu_df`, columns Class and Cu.Work), sets the
# default work_norm = Final.Work / sum_cu_work and then applies the
# parent-based re-normalisation. The remaining arguments are passed on to
# renormalise_by_parent().
normalise_target_stage = function(names_place_df, cu_df, ds_target, ds_cu,
                                  sum_cu_work, n_cu_norm_fact, sh_fact_names,
                                  work_df, emp_df) {
  target_df = names_place_df %>%
    filter(d_stage == ds_target) %>%
    left_join(cu_df, by = "Class")
  target_df$work_norm = target_df$Final.Work / sum_cu_work
  renormalise_by_parent(target_df, ds_cu, n_cu_norm_fact, sh_fact_names,
                        work_df, emp_df)
}

# For every name: store its current head count in n_emp_df, compute the
# normalised work at both target stages for each of its places, write the
# per-name detail tables to CSV and store the two projected head counts
# (sum of work_norm times the current head count) in n_emp_df.
for (names in all_names) {
  # Head count and Sh.fact of the current name. The column is spelled
  # `Cu.No.Of.Emp`, as in the definition of databackend3.
  emp_names = databackend3 %>% filter(Names == names)
  n_cu = emp_names %>% pull(Cu.No.Of.Emp)
  sh_fact_names = emp_names %>% pull(Sh.fact)
  if (length(sh_fact_names) == 0) sh_fact_names = NA_real_

  # An unknown, missing or zero head count is stored as 0 but normalised
  # with a factor of 1 to avoid dividing by zero.
  n_cu_norm_fact = n_cu
  if (length(n_cu) == 0 || is.na(n_cu) || n_cu == 0) {
    n_cu = 0
    n_cu_norm_fact = 1
  }
  n_emp_df[n_emp_df$names == names, "n_cu"] = n_cu

  all_places_names = databackend2 %>%
    filter(Names == names) %>%
    pull(Places) %>%
    unique()
  n_places = length(all_places_names)

  # Pass 1: look everything up once per place and collect the current-stage
  # work, whose total is needed before any target stage can be normalised.
  stages_by_place = vector("list", n_places)
  names_by_place = vector("list", n_places)
  cu_by_place = vector("list", n_places)
  for (i in seq_len(n_places)) {
    place = all_places_names[i]
    stage_row = databackend3 %>% filter(Places == place)
    if (nrow(stage_row) != 1) {
      stop("Expected exactly one row in databackend3 for place '", place,
           "', found ", nrow(stage_row), call. = FALSE)
    }
    stages_by_place[[i]] = stage_row
    names_by_place[[i]] = databackend2 %>%
      filter(Names == names, Places == place)
    cu_by_place[[i]] = names_by_place[[i]] %>%
      filter(d_stage == stage_row$Cu.Stage) %>%
      select(Class, Cu.Work = Final.Work)
  }
  cu_work = unlist(lapply(cu_by_place, function(cu_df) cu_df$Cu.Work))
  sum_of_cu_work_names = if (n_cu == 0) 1 else sum(cu_work)

  # Pass 2: normalised work of every place at the two target stages.
  one_by_place = vector("list", n_places)
  two_by_place = vector("list", n_places)
  for (i in seq_len(n_places)) {
    stage_row = stages_by_place[[i]]
    one_by_place[[i]] = normalise_target_stage(
      names_by_place[[i]], cu_by_place[[i]], stage_row$One.Target.Stage,
      stage_row$Cu.Stage, sum_of_cu_work_names, n_cu_norm_fact,
      sh_fact_names, databackend2, databackend3
    )
    two_by_place[[i]] = normalise_target_stage(
      names_by_place[[i]], cu_by_place[[i]], stage_row$Two.Target.Stage,
      stage_row$Cu.Stage, sum_of_cu_work_names, n_cu_norm_fact,
      sh_fact_names, databackend2, databackend3
    )
  }
  df_names_one = bind_rows(one_by_place)
  df_names_two = bind_rows(two_by_place)

  write.csv2(df_names_one, file = paste0("debugging\\one\\", names, ".csv"), row.names = FALSE)
  write.csv2(df_names_two, file = paste0("debugging\\bl\\", names, ".csv"), row.names = FALSE)

  # A name without any place yields tables without a work_norm column;
  # its total is then 0.
  sum_work_norm_one = 0
  if ("work_norm" %in% colnames(df_names_one)) {
    sum_work_norm_one = sum(df_names_one$work_norm, na.rm = TRUE)
  }
  sum_work_norm_two = 0
  if ("work_norm" %in% colnames(df_names_two)) {
    sum_work_norm_two = sum(df_names_two$work_norm, na.rm = TRUE)
  }

  headcount_scale = if (n_cu > 0) n_cu else 1
  n_emp_df[n_emp_df$names == names, "n_target_one"] = sum_work_norm_one * headcount_scale
  n_emp_df[n_emp_df$names == names, "n_target_two"] = sum_work_norm_two * headcount_scale
}
user438383
  • 5,716
  • 8
  • 28
  • 43
math_ist
  • 69
  • 5
  • 2
    Please share your code. Without it we can't do much. – Benson_YoureFired May 19 '22 at 12:40
  • *I have a code that uses multiple while loops, is there a way i can just use vectorization?* do you see something wrong with that question? Once you answer my question then we will answer your question. If you feel my question lacks information, then that information that you feel it lacks include it in your question – Onyambu May 19 '22 at 12:41
  • The code is quite long and not very neat, which is why I didn't include it in the question. I was rather asking whether there is a general idea/technique used for these cases. – math_ist May 19 '22 at 12:59
  • @math_ist, the answer to your question is "yes! (probably)". But to get more specific than that, you'll have to give us more information (i.e. the code you're using and your data). Clearly, we're interested in your question and eager to help. Just share a little more detail about your case for a more detailed answer :) – Skaqqs May 19 '22 at 13:20
  • Probably best to share a specific loop with reproducible input data. – s_baldur May 19 '22 at 16:01
  • @Skaqqs I hope this works xD – math_ist May 19 '22 at 20:57
  • @math_ist, I see your code; thanks for sharing it. It looks like there might be some things missing (for example, where is `all_names` defined?). It would also be very helpful for you to include comments in your code that describe why you are doing what you are doing in each chunk, and a description of the overall goal of the script. As an aside, consider that there are stylistic and functional differences between assigning values with `<-` and `=` https://stackoverflow.com/questions/1741820/what-are-the-differences-between-and-assignment-operators-in-r – Skaqqs May 20 '22 at 12:05
  • `lapply` won't really give you a performance increase over a for loop. – user438383 Jun 14 '22 at 09:56

0 Answers0