1

I am trying to convert the for loop into any apply family for code optimization

Here is the sample data

my_data = structure(list(Sector = c("AAA", "BBB", "AAA", "CCC", "AAA",
    "BBB", "AAA", "CCC"), Sub_Sector = c("AAA1", "BBB1", "AAA1",
    "CCC1", "AAA1", "BBB2", "AAA1", "CCC2"), count = c(1L, 1L, 1L,
    1L, 1L, 1L, 1L, 1L), type = c("Actual", "Actual", "Actual", "Actual",
    "Actual", "Actual", "Actual", "Actual")), class = "data.frame", row.names = c(NA,
    -8L))

Actual Function (Using for loop) this function gives us the expected output

expand_collapse_compliance <- function(right_table){
  
  sector_list <- unique(right_table$Sector)
  df = data.frame("Sector1"=c(""),"Sector"=c(""),"Sub_Sector"=c(""),"Actual"=c(""))
  
  for(s in sector_list){
    df1 = right_table[right_table$Sector==s,]
    sector1 = df1$Sector[1]
    Sector = df1$Sector[1]
    Sub_Sector = ""
    actual = as.character(nrow(df1))
    mainrow = c(sector1,Sector,Sub_Sector,actual)
    df = rbind(df,mainrow)
    Sub_Sector_list <- unique(df1$Sub_Sector)
    
    for(i in Sub_Sector_list){
      df2 = right_table[right_table$Sub_Sector==i,]
      sector1 = df1$Sector[1]
      Sector = ""
      Sub_Sector = df2$Sub_Sector[1]
      actual = nrow(df2)
      subrow = c(sector1,Sector,Sub_Sector,actual)
      df = rbind(df,subrow)
    }
  }
  df = df[2:nrow(df),]
  df$Actual = as.numeric(df$Actual)
  df_total = nrow(right_table)
  df = rbind(df,c("","Total","",df_total))
  return(df)
  
}

DT::datatable(expand_collapse_compliance(mydata1), 
              rownames = F,escape = FALSE,
              selection=list(mode="single",target="row"),
              options = list(pageLength = 50,scrollX = TRUE,
                             dom = 'tp',ordering=F,
                             columnDefs = list(list(visible=FALSE, targets=0),
              list(className = 'dt-left', targets = '_all'))),class='hover cell-border stripe')

i tried to convert inner loop to lapply first while doing that the sub_sector value is not showing in the output table, please let me know how to fix any idea would be appreciated

expand_collapse_compliance <- function(right_table){
  sector_list <- unique(right_table$Sector)
  df = data.frame("Sector1"=c(""),"Sector"=c(""),"Sub_Sector"=c(""),"Actual"=c(""))
  
  for(s in sector_list){
    df1 = right_table[right_table$Sector==s,]
    sector1 = df1$Sector[1]
    Sector = df1$Sector[1]
    Sub_Sector = ""
    actual = as.character(nrow(df1))
    mainrow = c(sector1,Sector,Sub_Sector,actual)
    df = rbind(df,mainrow)
    Sub_Sector_list <- unique(df1$Sub_Sector)
    
    #for(i in Sub_Sector_list){
      lapply(Sub_Sector_list, function(x){
      df2 = right_table[right_table$Sub_Sector==Sub_Sector_list,]
      sector1 = df1$Sector[1]
      Sector = ""
      Sub_Sector = df2$Sub_Sector[1]
      actual = nrow(df2)
      subrow = c(sector1,Sector,Sub_Sector,actual)
      df = rbind(df,subrow)
      })
  }
  df = df[2:nrow(df),]
  df$Actual = as.numeric(df$Actual)
  df_total = nrow(right_table)
  df = rbind(df,c("","Total","",df_total))
  return(df)
  
}
Eliyas
  • 15
  • 7

3 Answers3

2

Using dplyr and tidyr you could do:

Note: I dropped the DT part.

library(dplyr)
library(tidyr)

expand_collapse_compliance1 <- function(x) {
  x <- x %>% 
    count(Sector, Sub_Sector, name = "Actual") %>% 
    group_split(Sector) %>% 
    lapply(function(x) {
      main <- group_by(x, Sector) %>% summarise(Actual = sum(Actual)) 
      bind_rows(main, x)
    }) %>%
    bind_rows() %>% 
    mutate(Sector1 = Sector) %>%
    select(Sector1, Sector, Sub_Sector, Actual)
  
  total <- x %>%
    filter(is.na(Sub_Sector)) %>% 
    group_by(Sector = "Total") %>% 
    summarise(Actual = sum(Actual))
  
  bind_rows(x, total) %>% 
    mutate(Sector = ifelse(!is.na(Sub_Sector), "", Sector)) %>% 
    replace_na(list(Sub_Sector = "", Sector1 = ""))
}

expand_collapse_compliance1(my_data)
#> # A tibble: 9 × 4
#>   Sector1 Sector  Sub_Sector Actual
#>   <chr>   <chr>   <chr>       <int>
#> 1 "AAA"   "AAA"   ""              4
#> 2 "AAA"   ""      "AAA1"          4
#> 3 "BBB"   "BBB"   ""              2
#> 4 "BBB"   ""      "BBB1"          1
#> 5 "BBB"   ""      "BBB2"          1
#> 6 "CCC"   "CCC"   ""              2
#> 7 "CCC"   ""      "CCC1"          1
#> 8 "CCC"   ""      "CCC2"          1
#> 9 ""      "Total" ""              8

expand_collapse_compliance(my_data)
#>    Sector1 Sector Sub_Sector Actual
#> 2      AAA    AAA                 4
#> 3      AAA              AAA1      4
#> 4      BBB    BBB                 2
#> 5      BBB              BBB1      1
#> 6      BBB              BBB2      1
#> 7      CCC    CCC                 2
#> 8      CCC              CCC1      1
#> 9      CCC              CCC2      1
#> 91          Total                 8
stefan
  • 90,330
  • 6
  • 25
  • 51
  • 1
    Thanks you so much stefan – Eliyas Dec 23 '21 at 13:21
  • I checked your code in the [benchmark in my answer](https://stackoverflow.com/a/70461290/6574038), probably something in it is not working properly yet? Or I have a flaw in my new sample data. – jay.sf Dec 23 '21 at 16:09
  • 1
    Hej @jay.sf. Yep. In my tidy approach the `Sub_Sector`s are ordered alphabetically, i.e. with your example data it results in `A1`, `A10`, ... instead of `A1`, `A2`, ... Moreover, in my tidy approach `Actual` is a numeric while running your function it is character. – stefan Dec 23 '21 at 17:12
  • https://stackoverflow.com/questions/70473732/for-loop-conversation-to-apply-variants-in-r-for-code-optimization <- i have a similar question with multiple columns kindly check and let me know your suggestion – Eliyas Dec 24 '21 at 14:19
1

The appropriate function of the *apply family could be tapply using a split-apply-combine approach. Since we need tapply only when there are multiple Sub_Sector's, we implement a case handling for sake of speed.

expand_collapse_complianceA <- \(data) {
  r <- do.call(rbind, c(by(data, data$Sector, \(x) {
    if (length(unique(x$Sub_Sector)) != 1L) {
      tt <- t(unname(with(x, tapply(count, list(Sector, Sub_Sector), sum))))
      tt <- cbind(x[!duplicated(x$Sub_Sector), 1:2], foo='', Actual=tt)
    } else {
      tt <- as.data.frame(t(c(unlist(x[!duplicated(x$Sub_Sector), 1:2]), foo='',
                              Actual=sum(x$count))))
    }
    rbind(c(tt[1, 1], '', tt[1, 1], sum(as.numeric(tt[, 4]))), tt)[c(1, 3, 2, 4)]
  }), make.row.names=FALSE))
  rbind(r, c('', 'Total', '', sum(as.numeric(r$Actual[!r$foo %in% ''])))) |>
    setNames(c('Sector1', 'Sector', 'Sub_Sector', 'Actual'))
}

Note: R version 4.1.2 (2021-11-01).

Gives

expand_collapse_compliance(my_data)
#   Sector1 Sector Sub_Sector Actual
# 1     AAA    AAA                 4
# 2     AAA              AAA1      4
# 3     BBB    BBB                 2
# 4     BBB              BBB1      1
# 5     BBB              BBB2      1
# 6     CCC    CCC                 2
# 7     CCC              CCC1      1
# 8     CCC              CCC2      1
# 9          Total                 8


expand_collapse_complianceA(my_data) |> 
  (\(x) DT::datatable(
    x, rownames=F, escape=FALSE, selection=list(mode="single", target="row"), 
    options=list(pageLength=50, scrollX=TRUE, dom='tp', ordering=F, 
                 columnDefs=list(list(visible=FALSE, targets=0),
                                 list(className='dt-left', targets='_all'))), 
    class='hover cell-border stripe'))()

enter image description here

expand_collapse_complianceA now needs just 1/10 of the time as the original for loop. Here a benchmark (tested on 1080 rows).

# Unit: milliseconds
#       expr        min         lq       mean     median         uq       max neval cld
#    ecc_for 304.723781 305.426934 346.878188 308.208294 335.944407 598.94351    10   c
# ecc_tapply  29.768177  29.851975  31.083977  30.611982  32.058980  34.50901    10 a  
#   ecc_tidy 135.326594 135.952068 143.967550 138.475437 149.352409 164.94652    10  b 
#     ecc_DT   3.267969   3.611711   4.610916   3.664493   3.707528  13.48797    10 a  

Of course data.table is faster. However, I's like to see performance when the data is about to exhaust the RAM.

Benchmark Code:

microbenchmark::microbenchmark(
  ecc_for=expand_collapse_compliance(dat),
  ecc_tapply=expand_collapse_complianceA(dat),
  ecc_tidy={library(dplyr);library(tidyr);expand_collapse_compliance1(dat)},
  ecc_DT={library(data.table);expand_collapse_complianceDT(as.data.table(dat))},
  times=10L)

Note, that the "tidy" version has some flaws so far (at least with the new data).

res_for <- expand_collapse_compliance(dat)
res_tapply <- expand_collapse_complianceA(dat)
res_tidy <- {library(dplyr);library(tidyr);expand_collapse_compliance1(dat)}


all.equal(res_for, res_tapply, check.attributes=FALSE)
# [1] TRUE
all.equal(res_for, res_tidy, check.attributes=FALSE)
# [1] "Component “Sub_Sector”: 1053 string mismatches"             
# [2] "Component “Actual”: target is character, current is numeric"

Data

dat <- expand.grid(Sector=c("AA", "AB", "AC", "AD", "AE", "AF", "AG", "AH", "AI", "AJ", 
                     "AK", "AL", "AM", "AN", "AO", "AP", "AQ", "AR", "AS", "AT", "AU", 
                     "AV", "AW", "AX", "AY", "AZ", "BA"),
            Sub_Sector=1:40, stringsAsFactors=F)
dat <- transform(dat, Sub_Sector=Reduce(paste0, dat[1:2]), count=1, type='Actual')
dat <- dat[order(dat$Sector), ]
jay.sf
  • 60,139
  • 8
  • 53
  • 110
  • Thanks a lot @jay.sf – Eliyas Dec 23 '21 at 13:22
  • @Eliyas You're welcome, note the update with enhanced version of your `for` loop. – jay.sf Dec 23 '21 at 13:57
  • 1
    Be carefull though on such speedtests on very tiny sample data. Often `for` loops look faster as any other library's function calls take a tiny tiny bit of extra time ONCE but that you win back HUGE time processing some actual data. – Merijn van Tilborg Dec 23 '21 at 14:39
  • can you tell how to check the differences @Jay.sf – Eliyas Dec 23 '21 at 14:41
  • Let me post some test below my answer. – Merijn van Tilborg Dec 23 '21 at 14:43
  • i think you forget to define sector_list in the code – Eliyas Dec 23 '21 at 14:43
  • df <- type.convert(as.data.frame(M), as.is=TRUE) |> setNames(c('Sector1', 'Sector', 'Sub_Sector', 'count')) i am getting an error in the line – Eliyas Dec 23 '21 at 14:50
  • @Eliyas Thanks, `sector_list` replaced. You probably have an old R version, you need R > 4.1 for the `|> ` pipe. – jay.sf Dec 23 '21 at 15:03
  • @Eliyas Actually I think your original function performs well, what is the reason you want to change it? – jay.sf Dec 23 '21 at 15:05
  • can you tell me how to check the time differences @jay.sf – Eliyas Dec 23 '21 at 15:07
  • @Eliyas Yes, see update. We don't have appropriate data yet, though. – jay.sf Dec 23 '21 at 15:10
  • @Eliyas Please note, I've corrected a flaw in `expand_collapse_complianceA()`, it also added the Actual of Sector to the Total. – jay.sf Dec 23 '21 at 15:30
  • sure and thanks for everything @jay.sf – Eliyas Dec 23 '21 at 15:35
  • @Eliyas Forget the new for loop, actually your idea with the `*apply` family was great and `tapply` actually is fastest, see updated benchmark. – jay.sf Dec 23 '21 at 15:40
  • 1
    @MerijnvanTilborg Thanks for the objection, you were really right! I created some better testing data and rerun the benchmark! – jay.sf Dec 23 '21 at 15:42
  • @jay.sf it is usually not fair either to include your library calls **within** your benchmark – Merijn van Tilborg Dec 23 '21 at 15:47
  • @MerijnvanTilborg I have the different opinion, that everything necessary should be included (apart from R), at least to see both versions. It's similar when parallelizing code, the time to start the clusters, or in `for` loops to define objects before the loop; actually _not_ including all this could also be seen as cheating. – jay.sf Dec 23 '21 at 15:52
  • Works two ways, usually people have either tidiverse or data.table as their way to go. You just have them loaded anyhow. Sure sometimes you prefer base for certain tasks. I always use data.table by personal preference and it loads in .Rprofile. No need to punish every single piece of code by the libraries. So if you benchmark 2 functions using one library? Do you then share the load? – Merijn van Tilborg Dec 23 '21 at 15:58
  • @MerijnvanTilborg I haven't loaded any of both, though, just clutters my auto completion :) – jay.sf Dec 23 '21 at 16:00
  • "Of course data.table is faster. However, I's like to see performance when the data is about to exhaust the RAM." You probably gonna love data.table then even more as it tends to be way more memory efficient than others. – Merijn van Tilborg Dec 23 '21 at 16:03
  • @MerijnvanTilborg I just put a link under it, did you notice? – jay.sf Dec 23 '21 at 16:14
  • Yes have seen it, but we are not dcasting here ;) It is always good to know the strength and weakness of any library and its functions. Anyhow this gets a bit too much off topic though. – Merijn van Tilborg Dec 23 '21 at 16:17
  • @MerijnvanTilborg You made a point, maybe you should write a new article ;) – jay.sf Dec 23 '21 at 16:20
  • https://stackoverflow.com/questions/70473732/for-loop-conversation-to-apply-variants-in-r-for-code-optimization <- i have a similar question with multiple columns kindly check and let me know your suggestion – Eliyas Dec 24 '21 at 14:21
1

There is no need for any loops nor apply, what we want here is three different group counts and some formatting. With the assumption - and as seen in the sample data - there is no need for a split.

my_data = structure(list(Sector = c("AAA", "BBB", "AAA", "CCC", "AAA",
    "BBB", "AAA", "CCC"), Sub_Sector = c("AAA1", "BBB1", "AAA1",
    "CCC1", "AAA1", "BBB2", "AAA1", "CCC2"), count = c(1L, 1L, 1L,
    1L, 1L, 1L, 1L, 1L), type = c("Actual", "Actual", "Actual", "Actual",
    "Actual", "Actual", "Actual", "Actual")), class = "data.frame", row.names = c(NA,
    -8L))

library(data.table)
setDT(my_data)

expand_collapse_compliance <- function(x) {
  x <- rbindlist(list(
    x[, .(Sector1 = Sector, Actual = .N), by = Sector], 
    setnames(x[, .(Actual = .N), by = .(Sector, Sub_Sector)], "Sector", "Sector1"),
    x[, .(Sector = "Total", Actual = .N)]
  ), fill = T)
  setcolorder(x, c("Sector1", "Sector", "Sub_Sector", "Actual"))
  setorder(x, Sector1, Sector, na.last = T)
  x
}

expand_collapse_compliance(my_data)

#    Sector1 Sector Sub_Sector Actual
# 1:     AAA    AAA       <NA>      4
# 2:     AAA   <NA>       AAA1      4
# 3:     BBB    BBB       <NA>      2
# 4:     BBB   <NA>       BBB1      1
# 5:     BBB   <NA>       BBB2      1
# 6:     CCC    CCC       <NA>      2
# 7:     CCC   <NA>       CCC1      1
# 8:     CCC   <NA>       CCC2      1
# 9:    <NA>  Total       <NA>      8

sidenote There is no need to convert NA to "" as in shiny DT will show blancs for it.

Speedtest

As I mentioned that for is often faster on very small data sets as any library solutions use some functions that take some time to load once...

my_data_small = structure(list(Sector = c("AAA", "BBB", "AAA", "CCC", "AAA",
    "BBB", "AAA", "CCC"), Sub_Sector = c("AAA1", "BBB1", "AAA1",
    "CCC1", "AAA1", "BBB2", "AAA1", "CCC2"), count = c(1L, 1L, 1L,
    1L, 1L, 1L, 1L, 1L), type = c("Actual", "Actual", "Actual", "Actual",
    "Actual", "Actual", "Actual", "Actual")), class = "data.frame", row.names = c(NA,
    -8L))

library(data.table)
setDT(my_data)

   test replications elapsed relative
2  eccB          150    0.32     1.00
1 eccDT          150    0.72     2.25

# well just make it a milion times bigger :D
my_data_large <- rbindlist(rep(list(my_data_small), 1000000L))

   test replications elapsed relative
2  eccB           50   79.30    5.146
1 eccDT           50   15.41    1.000
Merijn van Tilborg
  • 5,452
  • 1
  • 7
  • 22