-1

I have a number of lists containing data frames, and I have code that performs that functions that I want when one of the lists is named. What I would like to do is automate the process so that the functions are performed on each of the lists in my environment (environment also contains other things that I don't want affected). Where I create new objects, I would also like to automate naming them based on the list name. Note that the functions are executed on one variable within each element of the list, rather than on the list or all variables as a whole.

In the code below, M10210102.list is the name of one of the lists. INT/EXT anything is one of the variables contained within the elements of the lists. In the failcode below, namesMCH0list is a list containing only the names of all of the lists. ONT.list is the master list from which all of the smaller lists were created, by splitting ONT.list.

My question: is there a way to efficiently automate the functions? Thanks in advance!

Below are the functions I would like to perform:

PercChangeDiff <- lapply(M10210102.list, function(x) {

  INTdif <- c(NA, diff(x[["INTprice"]]))
  EXTdif <- c(NA, diff(x[["EXTprice"]]))
  INTperc <- (INTdif / x[["INTprice"]]) * 100
  EXTperc <- (EXTdif / x[["EXTprice"]]) * 100
  return(list(x[["WEEK"]], INTperc, EXTperc))
}
)

for(i in seq_along(PercChangeDiff)){
  names(PercChangeDiff[[i]]) <- c("WEEK","INTpercent", "EXTpercent")
}


#removing elements from list if they have fewer than 34 observations 
for (i in rev(seq_along(PercChangeDiff))){
  if (length(PercChangeDiff[[i]][["INTpercent"]]) < 34) (PercChangeDiff[[i] <- NULL)  
}

#removing elements from list if INTprice or EXTprice does not change
for (i in rev(seq_along(PercChangeDiff))){
  if (length(unique(PercChangeDiff[[i]][["INTpercent"]])) < 2) (PercChangeDiff[[i]] <- NULL) 
  if (length(unique(PercChangeDiff[[i]][["EXTpercent"]])) < 2) (PercChangeDiff[[i]] <- NULL) 
}

#############

####### AGGREGATING MEANS PER DATE FOR ALL ARTS WITHIN MCH0 #######

#removing first date 
for (i in seq_along(PercChangeDiff)){
  PercChangeDiff[[i]][["WEEK"]][[1]] <- NA
}

#aggregating means
library(tidyverse)
PercChangeAvg <- map(PercChangeDiff,as_tibble) %>%
  bind_rows %>%
  group_by(WEEK) %>%
  summarize_all(mean)


 PercChangeAvg <- PercChangeAvg[complete.cases(PercChangeAvg), ]

############### CREATING TIME SERIES ####################

#create a list of timeseries values for INT and EXT
timeINTavg <- ts(PercChangeAvg$INTpercent, frequency = 52, start = c(2017, 40), end = c(2018, 23))
timeEXTavg <- ts(PercChangeAvg$EXTpercent, frequency = 52, start = c(2017, 40), end = c(2018, 23))


#applying ccf to list of timeseries data
crossCorrAvg <- ccf(timeINTavg, timeEXTavg, lag.max = 100)

Below is what I have attempted with the first function, unsuccessfully:

for (g in seq_along(namesMCH0list)){
  lapply(get(namesMCH0list[g]), function(x) {

    INTdif <- c(NA, diff(x[["INTprice"]]))
    EXTdif <- c(NA, diff(x[["EXTprice"]]))
    INTperc <- (INTdif / x[["INTprice"]]) * 100
    EXTperc <- (EXTdif / x[["EXTprice"]]) * 100
    return( assign(paste("PercChangeDiff", paste(namesMCH0list[g]), sep = "."), list(x[["WEEK"]], INTperc, EXTperc)))
  }
  )
}

##this does not work ##
for (i in seq_along(ONT.list)){
  x <<- paste(namesMCH0list[i])
  for (g in rev(seq_along(get(x)))){
    if (length((get(x))[[g]][["INTprice"]]) < 20) ((get(x))[[g]] <<- NULL)  
  }
}


for (g in seq_along(ONT.list)){ 
  x <<- paste(namesMCH0list[g])
  lapply(paste(x), function(x) { 
  if (length(get(x)[["INTprice"]]) < 20) (NULL)
})
}


for(w in seq_along(ONT.list)){
  lapply(get(namesMCH0list[g]), function (x) {
    if(length(x[["INTprice"]] < 34 )) (x <- NULL)
  })
  NULL
}

Example of data (one list containing two elements):

 list(structure(list(WEEK = structure(c(17441, 17448, 17455, 17462, 17469, 17476, 17483, 17490, 17497, 17504, 17511, 17518, 17546, 17553, 17560, 17567, 17574, 17581, 17588, 17595, 17602, 17609, 17616, 17623, 17630, 17637, 17644, 17651, 17658, 17665, 17672, 17679, 17686, 17693), class = "Date"), REP_ART_UOM = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "20180929-EA", class = "factor"), MCH_0_CD = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "M10210101", class = "factor"), INTprice = c(3.97, 3.97, 3.97, 3.97, 3.97, 3.97, 3.97, 3.97, 3.97, 3.97, 3.97, 3.97, 3.97, 3.97, 3.97, 3.97, 3.97, 3.97, 3.97, 3.97, 3.97, 3.97, 3.97, 3.97, 3.97, 3.97, 3.97, 3.97, 3.97, 3.97, 3.97, 3.97, 3.97, 3.97), EXTprice = c(4.48, 4.48, 4.48, 4.48, 4.48, 4.48, 4.48, 4.48, 4.48, 4.48, 4.48, 4.48, 4.48, 4.48, 4.48, 4.48, 4.48, 4.48, 4.48, 4.48, 4.48, 4.48, 4.48, 4.48, 4.48, 4.48, 4.48, 4.48, 4.48, 4.48, 4.48, 4.48, 4.48, 4.48)), row.names = c(931L, 3626L, 6325L, 9021L, 11709L, 14368L, 17008L, 19764L, 22528L, 25193L, 27849L, 30489L, 33126L, 35769L, 38426L, 41141L, 44030L, 46911L, 49770L, 52643L, 55538L, 58423L, 61320L, 64256L, 67195L, 70117L, 73049L, 75982L, 78950L, 81924L, 84886L, 87848L, 90816L, 93778L), class = "data.frame"), 
structure(list(WEEK = structure(c(17441, 17448, 17455, 17462, 
17469, 17476, 17483, 17490, 17497, 17504, 17511, 17518, 17546, 
17553, 17560, 17567, 17574, 17581, 17588, 17595, 17602, 17609, 
17616, 17623, 17630, 17637, 17644, 17651, 17658, 17665, 17672, 
17679, 17686, 17693), class = "Date"), REP_ART_UOM = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L), .Label = "20323113-EA", class = "factor"), MCH_0_CD = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L), .Label = "M10210101", class = "factor"), INTprice = c(3.47, 
3.47, 3.47, 3.47, 3.47, 3.47, 3.47, 3.47, 3.47, 3.47, 3.47, 
3.47, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2.47, 2.47, 2.47, 
2.47, 3.47, 3.47, 3.47, 3.47, 3.47, 3.47, 3.47), EXTprice = c(2, 
2, 3.37, 3.37, 3.37, 3.37, 3.37, 3.37, 3.37, 2, 2, 2, 2, 
2.03, 2, 2, 3.37, 3.37, 3.37, 3.37, 3.37, 3.37, 2, 2, 2, 
2, 2, 2, 2, 2, 2, 2, 2, 2)), row.names = c(1138L, 3834L, 
6537L, 9232L, 11917L, 14547L, 17199L, 19956L, 22718L, 25381L, 
28036L, 30673L, 33312L, 35955L, 38609L, 41357L, 44247L, 47124L, 
49984L, 52859L, 55752L, 58636L, 61536L, 64470L, 67408L, 70330L, 
73262L, 76204L, 79171L, 82147L, 85107L, 88068L, 91035L, 93994L
), class = "data.frame"))
Zuzanna
  • 7
  • 3
  • 2
    When asking for help, you should include a simple [reproducible example](https://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example) with sample input and desired output that can be used to test and verify possible solutions. – MrFlick Jun 25 '18 at 14:39
  • There is some misspellings and inconsistencies. You never return *INTdif*, *EXTdif*, or *INTprice* and *EXTprice*. And at end *crossCorr* is never assigned, so likely *crossCorrAvg*? – Parfait Jun 25 '18 at 15:13
  • @MrFlick Given that I have a number of large lists, what would you recommend reproducing? As the FAQ says, `dput()` output can be unwieldy. – Zuzanna Jun 25 '18 at 15:23
  • Make a small example that only has two object or something. We don't need to see your "real" data. We just need something to test with, and if it works on the test, it should work on your real data. – MrFlick Jun 25 '18 at 15:25
  • @Parfait INTdif/EXTdif don't need to be returned, they're used to calculate INTperc/EXTperc (I realize that might not be the most efficient way to code, but it works). INTprice and EXT price are existing variables within the elements of the list. You're right about crossCorr, that's a remnant of previous code that was never adapted to the newest version, it is supposed to be crossCorr Avg. – Zuzanna Jun 25 '18 at 15:25
  • *INTprice* and *EXTprice* need to be returned when used later in removal. – Parfait Jun 25 '18 at 15:30
  • @Parfait I see what you're referring to. Probably another remnant of the old code. That should be INT/EXTpercent – Zuzanna Jun 25 '18 at 15:36
  • But you do *INT/EXTpercent* removal just before it! See my answer, where I add them in `lapply` return. – Parfait Jun 25 '18 at 15:38

1 Answers1

0

Consider working with an inner dataframe and run filtering from there. And to fully automate, generalize your process in a master function returning last object, crossCorrAvg.

master_function_process <- function(currlist) {
    PercChangeDiff <- lapply(currlist, function(x) {  
        # NEW DATAFRAME COLUMNS 
        x$INTdif <- c(NA, diff(x$INTprice))
        x$EXTdif <- c(NA, diff(x$EXTprice))
        x$INTpercent <- (x$INTdif / x$INTprice) * 100
        x$EXTpercent <- (x$EXTdif / x$EXTprice) * 100

        # DATAFRAME SUBSETTED COLUMNS
        tmp <- x[c("WEEK", "INTprice", "EXTprice", "INTpercent", "EXTpercent")]

        # FILTERS
        tmp <- tmp[tmp$INTpercent >= 34,]
        tmp <- tmp[tmp$INTprice >= 2 | tmp$EXTprice >= 2,]

        # REMOVE FIRST DATE
        tmp$WEEK[[1]] <- NA
        return(tmp)
   })

   # AVERAGE AGGREGATION BY WEEK (NO map)
   PercChangeAvg <- bind_rows(PercChangeDiff) %>%
        group_by(WEEK) %>%
        summarize_all(mean) %>%
        filter(complete.cases(.))

   # TIME SERIES VALUES FOR INT AND EXT
   timeINTavg <- ts(PercChangeAvg$INTpercent, frequency = 52, start = c(2017, 40), end = c(2018, 23))
   timeEXTavg <- ts(PercChangeAvg$EXTpercent, frequency = 52, start = c(2017, 40), end = c(2018, 23))

  # APPLY CCF AND ADD snames
  crossCorrAvg <- ccf(timeINTavg, timeEXTavg, lag.max = 100)
  crossCorrAvg <- lapply(crossCorrAvg, function(i) within(i, snames <- names(i))
}

# PROCESS ALL LISTS RETRIEVED WITH mget
master_crossCorrAvg_list <- lapply(mget(namesMCH0list), master_function_process) 
Parfait
  • 104,375
  • 17
  • 94
  • 125
  • Thank you for writing this out. It produces two plots, however, which is strange. Is there any way to create individual PercChangeAvg lists for each existing list, so that I can refer to them if needed? – Zuzanna Jun 25 '18 at 16:48
  • Add a return line at end `return(PercChangeAvg)` or list return, `return(list(PercChangeAvg, crossCorrAvg)`. By default, functions return last line (here being *crossCorrAvg*). Not sure where plots come in. Be sure to start from a clean/empty R session. – Parfait Jun 25 '18 at 17:01