1

I've been stuck on this loop for a while now (as seen by my question history), but I think I'm getting close to fixing it, thanks a lot to the help I've gotten on stack overflow.

I noticed that in my plots, every plot uses data_percentage_list[391], the last element in the list. I've done a bunch of things to try to stop that from occurring, but using the below code:

# Create graphs in list

# Create titles for plots
# titlenames holds the same values as harps; it is used below both for the
# PDF file names and for the hotel-name lookup.
titlenames <- c(harps)

# For each element of Y, cross-tabulate column 5 against column 3 and convert
# every column of the table to percentages. Each result is stored in its own
# variable named data_percentage_<i>.
# NOTE(review): in the dput sample shown further down, column 5 is
# `Score Label` and column 3 is `Question ID` - confirm for the other elements.
 for (i in 1:length(harps)){

counts <- table(Y[[i]][[5]], Y[[i]][[3]])
nam <- paste("data_percentage_", i, sep ="")
assign(nam, apply(counts, 2, function(x){x*100/sum(x,na.rm=T)}))
 }

# Collect the data_percentage_<i> variables back into a single list.
data_percentage_list <- lapply(paste0("data_percentage_",1:length(harps)), get)

# Create pdf of score breakdown
# NOTE(review): the two loops are nested, so for every data index i the inner
# loop re-creates the PDF of EVERY harp j and draws data_percentage_list[[i]]
# into it. Each <j>.pdf is therefore overwritten length(harps) times and ends
# up holding the plot for the last i, while the subtitle (looked up from j)
# stays correct.
for (i in 1:length(harps)){ for(j in titlenames) {

# For Hotel Name Subtitle
hotelname <- hotel_report$`Hotel (Q15 1)`[hotel_report$`Harp Number`==j]

# Plot the Data 

# The file name depends only on j, while the plotted data depends only on i.
pdf(file = paste0(j, ".pdf"), paper = "USr", width=8, height=7)
par(mar = c(5.1, 7, 4.1, 2.1))
nam <- paste("breakdown_", i, sep ="")
# barplot() draws on the open PDF device; its return value is stored in a
# variable named breakdown_<i>.
assign(nam, barplot(data_percentage_list[[i]], main = "Breakdown of Property Score Distribution", sub = hotelname, 
        col = coul, las = 1, cex.names = .6, horiz = TRUE, yaxs="i", xlab = "Percentage",
        cex.axis = .8, cex.lab = .8, cex.main = .8, cex.sub = .8))
dev.off()
}}

Here length(harps) is 391, so there should be 391 plots, but the plots overwrite each other as the loop runs. When I open one of the PDFs and refresh it, it keeps changing to the latest iteration of the loop, until at the end every PDF shows the 391st property's data (with the correct hotel name, since that is pulled from j).

Does anyone know how I need to alter my code to get each plot to correspond to the correct data? Meaning, breakdown_54 should use data_percentage_list[54], and save as a pdf of that data, breakdown_55 should be data_percentage_list[55], and so on?

Thank you!

Edit: Following up after working on it some more.

The code below makes 391 different graphs, but each of the 391 pdfs has all 391 graphs instead of just their own respective graph like they should.

Is it easier to split up these pdfs correctly in this code versus fixing the code above?

# Create graphs in list

# Create titles for plots
# titlenames holds the same values as harps; it is used below both for the
# PDF file names and for the hotel-name lookup.
titlenames <- c(harps)

# For each element of Y, cross-tabulate column 5 against column 3 and convert
# every column of the table to percentages. Each result is stored in its own
# variable named data_percentage_<i>.
 for (i in 1:length(harps)){

counts <- table(Y[[i]][[5]], Y[[i]][[3]])
nam <- paste("data_percentage_", i, sep ="")
assign(nam, apply(counts, 2, function(x){x*100/sum(x,na.rm=T)}))
 }

# Collect the data_percentage_<i> variables back into a single list.
data_percentage_list <- lapply(paste0("data_percentage_",1:length(harps)), get)

# Create pdf of score breakdown
# NOTE(review): the loops are still nested, so every <j>.pdf is re-created
# once per outer iteration i.
for (i in 1:length(harps)){ for(j in titlenames) {

# For Hotel Name Subtitle
hotelname <- hotel_report$`Hotel (Q15 1)`[hotel_report$`Harp Number`==j]

# Plot the Data 

pdf(file = paste0(j, ".pdf"), paper = "USr", width=8, height=7)
par(mar = c(5.1, 7, 4.1, 2.1))
nam <- paste("breakdown_", i, sep ="")
# NOTE(review): this lapply runs over ALL indices while a single PDF device is
# open, so every barplot lands in the same file - one page per element. That
# is why each PDF contains all length(harps) graphs. The function argument i
# shadows the loop variable i, and nam was built from the outer i before the
# lapply started; assign() here creates the variable inside the anonymous
# function only.
breakdown_list <- lapply(1:length(harps), function(i){
assign(nam, barplot(data_percentage_list[[i]], main = "Breakdown of Property Score Distribution", sub = hotelname, 
        col = coul, las = 1, cex.names = .6, horiz = TRUE, yaxs="i", xlab = "Percentage",
        cex.axis = .8, cex.lab = .8, cex.main = .8, cex.sub = .8))})
dev.off()
}}

Thanks again!

Edit 2: In attempt to make this more reproducible

Y is a list of 391 data frames [screenshot]. Below is the dput() output of one of the 391 data frames in Y.

structure(list(`Hotel (Q15 1)` = c("HILTON, SAN PEDRO, BELIZE", 
"HILTON, SAN PEDRO, BELIZE", "HILTON, SAN PEDRO, BELIZE", "HILTON, SAN PEDRO, BELIZE", 
"HILTON, SAN PEDRO, BELIZE", "HILTON, SAN PEDRO, BELIZE", "HILTON, SAN PEDRO, BELIZE", 
"HILTON, SAN PEDRO, BELIZE", "HILTON, SAN PEDRO, BELIZE", "HILTON, SAN PEDRO, BELIZE", 
"HILTON, SAN PEDRO, BELIZE", "HILTON, SAN PEDRO, BELIZE", "HILTON, SAN PEDRO, BELIZE", 
"HILTON, SAN PEDRO, BELIZE", "HILTON, SAN PEDRO, BELIZE", "HILTON, SAN PEDRO, BELIZE", 
"HILTON, SAN PEDRO, BELIZE", "HILTON, SAN PEDRO, BELIZE", "HILTON, SAN PEDRO, BELIZE", 
"HILTON, SAN PEDRO, BELIZE", "HILTON, SAN PEDRO, BELIZE", "HILTON, SAN PEDRO, BELIZE", 
"HILTON, SAN PEDRO, BELIZE"), `Metro Area State (Q10 1)` = c("OCONUS", 
"OCONUS", "OCONUS", "OCONUS", "OCONUS", "OCONUS", "OCONUS", "OCONUS", 
"OCONUS", "OCONUS", "OCONUS", "OCONUS", "OCONUS", "OCONUS", "OCONUS", 
"OCONUS", "OCONUS", "OCONUS", "OCONUS", "OCONUS", "OCONUS", "OCONUS", 
"OCONUS"), `Question ID` = c("Room Work Area", "Staff Knowledge", 
"Add'tl Item Working Order", "Property Maintenance", "Property Appearance", 
"Staff Knowledge", "Property Appearance", "Staff Interaction", 
"Safety/Security", "Add'tl Item Working Order", "Room Work Area", 
"Bed Quality", "Check In/Out", "Invoice Accuracy", "Staff Interaction", 
"Safety/Security", "Bed Quality", "Invoice Accuracy", "Check In/Out", 
"Safety/Security", "Invoice Accuracy", "Bed Quality", "Property Maintenance"
), `Question ID (group)` = c("Question 4 Items", "Question 4 Items", 
"Question 4 Items", "Question 4 Items", "Question 4 Items", "Question 4 Items", 
"Question 4 Items", "Question 4 Items", "Question 4 Items", "Question 4 Items", 
"Question 4 Items", "Question 4 Items", "Question 4 Items", "Question 4 Items", 
"Question 4 Items", "Question 4 Items", "Question 4 Items", "Question 4 Items", 
"Question 4 Items", "Question 4 Items", "Question 4 Items", "Question 4 Items", 
"Question 4 Items"), `Score Label` = c("7 Extremely Good", "7 Extremely Good", 
"7 Extremely Good", "7 Extremely Good", "7 Extremely Good", "6 Quite Good", 
"6 Quite Good", "6 Quite Good", "6 Quite Good", "6 Quite Good", 
"6 Quite Good", "6 Quite Good", "6 Quite Good", "7 Extremely Good", 
"7 Extremely Good", "5 Slightly Good", "7 Extremely Good", "6 Quite Good", 
"7 Extremely Good", "7 Extremely Good", "3 Slightly Poor", "5 Slightly Good", 
"6 Quite Good"), `Harp Number` = c("1111", "1111", "1111", "1111", 
"1111", "1111", "1111", "1111", "1111", "1111", "1111", "1111", 
"1111", "1111", "1111", "1111", "1111", "1111", "1111", "1111", 
"1111", "1111", "1111")), row.names = c(9380L, 9381L, 9383L, 
9384L, 9385L, 9387L, 9388L, 9389L, 9390L, 9391L, 9392L, 9393L, 
9394L, 9395L, 9396L, 9399L, 9402L, 9403L, 9404L, 9405L, 9407L, 
9408L, 9411L), class = "data.frame")

And below, is dput(harps)

dput(harps)
c("1111", "1696", "3279", "5646", "5724", "5938", "6887", "8859", 
"9368", "9508", "11569", "11644", "18661", "21418", "22460", 
"23317", "25755", "26076", "26336", "28917", "29497", "29498", 
"30465", "30619", "30629", "32784", "35578", "35588", "40390", 
"40866", "47493", "47677", "47866", "48064", "48294", "50432", 
"50667", "50773", "51857", "52125", "52146", "52383", "52432", 
"52451", "52755", "53589", "53620", "56939", "57784", "59571", 
"61276", "61283", "62329", "62666", "66058", "66553", "66741", 
"66763", "67092", "67169", "67214", "67373", "67840", "69494", 
"71343", "73906", "74550", "75285", "76253", "76335", "76361", 
"76393", "76396", "76898", "76949", "78501", "78800", "80079", 
"81035", "81620", "85043", "87026", "87219", "87304", "88683", 
"89650", "92759", "94380", "94427", "95043", "95255", "96061", 
"96677", "97269", "100135", "109591", "109743", "109971", "110414", 
"110856", "110884", "110899", "110926", "111032", "111384", "111605", 
"123136", "123411", "124380", "124753", "124848", "127565", "135185", 
"135999", "136005", "138251", "140027", "140074", "140091", "140095", 
"140159", "145523", "148284", "149639", "153676", "154790", "157239", 
"158213", "158259", "159248", "159343", "159401", "159842", "161219", 
"161725", "163154", "163653", "167172", "170199", "171936", "172095", 
"172272", "172273", "172340", "172868", "173429", "173816", "175033", 
"177012", "177150", "177361", "177383", "177692", "177892", "177965", 
"179887", "180495", "182189", "182979", "183174", "183717", "183879", 
"184076", "185191", "185341", "185675", "185961", "189276", "190279", 
"190896", "192388", "192984", "193387", "193441", "193526", "193534", 
"193605", "193613", "193614", "194274", "194794", "196133", "196546", 
"197075", "197647", "198115", "200996", "201627", "202124", "202992", 
"205802", "206405", "206880", "206990", "207423", "207483", "207723", 
"208210", "208943", "209614", "210006", "211605", "211985", "212714", 
"213707", "213803", "213842", "215961", "216533", "217963", "218029", 
"218348", "218376", "221745", "222179", "222299", "222399", "222736", 
"222882", "224539", "224624", "225339", "225346", "225368", "225553", 
"225565", "225572", "225573", "226003", "228325", "229582", "229614", 
"230871", "231228", "231402", "235196", "235538", "239409", "241353", 
"244587", "244654", "245353", "246093", "246311", "247209", "251084", 
"253732", "254388", "256996", "258464", "260958", "261655", "262754", 
"263192", "263444", "265835", "269872", "270285", "271683", "271687", 
"272664", "275922", "276312", "279909", "287731", "291167", "291988", 
"296004", "297975", "298318", "298401", "300962", "301940", "302250", 
"302702", "304896", "308049", "311490", "312027", "313227", "313603", 
"315536", "319957", "320049", "320270", "320352", "327521", "330319", 
"331054", "332070", "332426", "334213", "341876", "345820", "346263", 
"346723", "347340", "352596", "354486", "396465", "445549", "473263", 
"482701", "496665", "503123", "503365", "528259", "538396", "539834", 
"540896", "546228", "546290", "546652", "546922", "548916", "550479", 
"552466", "709416", "714793", "714861", "716337", "719021", "728913", 
"731082", "732346", "733242", "735165", "735348", "735473", "749296", 
"757777", "761782", "762104", "770251", "808540", "809896", "809951", 
"812527", "816275", "837926", "842678", "843836", "847737", "857277", 
"864044", "864495", "865468", "865951", "866108", "866502", "866547", 
"867803", "867809", "868374", "868420", "868593", "868793", "869746", 
"869748", "870953", "872490", "872579", "875200", "875288", "878016", 
"878858", "879328", "879640", "882643", "882781", "883894", "886067", 
"886876", "888522", "888560", "888820", "889693", "890261", "890264", 
"891171", "894931", "896794", "896840", "899485", "901218", "903465", 
"904381", "912517", "913354", "918968", "921083")
Jasmine B
  • 53
  • 1
  • 6
  • I think the problem is that you have 2 for loops in your pdf section (See the "for (i in 1:length(harps)){ for(j in titlenames) {" line of code). So for each harp i, you are iterating through the entire set of titles. The pdf for each harp is then being associated with the last title each time. If you reduce this down to a single loop it will work better for you. – olorcain Sep 22 '20 at 13:05
  • Hi @olorcain, thanks! I tried to reduce to a single loop but then it gave me a too many open devices error I can't fix. – Jasmine B Sep 22 '20 at 14:22
  • In your recent question history of which none have official answers, you do not provide sample data except one [small screenshot](https://i.stack.imgur.com/lRnmf.png) which we cannot easily import. Please provide a `dput` of `Y` and show `harps`. All posted code should be runnable from empty R environment. Some of your coding lines here can be tightened without use of `assign` and `get` and multiple `for` loops. See [How to make a great R reproducible example](https://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example/5965451). We love data and will answer you! – Parfait Sep 22 '20 at 16:32
  • @Parfait, thanks! I'd never heard of dput before. Edited again so it's hopefully more reproducible. Couldn't put dput of all Y in there because of privacy issues, but put dput of an edited version of one of Y's elements. – Jasmine B Sep 22 '20 at 17:55

1 Answer

0

Consider the following general tips in R and maybe even programming:

  • Variables: Avoid creating too many intermediate variables; work directly with existing objects instead. This keeps the environment easier to maintain. Some examples of redundancy include:

    titlenames <- c(harps)
    nam <- paste("data_percentage_", i, sep ="")
    data_percentage_list <- lapply(paste0("data_percentage_",1:length(harps)), get)
    
  • Names: Use more informative names for objects as Y does not inform code readers or yourself in the future. It appears to be a list that contains subsets of larger data frame hotel_report. More informative names like hotel_reports_df_list quickly detail its contents and type (i.e., data frames within a list).

  • Indentation: Always indent code in for loops (which can be automated in RStudio with keys: Ctrl/cmd + i) and even inside context managers like pdf, with, etc. This enhances readability and maintainability.

  • Assign/Get: Avoid assign and get which usually are not recommended in R. Instead, directly save your objects as items in lists. First loop can bypass the need to assign child items as separate variables:

    # One percentage matrix per position in harps: cross-tabulate column 5
    # against column 3 of the matching element of Y, then rescale every column
    # of the table so that its entries sum to 100.
    data_pct_matrix_list <- lapply(seq_along(harps), function(idx) {
      survey <- Y[[idx]]
      tally <- table(survey[[5]], survey[[3]])

      apply(tally, 2, function(column) column * 100 / sum(column, na.rm = TRUE))
    })
    

    The last assign wrapped around barplot can also be refactored:

    # Draw one bar chart per percentage matrix and keep the values returned by
    # barplot() in a list. Iterates over data_pct_matrix_list, the list built by
    # the lapply() shown just above.
    # NOTE(review): hotelname and coul must already exist when this runs; as
    # written, the same hotelname is used as subtitle for every plot.
    plot_list <- lapply(data_pct_matrix_list, function(mat) {
       barplot(mat, main = "Breakdown of Property Score Distribution", sub = hotelname,
               col = coul, las = 1, cex.names = .6, horiz = TRUE, yaxs="i", xlab = "Percentage",
               cex.axis = .8, cex.lab = .8, cex.main = .8, cex.sub = .8)
    })
    
  • Loops: Avoid multiple for loops or nested loops as much as possible. In R, lapply is a hidden loop. Your issue of 391 plots in each of the 391 PDFs is likely due to the lapply nested within a for loop. Consider these steps:

    1. First, think about your process on one data frame object. Even generalize it in a separate function.
    2. Then, think about what exactly changes that can be iterated.

    R's apply family includes more than just apply and lapply such as mapply that can run elementwise looping to flatten your nested iterations or by (object-oriented wrapper to tapply) that can subset data frames by factor columns and run operations on them.


Without seeing the full data, consider the following approaches, which will need to be tested against your data. The code below assumes Y is defined as a list of subsets of the hotel_report data frame, split by Harp Number.

mapply / Map approach

Iterate elementwise between equal-length objects, data_pct_matrix_list and harps.

# One percentage matrix per position in harps: cross-tabulate `Score Label`
# against `Question ID` for the matching element of Y, then rescale every
# column of the table so that its entries sum to 100.
data_pct_matrix_list <- lapply(seq_along(harps), function(idx) {
  survey <- Y[[idx]]
  tally <- table(survey$`Score Label`, survey$`Question ID`)

  apply(tally, 2, function(column) column * 100 / sum(column, na.rm = TRUE))
})

# Write the score-breakdown bar chart of one property to "<harp>.pdf".
#
# Args:
#   data: percentage matrix to plot (one element of data_pct_matrix_list).
#   harp: Harp Number of the property; used as the PDF file name and to look
#         up the hotel name in hotel_report.
#
# Returns:
#   The value returned by barplot() for this chart.
#
# Reads hotel_report (subtitle lookup) and coul (bar colours) from the
# enclosing environment; both must exist before the function is called.
build_pdf <- function(data, harp) {
    # For Hotel Name Subtitle. unique() keeps a single copy of the name when
    # hotel_report has several rows for the same Harp Number.
    hotelname <- unique(hotel_report$`Hotel (Q15 1)`[hotel_report$`Harp Number` == harp])

    # Plot the Data
    pdf(file = paste0(harp, ".pdf"), paper = "USr", width = 8, height = 7)
    # Close exactly this device when the function exits, even if plotting
    # fails; otherwise a failed iteration leaves a device open and repeated
    # failures end in a "too many open devices" error.
    pdf_device <- dev.cur()
    on.exit(dev.off(pdf_device), add = TRUE)

    par(mar = c(5.1, 7, 4.1, 2.1))

    hotel_plot <- barplot(data, main = "Breakdown of Property Score Distribution", sub = hotelname,
                          col = coul, las = 1, cex.names = .6, horiz = TRUE, yaxs = "i", xlab = "Percentage",
                          cex.axis = .8, cex.lab = .8, cex.main = .8, cex.sub = .8)

    return(hotel_plot)
}

# Call build_pdf once per (matrix, harp) pair, walking the two equal-length
# objects in parallel: the k-th matrix is plotted into the PDF named after the
# k-th harp. The result is a list of the build_pdf return values.
plot_list <- Map(build_pdf, data_pct_matrix_list, harps)

# EQUIVALENTLY:
# SIMPLIFY=FALSE keeps mapply from collapsing the results into a vector or
# matrix, so a list is returned as with Map.
plot_list <- mapply(build_pdf, data_pct_matrix_list, harps, SIMPLIFY=FALSE)  

by approach

Subset hotel_report data frame by unique Harp Number and iteratively run on each subset to build pct_matrix and hotel_plot. This approach combines matrix build and plot steps.

# Build the percentage matrix for one property and write its score-breakdown
# bar chart to "<Harp Number>.pdf".
#
# Args:
#   sub_df: the rows of hotel_report belonging to a single Harp Number; must
#           contain the columns `Score Label`, `Question ID`, `Hotel (Q15 1)`
#           and `Harp Number`.
#
# Returns:
#   The value returned by barplot() for this chart.
#
# Reads coul (bar colours) from the enclosing environment; it must exist
# before the function is called.
build_pdf <- function(sub_df) {
    # Matrix build: counts of each score label per question, rescaled so that
    # every column sums to 100.
    counts <- table(sub_df$`Score Label`, sub_df$`Question ID`)
    pct_matrix <- apply(counts, 2, function(x) { x*100/sum(x, na.rm=TRUE) })

    # For Hotel Name Subtitle (the first row's values are used)
    hotelname <- sub_df$`Hotel (Q15 1)`[1]
    harp <- sub_df$`Harp Number`[1]

    # Plot the Data
    pdf(file = paste0(harp, ".pdf"), paper = "USr", width = 8, height = 7)
    # Close exactly this device when the function exits, even if plotting
    # fails; otherwise a failed subset leaves a device open and repeated
    # failures end in a "too many open devices" error.
    pdf_device <- dev.cur()
    on.exit(dev.off(pdf_device), add = TRUE)

    par(mar = c(5.1, 7, 4.1, 2.1))

    hotel_plot <- barplot(pct_matrix, main = "Breakdown of Property Score Distribution", sub = hotelname,
                          col = coul, las = 1, cex.names = .6, horiz = TRUE, yaxs = "i", xlab = "Percentage",
                          cex.axis = .8, cex.lab = .8, cex.main = .8, cex.sub = .8)

    return(hotel_plot)
}

# Split hotel_report by `Harp Number` and call build_pdf on each subset, so
# one PDF is written per distinct Harp Number.
plot_list <- by(hotel_report, hotel_report$`Harp Number`, build_pdf)

# NEAR EQUIVALENT
# Same per-subset calls, but the result is a plain list named by Harp Number
# rather than an object of class "by".
plot_list <- lapply(split(hotel_report, hotel_report$`Harp Number`), build_pdf) 
Parfait
  • 104,375
  • 17
  • 94
  • 125
  • thank you so much! That mapply approach worked perfectly! I never would have figured that out. And thank you for all the programming tips! I'll definitely go back through my code and fix it up, because it's all a mess. – Jasmine B Sep 22 '20 at 18:49