1

I have been trying to adapt the code from the 'labeling outliers in a boxplot' post so that it works in a loop over the columns of a data frame.

# Attempt to draw one boxplot per column, labelling outliers with row names.
# NOTE(review): the column names come from mtcars, but the loop reads a data
# frame called `test` and groups by a `cond` column; neither is defined in this
# snippet -- confirm `test` contains `cond` and every name in `ens_id`.
ens_id=names(mtcars)
for(i in 1:length(ens_id)){
 
 # ens_id[i] is a column *name* (a string such as "mpg"), so
 # as.numeric(ens_id[i]) is NA rather than the column's values. is_outlier()
 # therefore receives NA, which matches the "NAs introduced by coercion"
 # warning and the quantile() error quoted below this snippet.
 dat <- test %>% tibble::rownames_to_column(var="outlier") %>% group_by(cond) %>% mutate(is_outlier=ifelse(is_outlier(as.numeric(ens_id[i])),as.numeric(ens_id[i]), as.numeric(NA)))
 # Blank the row-name label on every row whose is_outlier value is NA.
 dat$outlier[which(is.na(dat$is_outlier))] <- as.numeric(NA)
 
 # Boxplot of the current column by `cond`, with the `outlier` labels as text.
 # NOTE(review): `gene_id` is not defined in this snippet -- confirm it exists
 # and is parallel to `ens_id`.
 p=ggplot(dat, aes_string(y=ens_id[i], x="cond",fill="cond")) + geom_boxplot()  + ylab(gene_id[i])+ geom_text(aes(label=outlier),na.rm=TRUE,nudge_x=0.15)
 # Write the plot to <column name>.png in the figures directory.
 ggsave(p, file=paste0("/media/chi/Figures/HVOLvsCDCS/",ens_id[i],".png"), width = 14, height = 10, units = "cm")
 }

Error in quantile.default(x, 0.25) : missing values and NaN's not allowed if 'na.rm' is FALSE In addition: Warning message: In is_outlier(as.numeric(ens_id[i])) : NAs introduced by coercion

I used as.numeric(ens_id[i]) to get around the error:

Error in (1 - h) * qs[i] : non-numeric argument to binary operator

Wai Ha Lee
  • 8,598
  • 83
  • 57
  • 92
zoe
  • 301
  • 3
  • 11
  • sorry @akrun, you beat me to it - I was just about to try to edit my post – zoe Jul 15 '20 at 02:33
  • yes, it should be the df mtcars – zoe Jul 15 '20 at 02:34
  • Thanks, one doubt is that you are grouping by 'cond' and should that be removed from the `ens_id` – akrun Jul 15 '20 at 02:35
  • I think I understand why you are getting that issue. The `outlier` column was not updated. `dat$outlier[which(is.na(dat$is_outlier))] <- as.numeric(NA)`. There are two columns `is_outlier` and `outlier`. Please check my update – akrun Jul 15 '20 at 04:15

1 Answer

1

The issue is that the column names are plain strings, so they are never evaluated as columns of the data. One option is to pass the string directly to `across`; another is to convert it to a symbol and unquote it with `!!`. As the former is easier, it is shown first.

library(dplyr) # 1.0.0
library(stringr)
# Option 1: pass the column name (a string) to across() via all_of().
#
# For every column named in `ens_id`, build `dat` from `test` with two helper
# columns:
#   outlier     row name on rows flagged as outliers, NA elsewhere
#   is_outlier  the column's value on flagged rows, NA elsewhere
# Outliers are flagged within each `cond` group by is_outlier(), which must
# already be defined. The original column is left untouched so it can still be
# plotted.
for (i in seq_along(ens_id)) {
  dat <- test %>%
    tibble::rownames_to_column(var = "outlier") %>%
    group_by(cond) %>%
    # `.names` writes the result straight into a new `is_outlier` column, so no
    # rename step (and no regex on the column name) is needed afterwards.
    # With dplyr < 1.0.0, which has no across(), use instead:
    # mutate_at(vars(ens_id[i]), list(is_outlier = ~ replace(., !is_outlier(.), NA))) %>%
    # NOTE(review): confirm mutate_at() names the new column `is_outlier`.
    mutate(across(
      all_of(ens_id[i]),
      ~ replace(., !is_outlier(.), NA),
      .names = "is_outlier"
    )) %>%
    # Drop the grouping so later steps do not silently operate per `cond`.
    ungroup()

  # Keep the row-name label only on rows flagged as outliers. `outlier` is a
  # character column, hence NA_character_.
  dat$outlier[is.na(dat$is_outlier)] <- NA_character_
  print(head(dat))
}

Or, as mentioned above, the second option is to convert the string to a symbol and evaluate it with `!!`:

# Option 2: convert the column name to a symbol with sym() and unquote it
# with !! inside mutate().
#
# Produces the same `dat` as the across() variant: `is_outlier` holds the
# column's value on rows flagged (per `cond` group) by is_outlier() and NA
# elsewhere; `outlier` keeps the row name on flagged rows only.
for (i in seq_along(ens_id)) {
  # Build the symbol once instead of calling sym() twice inside mutate().
  col_sym <- sym(ens_id[i])

  dat <- test %>%
    tibble::rownames_to_column(var = "outlier") %>%
    group_by(cond) %>%
    mutate(is_outlier = replace(!!col_sym, !is_outlier(!!col_sym), NA)) %>%
    # Drop the grouping so later steps do not silently operate per `cond`.
    ungroup()

  # `outlier` is a character column, hence NA_character_.
  dat$outlier[is.na(dat$is_outlier)] <- NA_character_
  print(head(dat))
}

Using a reproducible example

# Self-contained example on mtcars: one boxplot per column in `ens_id`,
# grouped by `gear`, with outliers labelled by car name.
library(dplyr)   # >= 1.0.0 for across()
library(ggplot2)

# is_outlier() is expected to come from the "labeling outliers in a boxplot"
# post. It is defined here (1.5 * IQR rule) only if no such function exists
# yet, so an existing definition is never overwritten.
# NOTE(review): confirm this matches the helper you already use.
if (!exists("is_outlier", mode = "function")) {
  is_outlier <- function(x) {
    x < quantile(x, 0.25) - 1.5 * IQR(x) | x > quantile(x, 0.75) + 1.5 * IQR(x)
  }
}

ens_id <- c("mpg", "wt")

# Plant one obvious outlier in each column so there is something to label.
test <- mtcars
test$mpg[10] <- 9800
test$wt[22] <- 4895

plist <- vector("list", length(ens_id))
for (i in seq_along(ens_id)) {
  dat <- test %>%
    tibble::rownames_to_column(var = "outlier") %>%
    group_by(gear) %>%
    # `.names` writes the flagged values straight into `is_outlier`; the
    # original column stays intact for the boxplot.
    mutate(across(
      all_of(ens_id[i]),
      ~ replace(., !is_outlier(.), NA),
      .names = "is_outlier"
    )) %>%
    ungroup()

  # Keep the car-name label only on rows flagged as outliers.
  dat$outlier[is.na(dat$is_outlier)] <- NA_character_

  # !!sym() injects the column now. A lazily evaluated `ens_id[i]` inside
  # aes() would make every stored plot use the final value of `i`.
  plist[[i]] <- ggplot(dat, aes(x = gear, y = !!sym(ens_id[i]), group = gear)) +
    geom_boxplot() +
    ylab(ens_id[i]) +
    geom_text(aes(label = outlier), na.rm = TRUE, nudge_x = 0.15)
}

plist[[1]]
plist[[2]]
akrun
  • 874,273
  • 37
  • 540
  • 662
  • @zoe I used `dplyr 1.0.0`. If you have a version less than 1.0.0, use `mutate_at(vars(ens_id[i]), ~ replace(., !is_outlier(.), NA))` – akrun Jul 15 '20 at 03:15
  • dat isn't getting a new column of dat$outlier? The column of the values to plot in the boxplot is being replaced with 'NA' or the value of the outlier so it doesn't plot as desired. Warning message: Unknown or uninitialised column: 'is_outlier'. – zoe Jul 15 '20 at 03:28
  • @zoe It is from `library(stringr)`. I did updated that in the code – akrun Jul 15 '20 at 03:40
  • thankyou so much. It is now working but it is plotting all of the sample names and not just the outliers? I used: ggplot(dat, aes_string(y=ens_id[i], x="cond",fill="cond")) + geom_boxplot() + ylab(gene_id[i])+ geom_text(aes(label=outlier),na.rm=TRUE,nudge_x=0.15) – zoe Jul 15 '20 at 03:46
  • I need the full data set to plot the boxplots?? – zoe Jul 15 '20 at 03:57
  • @zoe I added with a reproducible example. – akrun Jul 15 '20 at 04:27