I am facing a memory issue with an R script of mine that I cannot understand. In short, the script takes as input a list of paths to data tables and the script builds one big data table (about 10k rows and 2k columns) by aggregating iteratively each input table.
I tried to run it locally on macOS (Mojave, Memory: 16 GB 2133 MHz LPDDR3) but I run out of memory before the script ends. However, the complete aggregated table should not exceed 500 MB and I don't have many intermediate variables that would fill up the memory.
I used pryr
and profvis
R packages to profile the memory usage along the script. Here is some of the output:
-INFO MEMORY: total memory usage is 223.99 MB
-INFO: adding DNA Mutation counts per pathway from table count_by_pathway_cpad_kegg_DNA_non_synonymous_tcga.tsv (8974,42) ... done!
-INFO: adding DNA Mutation counts per pathway from table count_by_pathway_sanchez_vega_DNA_non_synonymous_tcga.tsv (8974,12) ... done!
-INFO: adding DNA Mutation counts per pathway from table count_by_pathway_msigdb_hallmarks_DNA_non_synonymous_tcga.tsv (8974,52) ... done!
-INFO MEMORY: size of dfs is 81.79 MB
-INFO MEMORY: total memory usage is 253.44 MB
-INFO: adding MUT counts total from table count_total_DNA_all_tcga.tsv (8974,3) ... done!
-INFO MEMORY: size of dfs is 81.97 MB
-INFO MEMORY: total memory usage is 260.74 MB
-INFO: adding DNA Alteration counts per gene from table count_by_gene_DNA_annotated_tcga.tsv (3630,373) ... done!
-INFO MEMORY: size of dfs is 130.17 MB
-INFO MEMORY: total memory usage is 322.26 MB
-INFO: adding DNA Alteration counts per pathway from table count_by_pathway_cpad_kegg_DNA_annotated_tcga.tsv (3630,123) ... done!
-INFO: adding DNA Alteration counts per pathway from table count_by_pathway_sanchez_vega_DNA_annotated_tcga.tsv (3630,24) ... done!
-INFO: adding DNA Alteration counts per pathway from table count_by_pathway_msigdb_hallmarks_DNA_annotated_tcga.tsv (3630,137) ... done!
-INFO MEMORY: size of dfs is 166.64 MB
-INFO MEMORY: total memory usage is 400.82 MB
Here is the code for the last function I called
dfs <- add_counts(args$dna_alt_counts_pathway, dfs, agg="pathway", evt_type="Alteration", data_type="DNA", cohort=args$cohort)
add_counts <- function(filepaths, dfs, agg="gene", data_type="DNA", evt_type="Alteration", cohort="prism"){
  # Aggregate per-sample event-count tables into the running `dfs` object.
  #
  # For every file in `filepaths`, two sets of columns are joined into dfs$dat
  # (on the sample/subject identifier column):
  #   * raw counts, suffixed  "_<data_type>_count_<level_2>"
  #   * binary status (count > 0), suffixed "_<data_type>_status_<level_2>"
  # and one metadata row per new column is appended to dfs$cov.
  #
  # Args:
  #   filepaths: character vector (or list) of paths to count tables.
  #   dfs:       list with elements $dat (wide data table) and $cov
  #              (covariate metadata); returned updated.
  #   agg:       aggregation level encoded in the filenames
  #              ("gene", "pathway", "total", ...).
  #   data_type: assay type encoded in the filenames ("DNA", "RNA").
  #   evt_type:  event label used in the covariate classes (e.g. "Alteration").
  #   cohort:    cohort name terminating the filename pattern.
  #
  # Returns: the updated `dfs` list (unchanged if `filepaths` is empty).
  dfs_data <- lapply(filepaths, load_table)
  if (length(dfs_data)==0) return(dfs)
  if (agg %in% c("pathway")){
    # Pathway-collection name, e.g. "sanchez_vega" in
    # "count_by_pathway_sanchez_vega_DNA_...". vapply keeps the result
    # type-stable (always character(1) per file), unlike sapply.
    pattern_1 <- paste0("(?<=count_by_", agg, "_)[a-z0-9A-Z\\_\\-]+(?=_", data_type, ")")
    level_names_1 <- vapply(filepaths, function(s) str_extract(s, pattern_1),
                            character(1), USE.NAMES=FALSE)
  } else {
    # FIX: one entry per file. The original assigned the scalar `agg`,
    # which made level_names_1[[i]] error out ("subscript out of bounds")
    # for i > 1 whenever several files were passed with agg != "pathway".
    level_names_1 <- rep(agg, length(filepaths))
  }
  # Second level: the token between the data type and the cohort name,
  # e.g. "non_synonymous" or "annotated".
  pattern_2 <- paste0("(?<=_", data_type, "_)[a-z0-9A-Z\\_\\-]+(?=_", cohort, ")")
  level_names_2 <- vapply(filepaths, function(s) str_extract(s, pattern_2),
                          character(1), USE.NAMES=FALSE)
  col_tsb <- "Tumor_Sample_Barcode"
  col_nsb <- "Matched_Norm_Sample_Barcode"
  col_psb <- "Sample_Id_DNA_P"
  for (i in seq_along(filepaths)){
    df_dat_cnt <- dfs_data[[i]]
    level_1 <- level_names_1[[i]]
    level_2 <- level_names_2[[i]]
    dat_size <- paste0("(", nrow(df_dat_cnt), ",", ncol(df_dat_cnt), ")")
    cat(paste("-INFO: adding", data_type, evt_type, "counts per", agg, "from table", basename(filepaths[[i]]),
              dat_size, "..."))
    # Tumor/normal pairs are identified by concatenating both barcodes.
    if (all(c(col_tsb, col_nsb) %in% colnames(df_dat_cnt))){
      df_dat_cnt <- df_dat_cnt %>% unite(!!col_psb, all_of(c(col_tsb, col_nsb)), sep="_vs_")
    }
    # First available identifier column is used as the join key.
    col_row <- intersect(c(col_psb, "Tumor_Sample_Barcode", "Sample_Id", "Subject_Id"), colnames(df_dat_cnt))[1]
    df_dat_cnt <- df_dat_cnt %>% column_to_rownames(var=col_row)
    # Binary status table: 1 when the count is non-zero, 0 otherwise.
    # across(where(...)) replaces the superseded mutate_if().
    df_dat_sts <- df_dat_cnt %>%
      mutate(across(where(is.numeric), function(x) as.integer(as.logical(x))))
    plot_names <- colnames(df_dat_cnt)
    colnames(df_dat_cnt) <- paste0(colnames(df_dat_cnt), "_", data_type, "_count_", level_2)
    colnames(df_dat_sts) <- paste0(colnames(df_dat_sts), "_", data_type, "_status_", level_2)
    # Covariate metadata describing the new count / status columns.
    df_cov_cnt <- data.frame(Covariate=colnames(df_dat_cnt), Plot_Name=plot_names) %>%
      mutate(Nature="Continuous", Class_Lvl_1=data_type,
             Class_Lvl_2=paste0(evt_type, "_Counts_", str_to_title(agg)),
             Class_Lvl_3=paste0(level_1, "_", level_2))
    df_cov_sts <- data.frame(Covariate=colnames(df_dat_sts), Plot_Name=plot_names) %>%
      mutate(Nature="Binary", Class_Lvl_1=data_type,
             Class_Lvl_2=paste0(evt_type, "_Status_", str_to_title(agg)),
             Class_Lvl_3=paste0(level_1, "_", level_2))
    # Barcode-based keys are renamed to the harmonised tumor-sample id.
    # (Scalar condition: && instead of elementwise &.)
    if (col_row!="Subject_Id" && col_row!=col_psb){
      if (data_type=="RNA"){
        col_row <- "Sample_Id_RNA_T"
      } else {
        col_row <- "Sample_Id_DNA_T"
      }
    }
    df_dat_cnt <- df_dat_cnt %>% as.data.frame() %>% rownames_to_column(var=col_row)
    df_dat_sts <- df_dat_sts %>% as.data.frame() %>% rownames_to_column(var=col_row)
    dfs$dat <- left_join(dfs$dat, df_dat_cnt, by=col_row)
    dfs$cov <- bind_rows(dfs$cov, df_cov_cnt)
    # "total" counts get no binary status columns (always 1).
    if (!agg %in% c("total")){
      dfs$dat <- left_join(dfs$dat, df_dat_sts, by=col_row)
      dfs$cov <- bind_rows(dfs$cov, df_cov_sts)
    }
    cat(" done!\n")
  }
  print_size_object(dfs)
  print_total_memory()
  dfs
}
print_size_object <- function(obj){
  # Log the in-memory size of `obj` (via pryr::object_size),
  # printed in GB above 1e9 bytes, otherwise in MB.
  obj_name <- deparse(substitute(obj))
  size_bytes <- object_size(obj)
  use_gb <- size_bytes > 1e9
  divisor <- if (use_gb) 1e9 else 1e6
  unit_size <- if (use_gb) "GB" else "MB"
  print_size <- round(size_bytes / divisor, 2)
  cat(paste("-INFO MEMORY: size of", obj_name, "is", print_size, unit_size, "\n"))
}
print_total_memory <- function(){
  # Log the total memory tracked by R (via pryr::mem_used),
  # printed in GB above 1e9 bytes, otherwise in MB.
  mem_size <- mem_used()
  use_gb <- mem_size > 1e9
  divisor <- if (use_gb) 1e9 else 1e6
  unit_size <- if (use_gb) "GB" else "MB"
  print_size <- round(mem_size / divisor, 2)
  cat(paste("-INFO MEMORY: total memory usage is", print_size, unit_size, "\n"))
}
After running this line, the Activity Monitor reports the following statistics for the running R process:
However, running mem_used
from R console returns
266 MB
What is the reason for having a RSS > 3Gb when R objects occupy 266 Mb? And why is VMZ so huge? During the execution of
dfs <- add_counts(args$dna_alt_counts_pathway, dfs, agg="pathway", evt_type="Alteration", data_type="DNA", cohort=args$cohort)
the VMZ increased from about 10Gb to >30Gb and it does not seem to decrease a lot after the execution is finished.
Thank you very much for your help, I can't get my head around this issue!
Best, Yoann