0

I am running an R script to analyze some biological data. Example of the snippet data and script is provided below. This data file has many columns (but I would like to focus on 5th column- Gene). I have more than 100 data files like this (consider 5th Gene column in all files as the column of interest). Currently I am running each file separately in R and the process is tedious. I would like to run a R script for all data files simultaneously and save all the data in different folder as per the file name. Is it possible to loop or iterate and analyze all the data files at once in a script. Please assist me with this.

Thank you,

Toufiq

Formatted question and Revised dataframe

Inserted: Names of the files to be read

M3.1_IPA.txt
M8.1_IPA.txt
M8.2_IPA.txt
M8.3_IPA.txt

1. Load gene sets to analyze

Data_file <- read.delim(file = "./M3.1_IPA.txt", stringsAsFactors = FALSE, check.names = FALSE, row.names = NULL)

dput(head(Data_file, 5))
structure(list(row.names = c("M3.1_ALPP", "M3.1_ALS2CR14", "M3.1_ANKRD30B", 
"M3.1_ARL16", "M3.1_BCYRN1"), X = 1:5, Module = c("M3.1", "M3.1", 
"M3.1", "M3.1", "M3.1"), Title = c("Cell Cycle", "Cell Cycle", 
"Cell Cycle", "Cell Cycle", "Cell Cycle"), Gene = c("ALPP", "ALS2CR14", 
"ANKRD30B", "ARL16", "BCYRN1"), Probes = c("ILMN_1693789", "ILMN_1787314", 
"ILMN_1730678", "ILMN_2188119", "ILMN_1678757"), Module_gene = c("M3.1_ALPP", 
"M3.1_ALS2CR14", "M3.1_ANKRD30B", "M3.1_ARL16", "M3.1_BCYRN1"
)), row.names = c(NA, 5L), class = "data.frame")

Module_10_1 <- Data_file[,5]
Module_10_1_geneLists <- list(Module_10_1_geneListName=Module_10_1)
  1. Select motif database to use (i.e. organism and distance around TSS)

    data(motifAnnotations_hgnc)
    motifRankings <- importRankings("./hg19-tss-centered-10kb-7species.mc9nr.feather")
    
  2. Calculate enrichment

    motifs_AUC <- calcAUC(Module_10_1_geneLists, motifRankings, nCores=1)
    Type = "Enrichment"
    Module = "M10_1"
    auc <- getAUC(motifs_AUC)
    
    ##Export the plots##
    pdf(paste0(Type, "_", Module, "_AUC_Histogram_Plot.pdf"), height = 5, width = 10)
        hist(auc, main="Module_10_1", xlab="AUC histogram",
          breaks=100, col="#ff000050", border="darkred")
        nes3 <- (3*sd(auc)) + mean(auc)
        abline(v=nes3, col="red")
    dev.off()
    
  3. Select significant motifs and/or annotate to TFs

    motifEnrichmentTable <- addMotifAnnotation(motifs_AUC, nesThreshold=3,
    
    motifAnnot=motifAnnotations_hgnc)
    
    ##Export the Motif enrichment analysis results###
    write.csv(motifEnrichmentTable, file ="./motifEnrichmentTable.csv", row.names = F, sep = ",")
    
  4. Identify the genes with the best enrichment for each Motif

    motifEnrichmentTable_wGenes_v1 <- addSignificantGenes(motifEnrichmentTable, 
                 rankings=motifRankings, 
                 geneSets=Module_10_1_geneLists)
    
    ##Export the Motif enrichment analysis results##
    write.csv(motifEnrichmentTable_wGenes_v1, 
              file ="./motifEnrichmentTable_wGenes_v1.csv", 
              row.names = F, sep = ",")
    
  5. Plot for a few motifs

    geneSetName <- "Module_10_1_Genelist"
    selectedMotifs <- c("cisbp__M6275", 
    
    sample(motifEnrichmentTable$motif, 2))
    par(mfrow=c(2,2))
    
    Type_v1 <- "Few_motifs"
    ##Export the plots
    pdf(paste0(Type_v1, "_", Module, "_Sig_Genes_Plot.pdf"), height = 5, width = 5)
    getSignificantGenes(Module_10_1_geneLists, 
                        motifRankings,
                         signifRankingNames=selectedMotifs,
                         plotCurve=TRUE, maxRank=5000, genesFormat="none",
                         method="aprox")
    dev.off()
    
  6. The final output of RcisTarget is a data.table and export to html

    motifEnrichmentTable_wGenes_v1_wLogo <- addLogo(motifEnrichmentTable_wGenes_v1)
    resultsSubset <- motifEnrichmentTable_wGenes_v1_wLogo[1:147,]
    Type_v2 <- "Motifs_Table"
    
    library(DT)
    ## export the data table to html file format###
    dtable <- datatable(resultsSubset[,-c("enrichedGenes", "TF_lowConf"), with=FALSE], 
                    escape = FALSE, # To show the logo
                    filter="top", options=list(pageLength=100))
    html_test <- "dtable.html"
    saveWidget(dtable, html_test)
    
  7. Building a network and Export to the html format

    signifMotifNames <- motifEnrichmentTable$motif[1:3]
    incidenceMatrix <- getSignificantGenes(Module_10_1_geneLists, 
                                        motifRankings,
                                        signifRankingNames=signifMotifNames,
                                        plotCurve=TRUE, maxRank=5000, 
                                        genesFormat="incidMatrix",
                                        method="aprox")$incidMatrix
    
    library(reshape2)
    
    edges <- melt(incidenceMatrix)
    edges <- edges[which(edges[,3]==1),1:2]
    colnames(edges) <- c("from","to")
    
    library(visNetwork)
    motifs <- unique(as.character(edges[,1]))
    genes <- unique(as.character(edges[,2]))
    nodes <- data.frame(id=c(motifs, genes),   
                        label=c(motifs, genes),    
                        title=c(motifs, genes), # tooltip 
                        shape=c(rep("diamond", length(motifs)), rep("elypse", length(genes))),
                        color=c(rep("purple", length(motifs)), rep("skyblue", length(genes))))
    Network <- visNetwork(nodes, edges) %>% visOptions(highlightNearest = TRUE, 
                                         nodesIdSelection = TRUE)
    ##Export the network##
    visSave(Network, file ="Network.html")
    
  • 1
    Just to be clear, do you want to loop over files? So the code you are after would read in a different file in the second line `Data_file <- read.csv(...)` for each for loop iteration? – Dan Kennedy Dec 13 '20 at 20:49
  • @Dan Kennedy, thank you very much for your comment. Yes, I want to loop over files. – Mohammed Toufiq Dec 13 '20 at 21:06
  • 1
    Ok, that is possible, you will just need to create a vector with the file names in it and loop over that. Can you amend your question to include the names of the files you want to read in? I can then write a proper answer to solve the issue. – Dan Kennedy Dec 13 '20 at 21:29
  • @Dan Kennedy, thanks. The question and snippet dataframe has been revised. Also, included the names of the files to be read. – Mohammed Toufiq Dec 14 '20 at 05:03

2 Answers2

1

I've written it based on the example file names given:

file_names <- c("M3.1_IPA.txt","M8.1_IPA.txt","M8.2_IPA.txt","M8.3_IPA.txt")

Easiest way is to iterate over 1:length(file_names), and create a unique set of output files for each iteration. As per your question, you want to save the results to different folders. This can be done by extracting the file name (removing the .txt) and then creating a new directory with that name. You can then save all your outputs for this iteration to the new directory

A new directory is then created for the next iteration, so you aren't saving over previous results.

for(i in 1:length(file_names)){
  Data_file <- read.csv(file = paste0("./",file_names[i]), stringsAsFactors = FALSE, check.names = FALSE)

  Module_10_1 <- Data_file[,1]
  Module_10_1_geneLists <- list(Module_10_1_geneListName=Module_10_1)
  
  data(motifAnnotations_hgnc)
  motifRankings <- importRankings("./hg19-tss-centered-10kb-7species.mc9nr.feather")
  
  
  motifs_AUC <- calcAUC(Module_10_1_geneLists, motifRankings, nCores=1)
  Type = "Enrichment"
  Module = "M10_1"
  auc <- getAUC(motifs_AUC)
  
  # Extract the file name without the extension, so the directory with the results
  # will correspond to this name:
  new_directory_name <- gsub(x = file_names[i],pattern = "(.*)\\.txt","\\1")
  #Create the new directory:
  dir.create(new_directory_name)

  ##Export the plots##
  # Save to the new directory
  pdf(paste0(new_directory_name,"/",Type,"_", Module,"_AUC_Histogram_Plot.pdf"), height = 5, width = 10) 
  hist(auc, main="Module_10_1", xlab="AUC histogram",
       breaks=100, col="#ff000050", border="darkred")
  nes3 <- (3*sd(auc)) + mean(auc)
  abline(v=nes3, col="red")
  dev.off()

}

I've omitted the rest of your script because the solution for saving motifEnrichmentTable_wGenes_v1.csv, _Sig_Genes_Plot.pdf, and the other outputs is the same as above with _AUC_Histogram_Plot.pdf. You just need to write these outputs to the new directory with paste0(new_directory_name,"/",<insert-output-name>).

If you've got many files, instead of manually creating the vector file_names you can search the local directory for files which match the correct pattern. e.g.

all_files <- dir()
file_names <- grep(all_files,pattern = "*.txt$",value = TRUE)

will return all the .txt files in the local directory. You can refine it further if it needs to be only .txt files starting with "M":

all_files <- dir()
file_names <- grep(all_files,pattern = "^M.*.txt$",value = TRUE)

You'd do this if not all of the .txt files in the local directory are input files, and you may need to refine the regular expression pattern further until you are only capturing the txt files you need.

For the .html files it is a little more difficult because saveWidget() currently doesn't let you save using relative file paths, but there is a solution using withr::with_dir(). This should work for Part 7:

withr::with_dir(new_directory_name, saveWidget(dtable, file="dtable.html"))

and this should work for part 8:

withr::with_dir(new_directory_name, saveWidget(Network, file="Network.html"))
Dan Kennedy
  • 380
  • 1
  • 9
  • excellent. Thank you for the very helpful inputs. This is very helpful. I could export the *.csv and pdf files easily. One question, please let me know know how to do we export the *.html files for my 7 and 8 branch code above when we are running this kind of code. I tried couple of functions, it does not work. – Mohammed Toufiq Dec 14 '20 at 11:37
  • could you please provide inputs/example about exporting in html file format for branch 7 and 8 using the "paste0(new_directory_name,"/",)" function. – Mohammed Toufiq Dec 14 '20 at 15:04
  • 1
    Just clarifying, did you try `html_test <- paste0(new_directory_name,"/","dtable.html")` and it didn't work? – Dan Kennedy Dec 14 '20 at 21:50
  • i tried running the suggested code, it does not save the html files in directories. Pasting the code for 7 and 8 branch in another comment/answer to your question section. Please assist. – Mohammed Toufiq Dec 15 '20 at 04:43
  • Ah I see. Looks like this is a known issue with the `saveWidget()` function: https://github.com/ramnathv/htmlwidgets/issues/299 – Dan Kennedy Dec 15 '20 at 05:44
  • I've edited the answer to use the solution on the github issue I linked. It should work now for your widgets, saving them to html. – Dan Kennedy Dec 15 '20 at 05:49
  • 1
    thank you very much for the continued assistance. Much appreciated. – Mohammed Toufiq Dec 15 '20 at 06:14
  • there is one glitch in the script. When I apply the script to large number of files in the directory. The script throws an error and stops running on the remaining files in the directory. This could be due to some of the files does not have any of matching genes/features (Module_10_1_geneLists) compared to gene-sets 'rankings' (i.e motifRankings). Any inputs/functions on how to surpass the pass/execute in the loop when one of these files types are present. Thank you. – Mohammed Toufiq Dec 17 '20 at 08:16
  • 'Error in .RcisTarget_calcAUC(geneSets = geneSets, rankings = rankings, : Fewer than 80% of the genes/features in the gene-sets are included in the rankings.Check wether the IDs in the 'rankings' (columns) and 'geneSets' match.' – Mohammed Toufiq Dec 17 '20 at 08:17
  • 1
    Looks like you just want the for loop to continue for subsequent iterations after an error. This answer should help: https://stackoverflow.com/questions/14748557/skipping-error-in-for-loop – Dan Kennedy Dec 18 '20 at 00:34
0

7. The final output of RcisTarget is a data.table and export to html

motifEnrichmentTable_wGenes_v1_wLogo <- addLogo(motifEnrichmentTable_wGenes_v1)
  resultsSubset <- motifEnrichmentTable_wGenes_v1_wLogo[1:147,]
  Type_v2 <- "Motifs_Table"
  
  # Extract the file name without the extension, so the directory with the results
  # will correspond to this name:
  new_directory_name <- gsub(x = file_names[i],pattern = "(.*)\\.txt","\\1")
  #Create the new directory:
  dir.create(new_directory_name)
  
  library(DT)
  library(webshot)
  library(htmltools)
  library(xml2)
  ## export the data table to html file format###
  dtable <- datatable(resultsSubset[,-c("enrichedGenes", "TF_lowConf"), with=FALSE], 
                      escape = FALSE, # To show the logo
                      filter="top", options=list(pageLength=1000))
  html_test <- paste0(new_directory_name,"/","dtable.html")
  ##To save files in the pdf format###
  #saveWidget(dtable, html_test)
  #webshot(html_test, paste0(new_directory_name,"/", "motif.pdf"))

8. Building a network and Export to the html format

# Extract the file name without the extension, so the directory with the results
  # will correspond to this name:
  new_directory_name <- gsub(x = file_names[i],pattern = "(.*)\\.txt","\\1")
  #Create the new directory:
  dir.create(new_directory_name)
  
  signifMotifNames <- motifEnrichmentTable$motif[1:3]
  incidenceMatrix <- getSignificantGenes(Module_10_1_geneLists, 
                                         motifRankings,
                                         signifRankingNames=signifMotifNames,
                                         plotCurve=TRUE, maxRank=5000, 
                                         genesFormat="incidMatrix",
                                         method="aprox")$incidMatrix
  
  library(reshape2)
  
  edges <- melt(incidenceMatrix)
  edges <- edges[which(edges[,3]==1),1:2]
  colnames(edges) <- c("from","to")
  
  library(visNetwork)
  motifs <- unique(as.character(edges[,1]))
  genes <- unique(as.character(edges[,2]))
  nodes <- data.frame(id=c(motifs, genes),   
                      label=c(motifs, genes),    
                      title=c(motifs, genes), # tooltip 
                      shape=c(rep("diamond", length(motifs)), rep("elypse", length(genes))),
                      color=c(rep("purple", length(motifs)), rep("skyblue", length(genes))))
  Network <- visNetwork(nodes, edges) %>% visOptions(highlightNearest = TRUE, 
                                                     nodesIdSelection = TRUE)
  ##Export the network##
  #visSave(Network, file ="Network.html")

  html_test_v1 <- paste0(new_directory_name,"/","Network.html")
  #saveWidget(Network, html_test_v1)
  #webshot(html_test_v1, paste0(new_directory_name,"/", "Network.pdf"))