1

I am trying to extract some information (metadata) from GenBank using the R package "rentrez" and the example I found here https://ajrominger.github.io/2018/05/21/gettingDNA.html. Specifically, for a particular group of organisms, I search for all records that have geographical coordinates and then want to extract data about the accession number, taxon, sequenced locus, country, lat_long, and collection date. As an output, I want a csv file with the data for each record in a separate row. It seems that the code below can do the job but at some point, rows get muddled with data from different records overlapping the neighbouring rows. For example, from 157 records that rentrez retrieves from NCBI 109 records in the file look like what I want to achieve but the rest is a total mess. I would greatly appreciate any advice on how to fix the issue because I am a total newbie with R and figuring out each step takes a lot of time.

setwd ("C:/R-Works")
library('XML')
library('rentrez')
argasid <- entrez_search(db="nuccore", term = "Argasidae[Organism] AND [lat]", use_history=TRUE, retmax=15000)
x <- entrez_fetch (db="nuccore", id=argasid$ids, rettype= "native", retmode="xml", parse=TRUE)
x <-xmlToList(x)

cleanEntrez <- function(x) {
  basePath <- 'Seq-entry_seq.Bioseq'
  c(
    genbank = as.character(x[paste(basePath, 
                                   'Bioseq_id', 'Seq-id', 'Seq-id_genbank', 
                                   'Textseq-id', 'Textseq-id_accession', 
                                   sep = '.')]),
    taxon = as.character(x[paste(basePath,
                                 'Bioseq_descr', 'Seq-descr', 'Seqdesc', 
                                 'Seqdesc_source', 'BioSource', 'BioSource_org', 
                                 'Org-ref', 'Org-ref_taxname', 
                                 sep = '.')]),
    bseqdesc_title = as.character(x[paste(basePath,
                                          'Bioseq_descr', 'Seq-descr', 'Seqdesc', 
                                          'Seqdesc_title', 
                                          sep = '.')]),
      lat_lon = as.character(x[grep('lat-lon', x) + 1]),
      geo_description = as.character(x[grep('country', x) + 1]),
      coll_date = as.character(x[grep('collection-date', x) + 1])
  )
}
getGenbankMeta  <- function(ids) {
  allRec <- entrez_fetch(db = 'nuccore', id = ids, 
                         rettype = 'native', retmode = 'xml', 
                         parsed = TRUE)
  allRec <- xmlToList(allRec)[[1]]
  
  o <- lapply(allRec, function(x) {
    cleanEntrez(unlist(x))
  })
  
  temp <- array(unlist(o), dim = c(length(o[[1]]), length(ids)))
  seqVec <- temp[nrow(temp), ]
  seqDF <- as.data.frame(t(temp[-nrow(temp), ]))
  names(seqDF) <- names(o[[1]])[-nrow(temp)]
  
  return(list(seq = seqVec, data = seqDF))
} 
write.csv(getGenbankMeta(argasid$ids), 'argasid_georef.csv')
 

0 Answers0