1

I am doing an analysis in which I need to add transcript IDs to a query file.

This is not the actual data; it is only for reference. Query file:

# Example query table (6 rows) written as a structure() literal.
# Columns: geneSymbol, chr, exonStart_0base, exonEnd, upstreamES, upstreamEE,
# downstreamES, downstreamEE. The object carries the classes
# "tbl_df", "tbl" and "data.frame".
structure(list(geneSymbol = c("A", "AR", "A1", "A12", 
"A7", "A9A"), chr = c("chr1", "chr1", "chr1", "chr5", "chr3", 
"chr2"), exonStart_0base = c(105259463, 32128564, 131173030, 
115176193, 11322722, 220093145), exonEnd = c(105259641, 32128639, 
131173039, 115176309, 11322788, 220093207), upstreamES = c(105258934, 
32120666, 131172109, 115173324, 11313994, 220092643), upstreamEE = c(105259059, 
32120728, 131172210, 115173461, 11314116, 220092775), downstreamES = c(105261820, 
32132388, 131179781, 115176514, 11323878, 220094256), downstreamEE = c(105261841, 
32132514, 131179868, 115176631, 11324124, 220094361)), .Names = c("geneSymbol", 
"chr", "exonStart_0base", "exonEnd", "upstreamES", "upstreamEE", 
"downstreamES", "downstreamEE"), row.names = c(NA, -6L), class = c("tbl_df", 
"tbl", "data.frame"))

Structure of the file in which the transcript ID is present:

# Example annotation table (6 rows) written as a structure() literal.
# Columns: X1, V1, V2, V3, V4, V5, gene_id, transcript_id, exon_number,
# gene_name, exon_id. Note that the gene_id values end with a ";" character.
# NOTE(review): V4/V5 look like exon start/end coordinates and V1 like the
# chromosome (GTF-style layout) -- confirm against the file this was read from.
structure(list(X1 = c(1, 2, 3, 4, 5, 6), V1 = c("chr1", "chr1", 
"chr1", "chr1", "chr1", "chr1"), V2 = c("protein_coding", "protein_coding", 
"protein_coding", "protein_coding", "protein_coding", "protein_coding"
), V3 = c("exon", "exon", "exon", "exon", "exon", "exon"), V4 = c(53049, 
54830, 69091, 137621, 134901, 367640), V5 = c(53067, 54936, 70008, 
139379, 135802, 368634), gene_id = c("ENSG00000268020;", "ENSG00000268020;", 
"ENSG00000186092;", "ENSG00000237683;", "ENSG00000237683;", "ENSG00000235249;"
), transcript_id = c("ENST00000594647", "ENST00000594647", "ENST00000335137", 
"ENST00000423372", "ENST00000423372", "ENST00000426406"), exon_number = c(1, 
2, 1, 1, 2, 1), gene_name = c("AR", "AL627309.2", "OR4F5", 
"AL627309.1", "AL627309.1", "A"), exon_id = c("ENSE00003076518", 
"ENSE00003074125", "ENSE00002319515", "ENSE00002221580", "ENSE00002314092", 
"ENSE00002316283")), .Names = c("X1", "V1", "V2", "V3", "V4", 
"V5", "gene_id", "transcript_id", "exon_number", "gene_name", 
"exon_id"), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, 
-6L))

As a trial, I first did it manually for one gene:

# Manual lookup for a single gene: keep the rows of `chr1` whose gene_name
# equals `ma`, then narrow them to rows with V4 >= a and V5 <= b.
ma <- c("A")
gene_file <- filter(chr1, gene_name == ma)
a <- 183471387
b <- 183471526
a_start <- filter(gene_file, V4 >= a)
a_stop <- filter(a_start, V5 <= b)

a_stop contains rows, so this gives the desired result.

But because there are multiple gene names, I can't do it manually like this for each one. So I tried: result = data.frame(length(chr1))

# For every row of the query table `SE_transcripts`, find the rows of the
# annotation table `chr1` that belong to the same gene and whose V4..V5 range
# lies inside the query exon (V4 >= exonStart_0base and V5 <= exonEnd), and
# store the matching transcript id(s) in column 1 of `result`.
#
# Relies on `SE_transcripts`, `chr1` and `result` already existing.
# NOTE(review): exonStart_0base is 0-based by its name; if V4 is a 1-based
# start (as in GTF files) the start comparison may need a +1 -- confirm.
for (i in seq_len(nrow(SE_transcripts))) {
  # Iterate over rows: length() of a data frame is its number of columns, so
  # `for (i in length(df))` would run exactly once with i = ncol(df).
  ma <- SE_transcripts$geneSymbol[i]
  a <- SE_transcripts$exonStart_0base[i]
  b <- SE_transcripts$exonEnd[i]

  # which() drops NA comparisons, so rows with a missing gene_name are skipped.
  gene_file <- chr1[which(chr1$gene_name == ma), ]

  # Reset on every iteration so a skipped row never reuses the previous match.
  idx <- integer(0)
  if (!is.na(a) && a != 0) {
    # Elementwise `&` builds a per-row mask; `&&` only accepts scalars.
    idx <- which(gene_file$V4 >= a & gene_file$V5 <= b)
  }

  # Zero matches -> NA; several matches -> unique ids joined by ",", so the
  # assignment always writes exactly one value into the cell.
  ids <- unique(gene_file$transcript_id[idx])
  result[i, 1] <- if (length(ids) > 0) {
    paste(ids, collapse = ",")
  } else {
    NA_character_
  }
}

I think there is an error in the if block.

I need to add the transcript ID, exon ID, and gene ID to the query file.

Please help me sort this out.

zx8754
  • 52,746
  • 12
  • 114
  • 209
sdabral
  • 11
  • 4
  • 1
    There is no need for "for" loops; this is called "merge on overlap ranges/intervals". Relevant post: https://stackoverflow.com/questions/24480031/overlap-join-with-start-and-end-positions – zx8754 May 17 '19 at 12:00
  • Let us know if linked post helped to solve your problem, then we can close as duplicate. – zx8754 May 17 '19 at 12:02
  • You can close this issue. Thank you. – sdabral May 20 '19 at 06:09

0 Answers0