I am doing a analysis where i need to add transcript id
This is not actual data .only for reference query file =
structure(list(geneSymbol = c("A", "AR", "A1", "A12",
"A7", "A9A"), chr = c("chr1", "chr1", "chr1", "chr5", "chr3",
"chr2"), exonStart_0base = c(105259463, 32128564, 131173030,
115176193, 11322722, 220093145), exonEnd = c(105259641, 32128639,
131173039, 115176309, 11322788, 220093207), upstreamES = c(105258934,
32120666, 131172109, 115173324, 11313994, 220092643), upstreamEE = c(105259059,
32120728, 131172210, 115173461, 11314116, 220092775), downstreamES = c(105261820,
32132388, 131179781, 115176514, 11323878, 220094256), downstreamEE = c(105261841,
32132514, 131179868, 115176631, 11324124, 220094361)), .Names = c("geneSymbol",
"chr", "exonStart_0base", "exonEnd", "upstreamES", "upstreamEE",
"downstreamES", "downstreamEE"), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
File structure where transcript id is present -
structure(list(X1 = c(1, 2, 3, 4, 5, 6), V1 = c("chr1", "chr1",
"chr1", "chr1", "chr1", "chr1"), V2 = c("protein_coding", "protein_coding",
"protein_coding", "protein_coding", "protein_coding", "protein_coding"
), V3 = c("exon", "exon", "exon", "exon", "exon", "exon"), V4 = c(53049,
54830, 69091, 137621, 134901, 367640), V5 = c(53067, 54936, 70008,
139379, 135802, 368634), gene_id = c("ENSG00000268020;", "ENSG00000268020;",
"ENSG00000186092;", "ENSG00000237683;", "ENSG00000237683;", "ENSG00000235249;"
), transcript_id = c("ENST00000594647", "ENST00000594647", "ENST00000335137",
"ENST00000423372", "ENST00000423372", "ENST00000426406"), exon_number = c(1,
2, 1, 1, 2, 1), gene_name = c("AR", "AL627309.2", "OR4F5",
"AL627309.1", "AL627309.1", "A"), exon_id = c("ENSE00003076518",
"ENSE00003074125", "ENSE00002319515", "ENSE00002221580", "ENSE00002314092",
"ENSE00002316283")), .Names = c("X1", "V1", "V2", "V3", "V4",
"V5", "gene_id", "transcript_id", "exon_number", "gene_name",
"exon_id"), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-6L))
For try bases -- First i did manually ==
ma = c('A')
gene_file = filter(chr1 , chr1$gene_name == ma )
a = 183471387
b =183471526
a_start = filter(gene_file , gene_file$V4 >= a)
a_stop = filter(a_start, a_start$V5 <= b)
a_stop has data so desire results.
But due to multiple gene name can't do this like this . So, i try result = data.frame(length(chr1))
for (i in length(SE_transcripts))
{
ma = SE_transcripts$geneSymbol[i]
a = SE_transcripts$exonStart_0base[i]
b = SE_transcripts$exonEnd[i]
gene_file = filter(chr1 , chr1$gene_name == ma )
if (a != 0 )
{
idx= gene_file$V4 >= a && gene_file$V5 <= b
}
result[i,1] = gene_file$transcript_id[idx]
}
I think there is error in if loop .
I need to add transcript id , exon id , gene id in query file.
Please help to sort this .