0

I have two files: one contains specific chromosome positions, and the other contains each gene's name together with its start-to-end range. I need to find the gene name for each position by checking whether the position falls between a gene's start and end. The format of my 1st file is shown below:
Chromosome  Position
    1         394
    1         447
    2         534

Format of my 2nd file is:

gene_name  chromoome  start  end
   pqr         1       201   230
   sbc         1       300   450
   ffg         2       500   550

I have tried the following code

# Map each query position to the gene(s) whose [start, end] range contains it
# on the same chromosome, and print one gene_name / Chromosome row per match.
setwd('/home/R/')

# Both files begin with a header row (see the formats shown above). Without
# header = TRUE that row is read as data, every column becomes text, and the
# range comparisons below turn into string comparisons.
data <- read.table(file = 'outfile.tsv', header = TRUE, fill = TRUE)
data1 <- read.table(file = 'Sample.tsv', header = TRUE)

# Columns are taken by position, in the order shown in the file formats.
# Chromosome labels are compared as text so that numeric and non-numeric
# labels are handled the same way.
chr <- as.character(data1[, 1])
pos <- data1[, 2]
gene <- as.character(data[, 1])
gene_chr <- as.character(data[, 2])
beg <- data[, 3]
end <- data[, 4]

# seq_along() keeps the iteration empty when there are no positions;
# 1:length(pos) would iterate over c(1, 0) instead.
matches <- lapply(seq_along(pos), function(i) {
  # A gene matches only if it lies on the same chromosome AND spans the
  # position. which() also drops rows where fill = TRUE left NA values.
  idx <- which(gene_chr == chr[i] & beg <= pos[i] & pos[i] <= end)
  data.frame(
    gene_name = gene[idx],
    Chromosome = rep(chr[i], length(idx)),
    stringsAsFactors = FALSE
  )
})

# One row per (position, gene) match, in the order of the positions file.
# Positions that fall inside no gene contribute no rows.
result <- do.call(rbind, matches)
print(result, row.names = FALSE)

I want the result in the format below:

gene_name   Chromosome   #chromosome against position
sbc              1
sbc              1
ffg              2

Any help would be appreciated.

  • 1
    Your below format just has the column names. What's your desired output? – sm925 Mar 05 '18 at 16:16
  • 1
    Related: [roll join with start/end window](https://stackoverflow.com/questions/24480031/roll-join-with-start-end-window) – Henrik Mar 05 '18 at 16:22

1 Answer

0

You can try using the GenomicRanges package

library(GenomicRanges)

# Gene table: one row per gene with its chromosome and start/end coordinates.
target <- read.table(text="gene_name  chromoome  start  end
   pqr         1       201   230
   sbc         1       300   450
   ffg         2       500   550", header = TRUE)

# Query positions as width-1 ranges, one per (chromosome, position) pair.
d <- GRanges(c(1, 1, 2), IRanges(c(394, 447, 534), width = 1))

# Gene intervals as ranges, in the same row order as `target`.
target_range <- GRanges(
  target$chromoome,
  IRanges(start = target$start, end = target$end)
)

# findOverlaps() pairs each query in `d` with the ranges in `target_range`
# that it overlaps. subjectHits() gives the matching `target_range` indices,
# which are also row numbers of `target`, so indexing returns one gene row
# per hit (a gene is repeated when several positions fall inside it).
OL <- findOverlaps(d, target_range)
target[subjectHits(OL), ]
    gene_name chromoome start end
2         sbc         1   300 450
2.1       sbc         1   300 450
3         ffg         2   500 550

Or by merging

# Alternative: carry the gene names along with the ranges and merge on overlap.
# The variable names `d` and `target_range` are kept as-is because the result
# shown below labels its columns after them.
target_range <- GRanges(
  target$chromoome,
  IRanges(start = target$start, end = target$end),
  names = target$gene_name
)
mergeByOverlaps(d, target_range)
DataFrame with 3 rows and 3 columns
          d target_range    names
  <GRanges>    <GRanges> <factor>
1 1:394-394    1:300-450      sbc
2 1:447-447    1:300-450      sbc
3 2:534-534    2:500-550      ffg
Roman
  • 17,008
  • 3
  • 36
  • 49