0

I have two matrices like

I want to get some intersection

 > head(small[1:3,])
  Chrom1    Start1      End1 Strand1 Chrom2    Start2      End2
1      1  28677074  28677079       +      1  28706324  28706329
2      1 186383731 186383738       +      1 186383731 186383732
3      1 245902589 245902590       +      1 246007384 246007385
  Strand2
1       -
2       -
3       -
> 
    > dim(small)
[1] 1594    8

https://www.dropbox.com/s/mkplrg1236f8qtd/111.txt?dl=0

And a big matrix

> head(big[1:3,])
  Ensembl_Gene_ID Chromosome_Name Gene_Start Gene_End
1 ENSG00000233440              13   23708313 23708703
2 ENSG00000207157              13   23726725 23726825
3 ENSG00000229483              13   23743974 23744736
  Associated_Gene_Name X5UTR_Start X5UTR_End X3UTR_Start X3UTR_End
1              HMGA1P6          NA        NA          NA        NA
2               RNY3P4          NA        NA          NA        NA
3            LINC00362          NA        NA          NA        NA
  Transcript_count Gene_Biotype Exon_Chr_Start Exon_Chr_End
1                1   pseudogene       23708313     23708703
2                1     misc_RNA       23726725     23726825
3                1      lincRNA       23744691     23744736
  Exon_Rank_in_Transcript Ensembl_Transcript_ID Transcript_Start
1                       1       ENST00000418454         23708313
2                       1       ENST00000384428         23726725
3                       1       ENST00000414345         23743974
  Transcript_End strand
1       23708703      1
2       23726825     -1
3       23744736     -1
> dim(big)
[1] 1048575      18
> 

https://www.dropbox.com/s/bit4iw2ne19td63/big.txt?dl=0

I need somethis like

Chrom1  Start1  End1    Strand1 Chrom2  Start2  End2    Strand2 GeneName.node1  GeneName.node2
chr1    14480603    14481217    +   chr1    14747789    14748719    -   -   -
chr1    16169956    16170596    +   chr1    16217823    16218463    -   RP11-169K16.9   SPEN

I have R script like

#### Determining breakpoint locations: and adding to table

small$breakpoint1 <- apply(small[,c("Strand1","Start1","End1")], 1,
                             function(x) ifelse(x[1] == "+",as.numeric(x[3]),
                                                as.numeric(x[2])))
small$breakpoint2 <- apply(small[,c("Strand2","Start2","End2")], 1,
                             function(x) ifelse(x[1] == "+",as.numeric(x[3]),
                                                as.numeric(x[2])))
svinfo$breakpoint1.ordered <- apply(svinfo[,c("breakpoint1","breakpoint2")],1,
                                     function(x) min(as.numeric(x[1]),as.numeric(x[2])))
svinfo$breakpoint2.ordered <- apply(svinfo[,c("breakpoint1","breakpoint2")],1,
                                     function(x) max(as.numeric(x[1]),as.numeric(x[2])))

  #######
  ### Start SV annotation:
  gr.hg19.gene <-  GRanges(
    seqnames = Rle(hg19$Chromosome_Name),
    ranges = IRanges(start=as.numeric(hg19$Gene_Start), end = as.numeric(hg19$Gene_End)),
    EnsemblGeneID = hg19$Ensembl_Gene_ID,
    GeneName = hg19$Associated_Gene_Name,
    TranscriptCount = hg19$Transcript_count,
    ExonChrStart = hg19$Exon_Chr_Start,
    ExonChrEnd = hg19$Exon_Chr_End,
    ExonRankInTranscript = hg19$Exon_Rank_in_Transcript,
    EnsemblTranscriptID = hg19$Ensembl_Transcript_ID,
    TranscriptStart = hg19$Transcript_Start,
    TranscriptEnd = hg19$Transcript_End,
    GeneStrand = hg19$Strand,
    ExonID = paste(hg19$Exon_Chr_Start,hg19$Exon_Chr_End,sep=".")
  )

  gr.svinfo.node1 <- GRanges(
    seqnames = Rle(svinfo$Chrom1),
    ranges = IRanges(start=as.numeric(svinfo$breakpoint1), end = as.numeric(svinfo$breakpoint1)),
    Type = svinfo$Type,
    Node1.Strand = svinfo$Strand1,
    ID = svinfo$ID
  )
  gr.svinfo.node2 <- GRanges(
    seqnames = Rle(svinfo$Chrom2),
    ranges = IRanges(start=as.numeric(svinfo$breakpoint2), end = as.numeric(svinfo$breakpoint2)),
    Type = svinfo$Type,
    Node2.Strand = svinfo$Strand2,
    ID = svinfo$ID
  )

But I don't know how to get genes related to each parts of small matrix

Could any one please help me?

Angel
  • 151
  • 1
  • 11

2 Answers2

2

The following should do it as a first step. First load your (modified) data example:

small <- read.table(text = "Chromosome chromStart  chromEnd
1          1   28677074  28677079
2          1  186383731 186383738
3          1  245902589 245902590
4          2   56345643  56345645
5          3   59766214  59766217
6          3   60270545  60270548")

big <- read.table(text = "
Chromosome chromStart chromEnd      Gene
1         1   28677075 28677078   HMGA1P6
2         13   23726725 23726825    RNY3P4
3         13   23743974 23744736 LINC00362
4         13   23743974 23744736 LINC00362
5         13   23791571 23791673  RNU6-58P
6         13   23817659 23821323  TATDN2P3")

Next, the code for identifying te genes corresponding to the regions.

small$Gene <- NA  # Initialize an "empty" colum to fill
for (i in seq_len(nrow(small))) {
  # Find indicies where the genes falls into the chromosome and region
  j <- which(big$Chromosome == small[i, "Chromosome"] &
               big$chromStart >= small[i, "chromStart"] &
               big$chromEnd <= small[i, "chromEnd"])

  # Fetch the gene corresponding to the indicies and collapse (if more than one)
  small[i, "Gene"] <- paste(big$Gene[j], collapse = ";")
}

print(small)
#  Chromosome chromStart  chromEnd    Gene
#1          1   28677074  28677079 HMGA1P6
#2          1  186383731 186383738        
#3          1  245902589 245902590        
#4          2   56345643  56345645        
#5          3   59766214  59766217        
#6          3   60270545  60270548  

Of course, using a for-loop might not be optimal. But notice we a looping over the small matrix, and utilizing vectorization by comparing each row in small with all rows in big. It should be fairly quick even on your full data.

As you need to compare each gene with every region on the chromosome, you can probably optimize code a bit more for speed.

I have "accounted" for the possibility a more than one gene might fall in the region defined by the records of small.

Edit:

If you are looking for simply "overlap" of chromosome regions and genes, you need to define j above like the following instead:

j <- which(
  big$Chromosome == small[i, "Chromosome"] & (
    (small[i, "chromStart"] <= big$chromStart & big$chromStart <= small[i, "chromEnc"]) | # Gene starts in region
    (small[i, "chromStart"] <= big$chromEnd & big$chromEnd <= small[i, "chromEnd"])  # Gene ends in region
  ) 
)

If I am not mistaken. Basically, this should check if the genes start or stop within the region.

Anders Ellern Bilgrau
  • 9,928
  • 1
  • 30
  • 37
  • Thank you but returned empty column Gene in small matrix :( – Angel May 05 '19 at 18:35
  • Do you agree with the logic of the code? I.e. for a given region in `small`, you want find the genes that fall within it, correct? – Anders Ellern Bilgrau May 05 '19 at 18:36
  • Yes you right, so you mean there is no genes in overlap of ranges in two matrices? – Angel May 05 '19 at 18:37
  • The code above looks for genes whose regions are *entirely* contained with in the regions defined in small. It does *not* check for **general** overlap. The start of the gene should be *after* the start of the chromosome region, and the end of the gene should be *before* the end of the chromosome region. (If I have not made a mistake). Do you have any examples of 'overlap'? – Anders Ellern Bilgrau May 05 '19 at 18:42
  • I have added some code which tries to look for "overlap". If neither works (depending on what you want), I suspect there are no examples in your data. – Anders Ellern Bilgrau May 05 '19 at 18:52
  • Sorry I have small matrix and big matrix (annotation) even R code for annotation but I don't know how to use that – Angel May 05 '19 at 18:53
1

This seems like foverlaps from data.table would be useful. See this answer:

Finding Overlaps between interval sets / Efficient Overlap Joins

Using @Anders Ellern Bilgrau's dataset, this implements foverlaps

library(data.table)

setDT(small)
setDT(big)

setkey(big, Chromosome, chromStart, chromEnd)

foverlaps(small, big)

#   Chromosome chromStart chromEnd    Gene i.chromStart i.chromEnd
#1:          1   28677075 28677078 HMGA1P6     28677074   28677079
#2:          1         NA       NA    <NA>    186383731  186383738
#3:          1         NA       NA    <NA>    245902589  245902590
#4:          2         NA       NA    <NA>     56345643   56345645
#5:          3         NA       NA    <NA>     59766214   59766217
#6:          3         NA       NA    <NA>     60270545   60270548
Cole
  • 11,130
  • 1
  • 9
  • 24