I have a file of genomic coordinates (p) and another file of variants with their locations (vars). I want to get the variants who have a start position greater than the genomic start position and an end position less that the genomic end position. The length of the variants file is less than the coordinate file. I am constantly getting this error, "longer object length is not a multiple of shorter object length." Thanks for assistance if anyone can help!!
EDIT: Here is the structure of the data:
>dput(droplevels(head(p, 4)))
structure(list(chr = structure(c(1L, 1L, 1L, 1L), .Label = "chr13", class = "factor"),
chrStart = c(19019000L, 19020000L, 19020000L, 19021000L),
chrEnd = c(19020000L, 19020000L, 19021000L, 19021000L),Number = c(1L,
29L, 53L, 60L)), .Names = c("chr", "chrStart", "chrEnd",
"Number"), row.names = c(NA, 4L), class = "data.frame")
>dput(droplevels(head(chr13, 4)))
structure(list(Var = structure(1:4, .Label = c("13:23798029-23799959",
"13:19019221-19019456", "13:19018226-19019462", "13:94818369-94822017"
), class = "factor"), Chr = c(13L, 13L, 13L, 13L), vStart = c(23798029L,
85571820L, 94818226L, 94818369L), vEnd = c(23799959L, 85574142L,
94822462L, 94822017L), CpG = structure(c(3L, 2L, 1L, 1L), .Label =c("cg17183991",
"cg17921034", "cg26611683"), class = "factor"), Gene = structure(c(2L,
1L, 3L, 3L), .Label = c("AKAP11", "FOXO1A", "HS6ST3"), class = "factor"),
width = c(16338960L, 43828646L, 720767L, 720918L), p = c(0.424,
0.418, 0.385, 0.338), X.NAME. = c(3.9026, 3.8357, 2.3456,
2.583), X.NAME..1 = c(3.7245, 3.7267, 2.3467, 2.2076), X.NAME..2 = c(4.8623,
4.7102, 3.2994, 3.1719), Sourc = structure(c(2L, 2L, 1L,
2L), .Label = c("T1", "T2"), class = "factor"),
NominalPvalue = c(0.023992, 0.0002875, 0.0049597,
0.002612036)), .Names = c("Var", "Chr", "vStart", "vEnd",
"CpG", "Gene", "width", "p", "X.NAME.", "X.NAME..1",
"X.NAME..2", "Sourc", "Normalized"), row.names = c(29L, 278L,
304L, 305L), class = "data.frame")
p <- read.csv("chr13.csv", header = TRUE)
vars <- read.csv("../variants.csv", header=T)
chr13 <-subset(vars, vars$Chr=="13")
for(i in 1:nrow(p)){
curP <- p[i,]
k <- subset(chr13, chr13$vStart > curP$chrStart & chr13$vEnd < curP$chrEnd)
for(m in 1:nrow(k)){
curM <- k[m, ]
x <- as.data.frame(curM[,1:ncol(curM)])
y <- as.data.frame(curP[,1:ncol(curP)])
if (nrow(k)>0)
keep <- data.frame(Variant=curM$Variant,
Chr=curM$Chr,
vStart=curM$vStart,
CpG=curM$Cpg,
Gene=curM$Gene,
vEnd=curM$vEnd,
ChrStart=curP$chrStart,
ChrEnd=curP$chrEnd
)
matches <- cbind(matches,keep)
}
}
file 1:
chr Start End Number
chr1 12001 13000 2
chr1 13000 13000 10
chr1 13010 14000 6
chr1 13020 15000 2
chr1 14000 15000 10
chr1 15000 15000 4
chr1 15300 16000 12
chr1 13000 51000 1
chr1 48000 52000 1
chr1 51000 52000 4
file 2:
variant chr chrStart chrEnd cpg gene
var128 1 13467 13499 cg27611665 FBXL12
var229 1 48117 48334 cg27611665 FBXL12
var509 1 568289 568419 cg2511665 FBXL12
var213 1 186392 186392 cg2558303 SLC25A4
var999 1 401909 401963 cg27472032 VPS39
var122 1 182444 182494 cg2743794 FXR1
var098 1 602184 602248 cg27398547 C14orf39
var876 1 157302 157344 cg27355746 UBTF
var287 1 163665 163709 cg2752122 PHF20L1`
So, the end result should match the following:
variant chr chrStart chrEnd CpG gene chr Start End Number
var128 1 13467 13499 cg27611665 FBXL12 chr1 13010 14000 6
var229 1 48117 48334 cg27611665 FBXL12 chr1 48000 52000 1