I have an R script that takes about a day to run using sqldf. I am now trying to use the data.table package instead, but I don't know how to convert this script so that it works with data.table.
> dput(df[1:4, ])
structure(list(chr = c("chr1", "chr1", "chr1", "chr1"), cpg = c(4222,
4234, 4235, 4313), count_c = c(0L, 0L, 0L, 2L), total_coverage = c(8L,
6L, 8L, 8L)), row.names = 12:15, class = "data.frame")
> dput(annotation_with_total_cpgs[1:4, ])
structure(list(gene_id = c("PSOL00004", "PSOL00004", "PSOL00004-TA",
"PSOL00004-TA"), chr = c("chr5", "chr5", "chr5", "chr5"), start = c(9914646L,
9914646L, 9914646L, 9914646L), end = c(9917882L, 9917882L, 9914818L,
9914818L), feature = c("gene", "mRNA", "CDS", "exon"), cpg_count = c(101L,
101L, 11L, 11L)), row.names = c(NA, 4L), class = "data.frame")
# Load the two input tables used by the join below.
# Each assignment must be its own statement: two assignments on a single line
# without a `;` or newline between them do not parse in R.

# Per-site coverage table; the sample dump above shows the columns
# chr, cpg, count_c and total_coverage.
df <- read.table("final_coverage.txt", header = TRUE)

# Feature annotation table; the sample dump above shows the columns
# gene_id, chr, start, end, feature and cpg_count.
# NOTE(review): read_table() is not base R -- presumably readr::read_table();
# confirm the providing package is attached before this line runs.
annotation_with_total_cpgs <- read_table("total_cpgs.txt")
# Range join: pair every row of `df` with each annotation row on the same
# chromosome whose [start, end] interval (both ends inclusive) contains the
# site position `cpg`. LEFT JOIN keeps sites without any matching annotation;
# their annot.* columns come back as NA.
# Note that the result carries two columns named `chr` (sample and annot).
output <- sqldf(
  x = "
SELECT sample.chr, sample.cpg, sample.count_c,
sample.total_coverage, annot.chr, annot.start,
annot.end, annot.gene_id, annot.cpg_count, annot.feature
FROM df AS sample
LEFT JOIN annotation_with_total_cpgs AS annot
ON sample.chr = annot.chr
AND (sample.cpg >= annot.start AND sample.cpg <= annot.end)
"
)

# Keep only the rows that received an annotation, i.e. drop every row whose
# gene_id is NA after the join.
output <- output[!is.na(output[["gene_id"]]), ]