I have a gff
file and a csv
file which looks like:
# CSV dataframe
file.csv <- read.table(text = "Sample Name Estimate Std.Err P.Adjust
Sample_1 B005300.2.1 0.345930183 0.05662846 1.58E-06
Sample_1 B005230.2.1 0.048159129 0.013862871 0.019181546
Sample_1 B006450.2.1 -0.263951161 0.079297432 0.027327576
Sample_2 B005230.2.1 39.04308043 11.23861018 0.019181546
Sample_2 B006260.1.1 0.003968994 0.00063087 6.12E-07
Sample_2 B006170.2.1 0.117171563 0.024018888 0.000272761
Sample_3 B006450.2.1 0.012033053 0.003670908 0.030632664
Sample_3 B006980.1-c2.1 -0.007653796 0.002047582 0.009944649
Sample_3 B006980.1.1 -0.011369481 0.002871014 0.00539717", header = TRUE)
# GFF GRanges, example data
#dput(head(GFF))
GFF <- new("GRanges", seqnames = new("Rle", values = structure(1L, .Label = c("Bch01", "Bch02", "Bch03", "Bch04", "Bch05"), class = "factor"), lengths = 6L,
elementMetadata = NULL, metadata = list()), ranges = new("IRanges",
start = c(21882L, 21882L, 21882L, 21882L, 22697L, 22697L),
width = c(126L, 126L, 126L, 126L, 60L, 60L), NAMES = NULL,
elementType = "ANY", elementMetadata = NULL, metadata = list()),
strand = new("Rle", values = structure(2L, .Label = c("+",
"-", "*"), class = "factor"), lengths = 6L, elementMetadata = NULL,
metadata = list()), seqinfo = new("Seqinfo", seqnames = c("Bch01", "Bch02", "Bch03", "Bch04", "Bch05"), seqlengths = c(NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_), is_circular = c(NA, NA, NA, NA, NA), genome = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_)), elementMetadata = new("DFrame", rownames = NULL, nrows = 6L,
listData = list(source = structure(c(1L, 1L, 1L, 1L,
1L, 1L), .Label = "maker", class = "factor"), type = structure(c(1L,
2L, 3L, 4L, 1L, 2L), .Label = c("CDS", "exon", "gene",
"mRNA", "three_prime_UTR", "five_prime_UTR"), class = "factor"),
score = c(NA, NA, NA, 126, NA, NA), phase = c(0L,
NA, NA, NA, 0L, NA), ID = c("B024400.1.1:cds",
"B024400.1.1:exon:2", "B024400.1",
"B024400.1.1", "B008910.1.1:cds",
"B008910.1.1:exon:4"), Parent = new("CompressedCharacterList",
elementType = "character", elementMetadata = NULL,
metadata = list(), unlistData = c("B024400.1.1",
"B024400.1.1", "B024400.1",
"B008910.1.1", "B008910.1.1"
), partitioning = new("PartitioningByEnd", end = c(1L,
2L, 2L, 3L, 4L, 5L), NAMES = NULL, elementType = "ANY",
elementMetadata = NULL, metadata = list())),
Name = c(NA, NA, "B024400.1", "B024400.1.1",
NA, NA), Note = new("CompressedCharacterList", elementType = "character",
elementMetadata = NULL, metadata = list(), unlistData = c("Similar to B024400.1.1: LOW QUALITY:50S ribosomal protein L4, chloroplastic", "Similar to B024400.1.1: LOW QUALITY:50S ribosomal protein L4, chloroplastic"), partitioning = new("PartitioningByEnd", end = c(0L,
0L, 1L, 2L, 2L, 2L), NAMES = NULL, elementType = "ANY",
elementMetadata = NULL, metadata = list())),
ref_id = c(NA, NA, "B024400.1.1", "B024400.1.1",
NA, NA), Dbxref = new("CompressedCharacterList",
elementType = "character", elementMetadata = NULL,
metadata = list(), unlistData = character(0),
partitioning = new("PartitioningByEnd", end = c(0L,
0L, 0L, 0L, 0L, 0L), NAMES = NULL, elementType = "ANY",
elementMetadata = NULL, metadata = list())),
Ontology_term = new("CompressedCharacterList", elementType = "character",
elementMetadata = NULL, metadata = list(), unlistData = character(0),
partitioning = new("PartitioningByEnd", end = c(0L,
0L, 0L, 0L, 0L, 0L), NAMES = NULL, elementType = "ANY",
elementMetadata = NULL, metadata = list()))),
elementType = "ANY", elementMetadata = NULL, metadata = list()),
elementType = "ANY", metadata = list())
I want to merge both the files by column Name
. I tried:
GFF = rtracklayer::import("gene_models.gff")
merge_data<-merge(file.csv,GFF,by="Name")
But in the csv file, I have same Name
values for different Samples
such as B005230.2.1
for both Sample1
and Sample2
whereas it only occurs in GFF
file once. Because of this, the merge file is messed up. I will appreciate any help to fix this problem. Thank you!