0

I want the pval.genomic.elements.1a data frame to contain only the P.Value column from each of four other data frames (df.x3utr.1a, df.x5utr.1a, df.cds.1a, and df.promoter.1a), which have different numbers of rows. If a row is absent from one of the data frames, its value should be NA. I then want to assign row names based on the corresponding external_gene_name column.

# Stack the four P.Value vectors as the rows of a matrix. rbind() combines
# them purely by position (not by gene name) and recycles shorter vectors
# when lengths differ, so it never inserts NA for a gene that is missing.
pval.genomic.elements.1a <- rbind(df.x3utr.1a$P.Value, df.x5utr.1a$P.Value, df.cds.1a$P.Value, df.promoter.1a$P.Value)
# Transpose so each source becomes a column, then convert to a data frame.
pval.genomic.elements.1a <- as.data.frame(t(pval.genomic.elements.1a))
# Label the four columns, in the same order the vectors were passed to rbind().
colnames(pval.genomic.elements.1a) <- c("X3UTR", "X5UTR", "CDS", "promCore")
# NOTE(review): the value assigned here is a list of four vectors (length 4),
# not one name per row; this appears to be what raises the
# "invalid 'row.names' length" error reported in the traceback.
rownames(pval.genomic.elements.1a) <- list(df.x3utr.1a$external_gene_name, df.x5utr.1a$external_gene_name, df.cds.1a$external_gene_name, df.promoter.1a$external_gene_name)

Traceback:

Error in `.rowNamesDF<-`(x, value = value) : invalid 'row.names' length

Data:

> dput(df.x3utr.1a)
structure(list(seqnames = structure(c(12L, 19L, 7L, 12L, 23L), levels = c("chr1", 
"chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", 
"chr10", "chr11", "chr12", "chr13", "chr14", "chr15", "chr16", 
"chr17", "chr18", "chr19", "chr20", "chr21", "chr22", "chrX", 
"chrY"), class = "factor"), start = c(9067664L, 1039997L, 87401696L, 
122920951L, 153724856L), end = c(9116229L, 1065572L, 87480435L, 
122981649L, 153744755L), width = c(48566L, 25576L, 78740L, 60699L, 
19900L), strand = structure(c(2L, 1L, 2L, 2L, 1L), levels = c("+", 
"-", "*"), class = "factor"), ensembl_gene_id = c("ENSG00000175899", 
"ENSG00000064687", "ENSG00000005471", "ENSG00000150967", "ENSG00000101986"
), external_gene_name = c("A2M", "ABCA7", "ABCB4", "ABCB9", "ABCD1"
), P.Value = c(4.69416065754249e-42, 6.0513299332676e-38, 4.24604839059402e-37, 
2.89877790887805e-36, 2.85123238031875e-33), annot.seqnames = structure(c(12L, 
19L, 7L, 12L, 23L), levels = c("chr1", "chr2", "chr3", "chr4", 
"chr5", "chr6", "chr7", "chr8", "chr9", "chr10", "chr11", "chr12", 
"chr13", "chr14", "chr15", "chr16", "chr17", "chr18", "chr19", 
"chr20", "chr21", "chr22", "chrX", "chrY", "chrM", "chr1_gl000191_random", 
"chr1_gl000192_random", "chr4_ctg9_hap1", "chr4_gl000193_random", 
"chr4_gl000194_random", "chr6_apd_hap1", "chr6_cox_hap2", "chr6_dbb_hap3", 
"chr6_mann_hap4", "chr6_mcf_hap5", "chr6_qbl_hap6", "chr6_ssto_hap7", 
"chr7_gl000195_random", "chr8_gl000196_random", "chr8_gl000197_random", 
"chr9_gl000198_random", "chr9_gl000199_random", "chr9_gl000200_random", 
"chr9_gl000201_random", "chr11_gl000202_random", "chr17_ctg5_hap1", 
"chr17_gl000203_random", "chr17_gl000204_random", "chr17_gl000205_random", 
"chr17_gl000206_random", "chr18_gl000207_random", "chr19_gl000208_random", 
"chr19_gl000209_random", "chr21_gl000210_random", "chrUn_gl000211", 
"chrUn_gl000212", "chrUn_gl000213", "chrUn_gl000214", "chrUn_gl000215", 
"chrUn_gl000216", "chrUn_gl000217", "chrUn_gl000218", "chrUn_gl000219", 
"chrUn_gl000220", "chrUn_gl000221", "chrUn_gl000222", "chrUn_gl000223", 
"chrUn_gl000224", "chrUn_gl000225", "chrUn_gl000226", "chrUn_gl000227", 
"chrUn_gl000228", "chrUn_gl000229", "chrUn_gl000230", "chrUn_gl000231", 
"chrUn_gl000232", "chrUn_gl000233", "chrUn_gl000234", "chrUn_gl000235", 
"chrUn_gl000236", "chrUn_gl000237", "chrUn_gl000238", "chrUn_gl000239", 
"chrUn_gl000240", "chrUn_gl000241", "chrUn_gl000242", "chrUn_gl000243", 
"chrUn_gl000244", "chrUn_gl000245", "chrUn_gl000246", "chrUn_gl000247", 
"chrUn_gl000248", "chrUn_gl000249"), class = "factor"), annot.start = c(9086466L, 
1042574L, 87445577L, 122956146L, 153733327L), annot.end = c(9086608L, 
1042743L, 87445577L, 122958043L, 153735141L), annot.width = c(143L, 
170L, 1L, 1898L, 1815L), annot.strand = structure(c(1L, 1L, 1L, 
2L, 2L), levels = c("+", "-", "*"), class = "factor"), annot.id = c("3UTR:35638", 
"3UTR:50578", "3UTR:22629", "3UTR:38521", "3UTR:60645"), annot.tx_id = c("uc010sgn.1", 
"uc010dsa.3", "uc011khd.1", "uc001ucl.3", "uc004fls.2"), annot.gene_id = c("1911", 
"10347", "154661", "55596", "60343"), annot.symbol = c("PHC1", 
"ABCA7", "RUNDC3B", "ZCCHC8", "FAM3A"), annot.type = c("hg19_genes_3UTRs", 
"hg19_genes_3UTRs", "hg19_genes_3UTRs", "hg19_genes_3UTRs", "hg19_genes_3UTRs"
)), row.names = c(233L, 818L, 934L, 1128L, 1380L), class = "data.frame")

> dput(df.x5utr.1a)
structure(list(seqnames = structure(c(12L, 12L, 6L, 19L, 12L), levels = c("chr1", 
"chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", 
"chr10", "chr11", "chr12", "chr13", "chr14", "chr15", "chr16", 
"chr17", "chr18", "chr19", "chr20", "chr21", "chr22", "chrX", 
"chrY"), class = "factor"), start = c(9067664L, 53307456L, 44298731L, 
1039997L, 122920951L), end = c(9116229L, 53324864L, 44313347L, 
1065572L, 122981649L), width = c(48566L, 17409L, 14617L, 25576L, 
60699L), strand = structure(c(2L, 2L, 2L, 1L, 2L), levels = c("+", 
"-", "*"), class = "factor"), ensembl_gene_id = c("ENSG00000175899", 
"ENSG00000094914", "ENSG00000124608", "ENSG00000064687", "ENSG00000150967"
), external_gene_name = c("A2M", "AAAS", "AARS2", "ABCA7", "ABCB9"
), P.Value = c(4.69416065754249e-42, 1.39715882764931e-40, 4.61528350616333e-40, 
6.0513299332676e-38, 2.89877790887805e-36), annot.seqnames = structure(c(12L, 
12L, 6L, 19L, 12L), levels = c("chr1", "chr2", "chr3", "chr4", 
"chr5", "chr6", "chr7", "chr8", "chr9", "chr10", "chr11", "chr12", 
"chr13", "chr14", "chr15", "chr16", "chr17", "chr18", "chr19", 
"chr20", "chr21", "chr22", "chrX", "chrY", "chrM", "chr1_gl000191_random", 
"chr1_gl000192_random", "chr4_ctg9_hap1", "chr4_gl000193_random", 
"chr4_gl000194_random", "chr6_apd_hap1", "chr6_cox_hap2", "chr6_dbb_hap3", 
"chr6_mann_hap4", "chr6_mcf_hap5", "chr6_qbl_hap6", "chr6_ssto_hap7", 
"chr7_gl000195_random", "chr8_gl000196_random", "chr8_gl000197_random", 
"chr9_gl000198_random", "chr9_gl000199_random", "chr9_gl000200_random", 
"chr9_gl000201_random", "chr11_gl000202_random", "chr17_ctg5_hap1", 
"chr17_gl000203_random", "chr17_gl000204_random", "chr17_gl000205_random", 
"chr17_gl000206_random", "chr18_gl000207_random", "chr19_gl000208_random", 
"chr19_gl000209_random", "chr21_gl000210_random", "chrUn_gl000211", 
"chrUn_gl000212", "chrUn_gl000213", "chrUn_gl000214", "chrUn_gl000215", 
"chrUn_gl000216", "chrUn_gl000217", "chrUn_gl000218", "chrUn_gl000219", 
"chrUn_gl000220", "chrUn_gl000221", "chrUn_gl000222", "chrUn_gl000223", 
"chrUn_gl000224", "chrUn_gl000225", "chrUn_gl000226", "chrUn_gl000227", 
"chrUn_gl000228", "chrUn_gl000229", "chrUn_gl000230", "chrUn_gl000231", 
"chrUn_gl000232", "chrUn_gl000233", "chrUn_gl000234", "chrUn_gl000235", 
"chrUn_gl000236", "chrUn_gl000237", "chrUn_gl000238", "chrUn_gl000239", 
"chrUn_gl000240", "chrUn_gl000241", "chrUn_gl000242", "chrUn_gl000243", 
"chrUn_gl000244", "chrUn_gl000245", "chrUn_gl000246", "chrUn_gl000247", 
"chrUn_gl000248", "chrUn_gl000249"), class = "factor"), annot.start = c(9070226L, 
53320234L, 44310397L, 1040102L, 122962813L), annot.end = c(9070273L, 
53320253L, 44310497L, 1040195L, 122963080L), annot.width = c(48L, 
20L, 101L, 94L, 268L), annot.strand = structure(c(1L, 2L, 1L, 
1L, 2L), levels = c("+", "-", "*"), class = "factor"), annot.id = c("5UTR:63551", 
"5UTR:67226", "5UTR:35114", "5UTR:90129", "5UTR:68634"), annot.tx_id = c("uc001qvc.1", 
"uc009zmk.1", "uc021yzz.1", "uc010dsa.3", "uc001ucl.3"), annot.gene_id = c("1911", 
"3856", "221409", "10347", "55596"), annot.symbol = c("PHC1", 
"KRT8", "SPATS1", "ABCA7", "ZCCHC8"), annot.type = c("hg19_genes_5UTRs", 
"hg19_genes_5UTRs", "hg19_genes_5UTRs", "hg19_genes_5UTRs", "hg19_genes_5UTRs"
)), row.names = c(58L, 251L, 266L, 520L, 984L), class = "data.frame")

> dput(df.cds.1a)
structure(list(seqnames = structure(c(12L, 12L, 6L, 17L, 19L), levels = c("chr1", 
"chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", 
"chr10", "chr11", "chr12", "chr13", "chr14", "chr15", "chr16", 
"chr17", "chr18", "chr19", "chr20", "chr21", "chr22", "chrX", 
"chrY"), class = "factor"), start = c(9067664L, 53307456L, 44298731L, 
42950526L, 1039997L), end = c(9116229L, 53324864L, 44313347L, 
42964498L, 1065572L), width = c(48566L, 17409L, 14617L, 13973L, 
25576L), strand = structure(c(2L, 2L, 2L, 2L, 1L), levels = c("+", 
"-", "*"), class = "factor"), ensembl_gene_id = c("ENSG00000175899", 
"ENSG00000094914", "ENSG00000124608", "ENSG00000266967", "ENSG00000064687"
), external_gene_name = c("A2M", "AAAS", "AARS2", "AARSD1", "ABCA7"
), P.Value = c(4.69416065754249e-42, 1.39715882764931e-40, 4.61528350616333e-40, 
1.41708178632472e-39, 6.0513299332676e-38), annot.seqnames = structure(c(12L, 
12L, 6L, 17L, 19L), levels = c("chr1", "chr2", "chr3", "chr4", 
"chr5", "chr6", "chr7", "chr8", "chr9", "chr10", "chr11", "chr12", 
"chr13", "chr14", "chr15", "chr16", "chr17", "chr18", "chr19", 
"chr20", "chr21", "chr22", "chrX", "chrY", "chrM", "chr1_gl000191_random", 
"chr1_gl000192_random", "chr4_ctg9_hap1", "chr4_gl000193_random", 
"chr4_gl000194_random", "chr6_apd_hap1", "chr6_cox_hap2", "chr6_dbb_hap3", 
"chr6_mann_hap4", "chr6_mcf_hap5", "chr6_qbl_hap6", "chr6_ssto_hap7", 
"chr7_gl000195_random", "chr8_gl000196_random", "chr8_gl000197_random", 
"chr9_gl000198_random", "chr9_gl000199_random", "chr9_gl000200_random", 
"chr9_gl000201_random", "chr11_gl000202_random", "chr17_ctg5_hap1", 
"chr17_gl000203_random", "chr17_gl000204_random", "chr17_gl000205_random", 
"chr17_gl000206_random", "chr18_gl000207_random", "chr19_gl000208_random", 
"chr19_gl000209_random", "chr21_gl000210_random", "chrUn_gl000211", 
"chrUn_gl000212", "chrUn_gl000213", "chrUn_gl000214", "chrUn_gl000215", 
"chrUn_gl000216", "chrUn_gl000217", "chrUn_gl000218", "chrUn_gl000219", 
"chrUn_gl000220", "chrUn_gl000221", "chrUn_gl000222", "chrUn_gl000223", 
"chrUn_gl000224", "chrUn_gl000225", "chrUn_gl000226", "chrUn_gl000227", 
"chrUn_gl000228", "chrUn_gl000229", "chrUn_gl000230", "chrUn_gl000231", 
"chrUn_gl000232", "chrUn_gl000233", "chrUn_gl000234", "chrUn_gl000235", 
"chrUn_gl000236", "chrUn_gl000237", "chrUn_gl000238", "chrUn_gl000239", 
"chrUn_gl000240", "chrUn_gl000241", "chrUn_gl000242", "chrUn_gl000243", 
"chrUn_gl000244", "chrUn_gl000245", "chrUn_gl000246", "chrUn_gl000247", 
"chrUn_gl000248", "chrUn_gl000249"), class = "factor"), annot.start = c(9070274L, 
53320196L, 44310833L, 42963953L, 1041361L), annot.end = c(9070387L, 
53320233L, 44310971L, 42964118L, 1041426L), annot.width = c(114L, 
38L, 139L, 166L, 66L), annot.strand = structure(c(1L, 2L, 1L, 
2L, 1L), levels = c("+", "-", "*"), class = "factor"), annot.id = c("CDS:337772", 
"CDS:357425", "CDS:188640", "CDS:464055", "CDS:479535"), annot.tx_id = c("uc001qvc.1", 
"uc009zmk.1", "uc021yzz.1", "uc031rav.1", "uc010dsa.3"), annot.gene_id = c("1911", 
"3856", "221409", "9343", "10347"), annot.symbol = c("PHC1", 
"KRT8", "SPATS1", "EFTUD2", "ABCA7"), annot.type = c("hg19_genes_cds", 
"hg19_genes_cds", "hg19_genes_cds", "hg19_genes_cds", "hg19_genes_cds"
)), row.names = c(5L, 250L, 264L, 284L, 430L), class = "data.frame")

> dput(df.promoter.1a)
structure(list(seqnames = structure(c(12L, 12L, 6L, 19L, 12L), levels = c("chr1", 
"chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", 
"chr10", "chr11", "chr12", "chr13", "chr14", "chr15", "chr16", 
"chr17", "chr18", "chr19", "chr20", "chr21", "chr22", "chrX", 
"chrY"), class = "factor"), start = c(9067664L, 53307456L, 44298731L, 
1039997L, 122920951L), end = c(9116229L, 53324864L, 44313347L, 
1065572L, 122981649L), width = c(48566L, 17409L, 14617L, 25576L, 
60699L), strand = structure(c(2L, 2L, 2L, 1L, 2L), levels = c("+", 
"-", "*"), class = "factor"), ensembl_gene_id = c("ENSG00000175899", 
"ENSG00000094914", "ENSG00000124608", "ENSG00000064687", "ENSG00000150967"
), external_gene_name = c("A2M", "AAAS", "AARS2", "ABCA7", "ABCB9"
), P.Value = c(4.69416065754249e-42, 1.39715882764931e-40, 4.61528350616333e-40, 
6.0513299332676e-38, 2.89877790887805e-36), annot.seqnames = structure(c(12L, 
12L, 6L, 19L, 12L), levels = c("chr1", "chr2", "chr3", "chr4", 
"chr5", "chr6", "chr7", "chr8", "chr9", "chr10", "chr11", "chr12", 
"chr13", "chr14", "chr15", "chr16", "chr17", "chr18", "chr19", 
"chr20", "chr21", "chr22", "chrX", "chrY", "chrM", "chr1_gl000191_random", 
"chr1_gl000192_random", "chr4_ctg9_hap1", "chr4_gl000193_random", 
"chr4_gl000194_random", "chr6_apd_hap1", "chr6_cox_hap2", "chr6_dbb_hap3", 
"chr6_mann_hap4", "chr6_mcf_hap5", "chr6_qbl_hap6", "chr6_ssto_hap7", 
"chr7_gl000195_random", "chr8_gl000196_random", "chr8_gl000197_random", 
"chr9_gl000198_random", "chr9_gl000199_random", "chr9_gl000200_random", 
"chr9_gl000201_random", "chr11_gl000202_random", "chr17_ctg5_hap1", 
"chr17_gl000203_random", "chr17_gl000204_random", "chr17_gl000205_random", 
"chr17_gl000206_random", "chr18_gl000207_random", "chr19_gl000208_random", 
"chr19_gl000209_random", "chr21_gl000210_random", "chrUn_gl000211", 
"chrUn_gl000212", "chrUn_gl000213", "chrUn_gl000214", "chrUn_gl000215", 
"chrUn_gl000216", "chrUn_gl000217", "chrUn_gl000218", "chrUn_gl000219", 
"chrUn_gl000220", "chrUn_gl000221", "chrUn_gl000222", "chrUn_gl000223", 
"chrUn_gl000224", "chrUn_gl000225", "chrUn_gl000226", "chrUn_gl000227", 
"chrUn_gl000228", "chrUn_gl000229", "chrUn_gl000230", "chrUn_gl000231", 
"chrUn_gl000232", "chrUn_gl000233", "chrUn_gl000234", "chrUn_gl000235", 
"chrUn_gl000236", "chrUn_gl000237", "chrUn_gl000238", "chrUn_gl000239", 
"chrUn_gl000240", "chrUn_gl000241", "chrUn_gl000242", "chrUn_gl000243", 
"chrUn_gl000244", "chrUn_gl000245", "chrUn_gl000246", "chrUn_gl000247", 
"chrUn_gl000248", "chrUn_gl000249"), class = "factor"), annot.start = c(9102358L, 
53320254L, 44309397L, 1039102L, 122963081L), annot.end = c(9103357L, 
53321253L, 44310396L, 1040101L, 122964080L), annot.width = c(1000L, 
1000L, 1000L, 1000L, 1000L), annot.strand = structure(c(2L, 2L, 
1L, 1L, 2L), levels = c("+", "-", "*"), class = "factor"), annot.id = c("promoter:47662", 
"promoter:48253", "promoter:24819", "promoter:65812", "promoter:49202"
), annot.tx_id = c("uc001qvf.3", "uc009zmk.1", "uc021yzz.1", 
"uc010dsa.3", "uc001ucl.3"), annot.gene_id = c("4074", "3856", 
"221409", "10347", "55596"), annot.symbol = c("M6PR", "KRT8", 
"SPATS1", "ABCA7", "ZCCHC8"), annot.type = c("hg19_genes_promoters", 
"hg19_genes_promoters", "hg19_genes_promoters", "hg19_genes_promoters", 
"hg19_genes_promoters")), row.names = c(1L, 248L, 258L, 417L, 
942L), class = "data.frame")
melolili
  • 1,237
  • 6
  • 16
  • 1
    Great that you shared a reproducible example, but if you want to maximize your chances of getting meaningful answers, I'd suggest to get a minimal data set. Try to boil down your problem to a few data sets, with minimal names and structure. See here for more:https://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example – Maël May 04 '23 at 13:31
  • 3
    Thanks for providing the `dput` but the sample data are insufficient for us to examine the issue - they all have the same row length and are matrices not data frames (as described in the question). Good luck! – jpsmith May 04 '23 at 13:31
  • 1
    Note: There's no `P.Value` in the posted matrices. Maybe they're just parts of a complete data.frame. – Andre Wildberg May 04 '23 at 13:33
  • Sorry for the omission. I've edited my question. – melolili May 04 '23 at 14:47

1 Answers1

1

First, put the data frames in a list

# Collect the four per-element tables in a named list, keeping only the gene
# name and P value columns; the list names label each genomic element.
df.list <- lapply(
  list(
    X3UTR = df.x3utr.1a,
    X5UTR = df.x5utr.1a,
    CDS = df.cds.1a,
    promCore = df.promoter.1a
  ),
  function(tbl) tbl[, c("external_gene_name", "P.Value")]
)

Then get the unique gene names across all data frames (these become the row names)

# Every distinct gene name across all tables, in order of first appearance.
uniq_nm <- unique(
  unlist(lapply(df.list, `[[`, "external_gene_name"), use.names = FALSE)
)

finally get the desired data frame

# Build the gene-by-element table of P values: one row per name in uniq_nm,
# one column per entry of df.list, and NA where a gene is absent from a table.
#
# match() looks each gene up directly in the external_gene_name column, so a
# gene that occurs more than once within one table no longer triggers a
# duplicate row-name error (its first occurrence is used). lapply() returns a
# named list, so the columns carry names(df.list) without a setNames() patch.
local({
  pvals <- lapply(df.list, function(x) {
    x[["P.Value"]][match(uniq_nm, x[["external_gene_name"]])]
  })
  out <- data.frame(pvals, check.names = FALSE)
  # Assign row names after construction: inside data.frame(), a length-one
  # character row.names is interpreted as a column to use, not as a name.
  rownames(out) <- uniq_nm
  out
})

Output

              X3UTR        X5UTR          CDS     promCore
A2M    4.694161e-42 4.694161e-42 4.694161e-42 4.694161e-42
ABCA7  6.051330e-38 6.051330e-38 6.051330e-38 6.051330e-38
ABCB4  4.246048e-37           NA           NA           NA
ABCB9  2.898778e-36 2.898778e-36           NA 2.898778e-36
ABCD1  2.851232e-33           NA           NA           NA
AAAS             NA 1.397159e-40 1.397159e-40 1.397159e-40
AARS2            NA 4.615284e-40 4.615284e-40 4.615284e-40
AARSD1           NA           NA 1.417082e-39           NA
Andre Wildberg
  • 12,344
  • 3
  • 12
  • 29
  • Thanks, Andre. How do I rename the columns as `c("X3UTR","X5UTR","CDS","promCore")` in that function? – melolili May 04 '23 at 15:18
  • @melolili I added a way using `setNames` which might be the easiest, since we already have the names in the *df.list*. Other approaches can be with `grep` and `colnames`. – Andre Wildberg May 04 '23 at 15:29