I am trying to two data frames (df_a
and df_b
) in R (essentially I want to repopulate df_a
with the updated data contained within df_b
). The columns in df_b
are all present in df_a
. Within df_b
there is (important) redundancy in ref_transcript_name
, ref_transcript_id
, and ref_gene_name
, but all values of qry_transcript_id
are unique and have a one-to-one relationship with df_a
. My assumption here is that a left_join()
would do the trick. I've tried:
df_c <- left_join(df_a, df_b)
- heredf_c
is identical todf_b
df_c <- left_join(df_a, df_b, by = "qry_transcript_id")
- heredf_c
contains the three non-guide columns ofdf_b
as new columns ofdf_c
.
I'm clearly missing something fundamental about the join functions here, but essentially I want to populate (most of) the missing values in df_a
with the values from df_b
.
Here are my data:
dput(df_a)
structure(list(ref_gene_id = c("LOC108906895", NA, NA, "LOC108906894",
"LOC108906894", "LOC108906894", "LOC108906889", "LOC108906897",
"LOC108906897", NA, "LOC108906891", "LOC108906890", "LOC108906896",
NA, "LOC108906893", NA, "LOC108906892", "LOC108905349", "LOC108905349",
"LOC108905349", "LOC108905394", "LOC108905394", "LOC108905439",
"LOC108905439", "LOC108905439", "LOC108905439", "LOC108905439",
"LOC108905439", "LOC108905439", "LOC108905439", "LOC108905439",
"LOC108905439", "LOC108905439", "LOC108905439", "LOC108905350",
"LOC108905395", "LOC108905377", "LOC108905377", "LOC108905399",
"LOC108905399", "LOC108905452", "LOC108905450", "LOC108905450",
"LOC108905450", "LOC108905425", "LOC108905427", "LOC108905429",
"LOC108905426", "LOC108905352", "LOC108905375", "LOC108905391",
NA, NA, NA), qry_gene_id = structure(c(1L, 2L, 2L, 3L, 3L, 3L,
4L, 5L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 14L, 14L,
15L, 15L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L,
16L, 17L, 18L, 19L, 19L, 20L, 20L, 21L, 23L, 23L, 23L, 22L, 24L,
27L, 25L, 26L, 28L, 29L, 30L, 31L, 32L), .Label = c("G229", "G230",
"G232", "G233", "G234", "G235", "G236", "G237", "G238", "G239",
"G240", "G241", "G242", "G243", "G244", "G245", "G246", "G247",
"G248", "G249", "G250", "G251", "G252", "G253", "G254", "G255",
"G256", "G257", "G258", "G259", "G260", "G261"), class = "factor"),
ref_gene_name = c("uncharacterized LOC108906895", NA, NA,
"uncharacterized LOC108906894", "uncharacterized LOC108906894",
"uncharacterized LOC108906894", "myosin regulatory light chain sqh",
"sushi, von Willebrand factor type A, EGF and pentraxin domain-containing protein 1",
"sushi, von Willebrand factor type A, EGF and pentraxin domain-containing protein 1",
NA, "uncharacterized LOC108906891", "protein twisted gastrulation",
"paraplegin", NA, "fork head domain-containing protein crocodile",
NA, "forkhead box protein F1-like", "centrosomal protein of 135 kDa-like",
"centrosomal protein of 135 kDa-like", "centrosomal protein of 135 kDa-like",
"nuclear transcription factor Y subunit alpha", "nuclear transcription factor Y subunit alpha",
"homeodomain-interacting protein kinase 2", "homeodomain-interacting protein kinase 2",
"homeodomain-interacting protein kinase 2", "homeodomain-interacting protein kinase 2",
"homeodomain-interacting protein kinase 2", "homeodomain-interacting protein kinase 2",
"homeodomain-interacting protein kinase 2", "homeodomain-interacting protein kinase 2",
"homeodomain-interacting protein kinase 2", "homeodomain-interacting protein kinase 2",
"homeodomain-interacting protein kinase 2", "homeodomain-interacting protein kinase 2",
"myb-like protein X", "uncharacterized LOC108905395", "uncharacterized LOC108905377",
"uncharacterized LOC108905377", "uncharacterized LOC108905399",
"uncharacterized LOC108905399", "uncharacterized LOC108905452",
"uncharacterized LOC108905450", "uncharacterized LOC108905450",
"uncharacterized LOC108905450", "uncharacterized LOC108905425",
"N-alpha-acetyltransferase 38, NatC auxiliary subunit", "cytochrome c oxidase assembly factor 6 homolog",
"N-alpha-acetyltransferase 30A", "ESF1 homolog", "atypical kinase COQ8B, mitochondrial",
"calphotin-like", NA, NA, NA), gene_annotation = c("refseq",
"novel", "novel", "refseq", "refseq", "refseq", "refseq",
"refseq", "refseq", "novel", "refseq", "refseq", "refseq",
"novel", "refseq", "novel", "refseq", "refseq", "refseq",
"refseq", "refseq", "refseq", "refseq", "refseq", "refseq",
"refseq", "refseq", "refseq", "refseq", "refseq", "refseq",
"refseq", "refseq", "refseq", "refseq", "refseq", "refseq",
"refseq", "refseq", "refseq", "refseq", "refseq", "refseq",
"refseq", "refseq", "refseq", "refseq", "refseq", "refseq",
"refseq", "refseq", "novel", "novel", "novel"), ref_transcript_id = c("XR_001964310.2",
NA, NA, "XR_001964308.1", "XR_001964308.1", "XR_001964308.1",
"XM_018710327.1", "XM_018710334.2", "XM_018710334.2", NA,
"XM_018710330.2", "XM_018710328.1", "XM_018710333.1", NA,
"XM_018710332.1", NA, "XM_018710331.1", "XM_018708179.2",
"XM_018708179.2", "XM_018708179.2", "XM_018708228.2", "XM_018708229.2",
"XM_018708292.1", "XM_018708292.1", "XM_023457437.1", "XM_018708292.1",
"XM_018708292.1", "XM_018708292.1", "XM_018708292.1", "XM_018708292.1",
"XM_018708299.1", "XM_018708292.1", "XM_018708292.1", "XM_018708292.1",
"XM_018708180.1", "XM_018708231.1", "XM_018708208.2", "XM_023453940.1",
"XM_018708235.2", "XM_018708235.2", "XM_018708321.1", "XM_018708319.1",
"XM_018708318.1", "XM_018708318.1", "XM_018708263.1", "XM_018708266.1",
"XM_018708267.1", "XM_018708265.1", "XM_018708181.2", "XM_018708205.1",
"XM_018708226.1", NA, NA, NA), qry_transcript_id = structure(c(1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L,
15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L,
28L, 29L, 30L, 32L, 27L, 31L, 33L, 34L, 35L, 36L, 37L, 38L,
39L, 40L, 41L, 43L, 44L, 45L, 42L, 46L, 49L, 47L, 48L, 50L,
51L, 52L, 53L, 54L), .Label = c("TU429", "TU430", "TU431",
"TU435", "TU436", "TU437", "TU438", "TU439", "TU440", "TU441",
"TU442", "TU443", "TU444", "TU445", "TU446", "TU447", "TU448",
"TU449", "TU450", "TU451", "TU452", "TU453", "TU454", "TU455",
"TU456", "TU457", "TU458", "TU459", "TU460", "TU461", "TU462",
"TU463", "TU464", "TU465", "TU466", "TU467", "TU468", "TU469",
"TU470", "TU471", "TU472", "TU473", "TU474", "TU475", "TU476",
"TU477", "TU478", "TU479", "TU480", "TU481", "TU482", "TU483",
"TU484", "TU485"), class = "factor"), ref_transcript_name = structure(c(30L,
NA, NA, 29L, 29L, 29L, 12L, 19L, 19L, NA, 28L, 18L, 17L,
NA, 6L, NA, 7L, 3L, 3L, 3L, 15L, 16L, 8L, 8L, 9L, 8L, 8L,
8L, 8L, 8L, 10L, 8L, 8L, 8L, 11L, 22L, 20L, 21L, 23L, 23L,
27L, 26L, 25L, 25L, 24L, 14L, 4L, 13L, 5L, 1L, 2L, NA, NA,
NA), .Label = c("atypical kinase COQ8B, mitochondrial", "calphotin-like",
"centrosomal protein of 135 kDa-like", "cytochrome c oxidase assembly factor 6 homolog, transcript variant X1",
"ESF1 homolog", "fork head domain-containing protein crocodile",
"forkhead box protein F1-like", "homeodomain-interacting protein kinase 2, transcript variant X1",
"homeodomain-interacting protein kinase 2, transcript variant X11",
"homeodomain-interacting protein kinase 2, transcript variant X8",
"myb-like protein X", "myosin regulatory light chain sqh",
"N-alpha-acetyltransferase 30A", "N-alpha-acetyltransferase 38, NatC auxiliary subunit",
"nuclear transcription factor Y subunit alpha, transcript variant X1",
"nuclear transcription factor Y subunit alpha, transcript variant X2",
"paraplegin", "protein twisted gastrulation", "sushi, von Willebrand factor type A, EGF and pentraxin domain-containing protein 1",
"uncharacterized LOC108905377, transcript variant X1", "uncharacterized LOC108905377, transcript variant X2",
"uncharacterized LOC108905395", "uncharacterized LOC108905399",
"uncharacterized LOC108905425, transcript variant X1", "uncharacterized LOC108905450, transcript variant X1",
"uncharacterized LOC108905450, transcript variant X2", "uncharacterized LOC108905452, transcript variant X2",
"uncharacterized LOC108906891", "uncharacterized LOC108906894",
"uncharacterized LOC108906895, transcript variant X2"), class = "factor"),
transcript_annotation = c("refseq", "novel", "novel", "refseq",
"refseq", "novel", "refseq", "refseq", "novel", "novel",
"novel", "novel", "refseq", "novel", "refseq", "novel", "refseq",
"refseq", "novel", "novel", "refseq", "refseq", "novel",
"novel", "refseq", "novel", "novel", "novel", "novel", "novel",
"refseq", "novel", "novel", "novel", "refseq", "refseq",
"refseq", "refseq", "refseq", "novel", "refseq", "refseq",
"refseq", "novel", "refseq", "refseq", "refseq", "novel",
"refseq", "refseq", "refseq", "novel", "novel", "novel"),
class_code = structure(c(1L, 5L, 5L, 1L, 4L, 3L, 1L, 4L,
3L, 5L, 3L, 3L, 1L, 5L, 1L, 5L, 1L, 1L, 3L, 3L, 1L, 1L, 3L,
3L, 1L, 3L, 3L, 3L, 3L, 3L, 1L, 3L, 3L, 3L, 1L, 1L, 1L, 1L,
1L, 3L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 3L, 1L, 1L, 2L, 5L, 5L,
5L), .Label = c("=", "c", "j", "k", "u"), class = "factor")), row.names = 432:485, class = "data.frame")
dput(df_b)
structure(list(qry_transcript_id = structure(1:10, .Label = c("TU118",
"TU151", "TU255", "TU417", "TU430", "TU431", "TU485", "TU543",
"TU687", "TU807"), class = "factor"), ref_transcript_name = structure(c(8L,
1L, 2L, 7L, 4L, 4L, NA, 5L, 6L, 3L), .Label = c("apoptosis-stimulating of p53 protein 1 isoform X3",
"basic proline-rich protein-like", "microtubule-associated protein 10-like",
"protein dopey homolog PFC0245c-like", "protein sprint isoform X2",
"serine/arginine repetitive matrix protein 2-like", "tigger transposable element-derived protein 1-like",
"uncharacterized protein LOC108904829"), class = "factor"), ref_transcript_id = c("XP_018563024",
"XP_023014054", "XP_019880584", "XP_018578361", "XP_024947529",
"XP_024947524", NA, "XP_030753146", "XP_018575004", "XP_023028347"
), ref_gene_name = c("* uncharacterized protein LOC108904829",
"* apoptosis-stimulating of p53 protein 1 ", "* basic proline-rich protein-like",
"* tigger transposable element-derived protein 1-like", "* protein dopey homolog PFC0245c-like",
"* protein dopey homolog PFC0245c-like", NA, "* protein sprint ",
"* serine/arginine repetitive matrix protein 2-like", "* microtubule-associated protein 10-like"
)), row.names = c(NA, 10L), class = "data.frame")
Hopefully my subsetting doesn't create problems here, but in the full data set, all qry_transcript_ids
in df_b
are contained with df_a
.