0

I am attempting to learn/use dplyr in R and I am having a few issues with how to create a custom table concatenation.

In short, I have a table that looks like so:

structure(list(bamfile = structure(c(2L, 1L, 4L, 3L, 6L), .Label = c("CZ_25bL001s.bam", 
"CZ_25L001s.bam", "CZ_26bL001s.bam", "CZ_26L001s.bam", "CZ_27bL001s.bam", 
"CZ_27L001s.bam", "CZ_28bL001s.bam", "CZ_28L001s.bam", "CZ_29bL001s.bam", 
"CZ_29L001s.bam", "CZ_30bL001s.bam", "CZ_30L001s.bam", "CZ_31bL001s.bam", 
"CZ_31L001s.bam", "CZ_32bL001s.bam", "CZ_32L001s.bam", "CZ_33bL001s.bam", 
"CZ_33L001s.bam", "CZ_34bL001s.bam", "CZ_34L001s.bam", "CZ_35bL001s.bam", 
"CZ_35L001s.bam", "CZ_36bL001s.bam", "CZ_36L001s.bam"), class = "factor"), 
    directory = structure(c(1L, 1L, 1L, 1L, 1L), .Label = "TestDirectory/DataFolder", class = "factor"), 
    Short.name = structure(1:5, .Label = c("CZ_25", "CZ_25b", 
    "CZ_26", "CZ_26b", "CZ_27", "CZ_27b", "CZ_28", "CZ_28b", 
    "CZ_29", "CZ_29b", "CZ_30", "CZ_30b", "CZ_31", "CZ_31b", 
    "CZ_32", "CZ_32b", "CZ_33", "CZ_33b", "CZ_34", "CZ_34b", 
    "CZ_35", "CZ_35b", "CZ_36", "CZ_36b"), class = "factor"), 
    Targeting.type = structure(c(1L, 1L, 1L, 1L, 1L), .Label = "single", class = "factor"), 
    sgRNA1 = structure(c(1L, 4L, 6L, 7L, 7L), .Label = c("guide_16", 
    "guide_2", "guide_21", "guide_22", "guide_6", "guide_76", 
    "guide_83"), class = "factor"), sgRNA2 = c(NA, NA, NA, NA, 
    NA), Group = structure(c(1L, 1L, 1L, 1L, 1L), .Label = "CZ", class = "factor")), .Names = c("bamfile", 
"directory", "Short.name", "Targeting.type", "sgRNA1", "sgRNA2", 
"Group"), row.names = c(NA, 5L), class = "data.frame")

What I would like to do is take a search function, iterate through the "sgRNA1" column, and then, for each guide, create a new column for every "bamfile" whose row has that same guide.

The final table would thus contain a guide_XX column, followed by an n() column giving the number of entries, and then an individual column for each .bam file that was found. Each row would then correspond to the next guide_XX sample that was iterated over.

This would be an example table of the output:

structure(list(sgRNA1 = structure(1:6, .Label = c("guide_16", 
"guide_2", "guide_21", "guide_22", "guide_6", "guide_76", "guide_83"
), class = "factor"), Count = c(4L, 2L, 5L, 4L, 2L, 1L), bam1 = structure(c(2L, 
7L, 5L, 1L, 6L, 4L), .Label = c("CZ_25bL001s.bam", "CZ_25L001s.bam", 
"CZ_26bL001s.bam", "CZ_26L001s.bam", "CZ_29bL001s.bam", "CZ_30bL001s.bam", 
"CZ_30L001s.bam"), class = "factor"), bam2 = structure(c(3L, 
6L, 5L, 4L, 7L, 1L), .Label = c("", "CZ_27L001s.bam", "CZ_28bL001s.bam", 
"CZ_29L001s.bam", "CZ_31L001s.bam", "CZ_33bL001s.bam", "CZ_34L001s.bam"
), class = "factor"), bam3 = structure(c(4L, 1L, 5L, 3L, 1L, 
1L), .Label = c("", "CZ_27bL001s.bam", "CZ_32bL001s.bam", "CZ_32L001s.bam", 
"CZ_33L001s.bam"), class = "factor"), bam4 = structure(c(4L, 
1L, 3L, 5L, 1L, 1L), .Label = c("", "CZ_28L001s.bam", "CZ_34bL001s.bam", 
"CZ_35bL001s.bam", "CZ_36L001s.bam"), class = "factor"), bam5 = structure(c(1L, 
1L, 3L, 1L, 1L, 1L), .Label = c("", "CZ_31bL001s.bam", "CZ_36bL001s.bam"
), class = "factor"), bam6 = structure(c(1L, 1L, 1L, 1L, 1L, 
1L), .Label = c("", "CZ_35L001s.bam"), class = "factor")), .Names = c("sgRNA1", 
"Count", "bam1", "bam2", "bam3", "bam4", "bam5", "bam6"), row.names = c(NA, 
6L), class = "data.frame")

Thank you in advance! I'm looking forward to getting to know dplyr a bit better.

rleenay
  • 11
  • 1
  • You need [to make your example reproducible](https://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example/5963610#5963610). – alistaire Oct 20 '17 at 15:41
  • Please share your data in a copy/pastable way instead of a picture. `dput()` is useful for this - [see here for a complete walkthrough](https://stackoverflow.com/q/5963269/903061). Also, please show a small sample of your desired output rather than just describing it. I don't know what you mean by "take a search function", nor where the `guide_XX` values come from. But I think you essentially want `reshape2::dcast(your_data, sgRNA1 ~ bamfile)`. – Gregor Thomas Oct 20 '17 at 15:43
  • I've updated the code with dput(), thank you! – rleenay Oct 20 '17 at 15:51
  • @Gregor your code is *close* to what I wanted my desired output to be. The only change would be that the column would not be specific to a certain variable/cell name, but be a general "1st cell that matches up", the second column would be a "2nd cell that matches up", with all of the columns after that reading "NA" if there are no more references – rleenay Oct 20 '17 at 16:05
  • Gotcha. You'll need to add a `guide_xx` column to the original data and then you can do `dcast(your_data, sgRNA1 ~ guide_xx, value.var = "bamfile")`. I don't have time to write a full answer now - maybe you can get it working based on that or maybe someone else will step in. You can add the `n` column in beforehand and use it in the cast as well, `sgRNA1 + n ~ guide_xx, ...` – Gregor Thomas Oct 20 '17 at 16:29

0 Answers0