2

I have a data frame:

> head(bp_data)
event bp_no  sample chrom      bp       gene    feature type length          id fpkm
1   bp1 A373R11    2L 2425901 intergenic intergenic  INV    0.1        <NA>    0
1   bp2 A373R11    2L 2426025 intergenic intergenic  INV    0.1        <NA>    0
3   bp1 A373R11    2L 6694426        Tsp     intron  INV    0.1 FBgn0031850    0
3   bp2 A373R11    2L 6694566        Tsp     intron  INV    0.1 FBgn0031850    0
6   bp1 A373R11    2R 8387755       pdm3     intron  INV    0.2 FBgn0261588    0
6   bp2 A373R11    2R 8387927       pdm3     exon_2  INV    0.2 FBgn0261588    0

structure(list(event = c(1L, 1L, 3L, 3L, 6L, 6L), bp_no = structure(c(1L, 
2L, 1L, 2L, 1L, 2L), .Label = c("bp1", "bp2"), class = "factor"), 
    sample = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("A373R11", 
    "A373R13", "A373R3", "A373R5", "A373R9", "A512R19", "A512R21", 
    "A512R23", "A573R25", "A573R27", "A573R29", "A573R31", "A573R33", 
    "B241R35", "B241R37", "B241R39", "B241R41a", "B241R43", "B241R45", 
    "B241R51", "B241R53", "B241R55", "B241R57", "B241R59", "B241R61", 
    "B241R63", "HUM-1", "HUM-4", "HUM-7"), class = "factor"), 
    chrom = structure(c(1L, 1L, 1L, 1L, 2L, 2L), .Label = c("2L", 
    "2R", "3L", "3R", "X", "Y"), class = "factor"), bp = c(2425901L, 
    2426025L, 6694426L, 6694566L, 8387755L, 8387927L), gene = structure(c(67L, 
    67L, 114L, 114L, 92L, 92L), .Label = c("5-HT7", "Ankle2", 
    "Arpc3B", "Atac3", "B4", "be", "bru3", "CalpB", "CanA1", 
    "CG12081", "CG12535", "CG13024", "CG13991", "CG1632", "CG17211", 
    "CG32121", "CG32191", "CG32447", "CG32549", "CG34356", "CG3520", 
    "CG3655", "CG4116", "CG42238", "CG42321", "CG42404", "CG43707", 
    "CG44838", "CG45002", "CG45263", "CG5004", "CG5535", "CG5910", 
    "CG6707", "CG6907", "CG6959", "CG7720", "CG7878", "CG8213", 
    "CG8216", "CG8861", "CG9416", "CG9821", "CG9837", "Cpr", 
    "CR32773", "CR44173", "CR44181", "CR44602", "CR44886", "CR45161", 
    "CR45814", "dco", "DIP-alpha", "DNApol-epsilon255", "dnc", 
    "dpr13", "dpr8", "ed", "elg1", "Fim", "Frl", "heph", "Hers", 
    "Hs6st", "Hsromega", "intergenic", "inv", "jp", "kirre", 
    "kkv", "klar", "kuz", "Lim1", "lola", "magu", "mamo", "Map205", 
    "mars", "mask", "mbl", "mnd", "Mnt", "mor", "N", "nAChRalpha3", 
    "nAChRbeta1", "nahoda", "Nhe2", "nvy", "Octbeta3R", "pdm3", 
    "Phax", "pico", "Pif1A", "Poxm", "pros", "Pzl", "rdx", "rhea", 
    "RhoGEF3", "Rim2", "Scp1", "Shab", "Slc45-1", "sm", "Snoo", 
    "Sox100B", "SPR", "Su(var)2-10", "Syn", "tefu", "Treh", "Tsp", 
    "TwdlJ", "TwdlK", "Upf3", "Vps52", "w", "wcy", "wdb", "WDY", 
    "Yeti"), class = "factor"), feature = structure(c(16L, 16L, 
    17L, 17L, 17L, 9L), .Label = c("3UTR", "5UTR", "CDS", "exon", 
    "exon_1", "exon_12", "exon_15", "exon_17", "exon_2", "exon_3", 
    "exon_4", "exon_5", "exon_6", "exon_7", "exon_9", "intergenic", 
    "intron", "ncRNA"), class = "factor"), type = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L), .Label = c("INV", "DEL", "TRA", "DUP", 
    "BND", "TANDUP"), class = "factor"), length = c(0.1, 0.1, 
    0.1, 0.1, 0.2, 0.2), id = structure(c(NA, NA, 46L, 46L, 98L, 
    98L), .Label = c("FBgn0000038", "FBgn0000479", "FBgn0000547", 
    "FBgn0001234", "FBgn0001269", "FBgn0001311", "FBgn0001316", 
    "FBgn0002413", "FBgn0002645", "FBgn0002778", "FBgn0002783", 
    "FBgn0003129", "FBgn0003435", "FBgn0003612", "FBgn0003748", 
    "FBgn0003996", "FBgn0004573", "FBgn0004575", "FBgn0004595", 
    "FBgn0004647", "FBgn0005636", "FBgn0010015", "FBgn0011224", 
    "FBgn0015519", "FBgn0015623", "FBgn0020908", "FBgn0023215", 
    "FBgn0023407", "FBgn0024238", "FBgn0024288", "FBgn0025866", 
    "FBgn0026411", "FBgn0027492", "FBgn0028343", "FBgn0028369", 
    "FBgn0029649", "FBgn0029657", "FBgn0029768", "FBgn0030027", 
    "FBgn0030053", "FBgn0030812", "FBgn0031359", "FBgn0031710", 
    "FBgn0031711", "FBgn0031785", "FBgn0031850", "FBgn0032129", 
    "FBgn0032414", "FBgn0033358", "FBgn0033359", "FBgn0033380", 
    "FBgn0033845", "FBgn0034286", "FBgn0034438", "FBgn0034797", 
    "FBgn0034859", "FBgn0034923", "FBgn0035968", "FBgn0036058", 
    "FBgn0036574", "FBgn0036665", "FBgn0036764", "FBgn0036993", 
    "FBgn0037549", "FBgn0037635", "FBgn0037636", "FBgn0037676", 
    "FBgn0037956", "FBgn0038652", "FBgn0038755", "FBgn0039439", 
    "FBgn0039440", "FBgn0040297", "FBgn0040397", "FBgn0043884", 
    "FBgn0045035", "FBgn0052121", "FBgn0052191", "FBgn0052343", 
    "FBgn0052447", "FBgn0052529", "FBgn0052549", "FBgn0052594", 
    "FBgn0052600", "FBgn0052773", "FBgn0052791", "FBgn0065032", 
    "FBgn0085385", "FBgn0085450", "FBgn0250867", "FBgn0250910", 
    "FBgn0259221", "FBgn0259823", "FBgn0259984", "FBgn0260442", 
    "FBgn0260748", "FBgn0261015", "FBgn0261588", "FBgn0261811", 
    "FBgn0262169", "FBgn0262593", "FBgn0263846", "FBgn0264001", 
    "FBgn0264326", "FBgn0264493", "FBgn0264707", "FBgn0265062", 
    "FBgn0265070", "FBgn0265487", "FBgn0265813", "FBgn0266101", 
    "FBgn0266180", "FBgn0266354", "FBgn0266654", "FBgn0266801", 
    "FBgn0267033", "FBgn0267398", "FBgn0267430", "FBgn0267449", 
    "FBgn0267464", "FBgn0267795", "FBgn0283521"), class = "factor"), 
    fpkm = c(0, 0, 0, 0, 0, 0)), .Names = c("event", "bp_no", 
"sample", "chrom", "bp", "gene", "feature", "type", "length", 
"id", "fpkm"), row.names = c(NA, 6L), class = "data.frame")

Each event per sample has two bp values (bp1 and bp2), which are currently on separate rows, e.g.:

event bp_no  sample chrom      bp       gene    feature type length          id fpkm
1   bp1 A373R11    2L 2425901 intergenic intergenic  INV    0.1        <NA>    0
1   bp2 A373R11    2L 2426025 intergenic intergenic  INV    0.1        <NA>    0

I would like to combine the bp1 and bp2 values for each observation into a new data frame, e.g.:

event   sample  chrom    bp1    bp2    bp1_gene    bp2_gene    bp1_feature    bp2_feature    type    length    id    fpkm
1    A373R11    2L    2425901    2426025    intergenic    intergenic    intergenic    intergenic    INV    0.1    <NA>    0

Can anyone suggest a method to do this?

fugu
  • 6,417
  • 5
  • 40
  • 75
  • Should just be a basic long to wide reshape - `reshape(dat, idvar=c("event","sample","chrom"), timevar="bp_no", direction="wide")` – thelatemail Oct 26 '17 at 09:57
  • @thelatemail - `reshape(bp_data, idvar=c("event", "sample"), timevar="bp_no", direction="wide")` does the trick - thanks! – fugu Oct 26 '17 at 10:03

1 Answer

2

Using dplyr and tidyr

# Reshape the long breakpoint table to one row per event/sample/chrom.
#
# pivot_wider() replaces the superseded gather()/spread() round trip. That
# approach stacked every value column into a single column, which coerced
# everything to character, so bp, length and fpkm came back as strings.
# Here every widened column keeps its original type.
#
# The resulting column names are the same bp1_*/bp2_* names as before, but
# they are grouped by variable (bp1_bp, bp2_bp, bp1_gene, bp2_gene, ...)
# instead of being sorted alphabetically.
library(dplyr)  # provides %>%
library(tidyr)  # provides pivot_wider() (tidyr >= 1.0.0)

bp_data %>%
  pivot_wider(
    # Columns that identify one observation (one output row each).
    id_cols = c(event, sample, chrom),
    # bp_no ("bp1"/"bp2") supplies the prefix of each new column.
    names_from = bp_no,
    # Widen every remaining column, as the original gather() call did.
    values_from = -c(event, bp_no, sample, chrom),
    # Name new columns <breakpoint>_<variable>, e.g. bp1_gene, bp2_gene.
    names_glue = "{bp_no}_{.value}"
  )

# event  sample chrom  bp1_bp bp1_feature bp1_fpkm   bp1_gene      bp1_id bp1_length bp1_type  bp2_bp bp2_feature bp2_fpkm   bp2_gene      bp2_id bp2_length bp2_type
# 1     1 A373R11    2L 2425901  intergenic        0 intergenic        <NA>        0.1      INV 2426025  intergenic        0 intergenic        <NA>        0.1      INV
# 2     3 A373R11    2L 6694426      intron        0        Tsp FBgn0031850        0.1      INV 6694566      intron        0        Tsp FBgn0031850        0.1      INV
# 3     6 A373R11    2R 8387755      intron        0       pdm3 FBgn0261588        0.2      INV 8387927      exon_2        0       pdm3 FBgn0261588        0.2      INV
Robin Gertenbach
  • 10,316
  • 3
  • 25
  • 37