0

I have copy number data in long format, as shown below, in which each sample has its own copy number values (segVal) over its own genomic ranges:

> head(long)
   chromosome     start       end segVal                       sample
1:       chr1   3218923 116319008      2 TCGA-05-4417-01A-22D-1854-01
2:       chr1 116324707 120523902      1 TCGA-05-4417-01A-22D-1854-01
3:       chr1 149879545 247812431      4 TCGA-05-4417-01A-22D-1854-01
4:       chr1   3218923 104393357      2 TCGA-06-0644-01A-02D-0310-01
5:       chr1 104418619 149879545      1 TCGA-06-0644-01A-02D-0310-01
6:       chr1 149885583 247812431      2 TCGA-06-0644-01A-02D-0310-01

How can I convert this to wide format, so that each sample becomes a column holding its values (the genomic ranges would then have to be common to all samples, if I am not wrong), like this:

> head(wide)
 chr     start       end TCGA-05-4417-01A-22D-1854-01 TCGA-06-0644-01A-02D-0310-01 TCGA-06-0644-01A-02D-0310-01
 chr1  24254002  24291000          2          2         2
 chr3  47421002  49068000          1          0         0
 chr4  69204002  70320000          0          0         1
 chr5  58263002  59785000          0          1         1
 chr6  29010002  33287000          2          2         2
 chr7 110240002 111354000          0          0         0
>
Mussa
  • 117
  • 7

1 Answer

1

Does this work for you?

# Reshape per-sample copy-number segments from long to wide format:
# one column per sample, one row per distinct (chromosome, start, end).
library(tidyr)

# Penalise scientific notation so large genomic coordinates print in full.
options(scipen = 999)

# Example input: three chr1 segments for each of two TCGA samples.
df <- data.frame(
  chromosome = rep("chr1", 6L),
  start = c(
    3218923L, 116324707L, 149879545L,
    3218923L, 104418619L, 149885583L
  ),
  end = c(
    116319008L, 120523902L, 247812431L,
    104393357L, 149879545L, 247812431L
  ),
  segVal = c(2L, 1L, 4L, 2L, 1L, 2L),
  sample = rep(
    c("TCGA-05-4417-01A-22D-1854-01", "TCGA-06-0644-01A-02D-0310-01"),
    each = 3L
  ),
  row.names = paste0(1:6, ":"),
  stringsAsFactors = FALSE
)

# Spread segVal into one column per sample. The remaining columns
# (chromosome, start, end) identify a row; a sample with no segment at
# exactly that range is filled with 0.
df <- pivot_wider(
  df,
  names_from = sample,
  values_from = segVal,
  values_fill = 0
)

#> # A tibble: 6 x 5
#>   chromosome    start      end `TCGA-05-4417-01A-22D-1… `TCGA-06-0644-01A-02D-0…
#>   <chr>         <int>    <int>                    <int>                    <int>
#> 1 chr1         3218923   116319008                    2                        0                        
#> 2 chr1         116324707 120523902                    1                        0
#> 3 chr1         149879545 247812431                    4                        0
#> 4 chr1         3218923   104393357                    0                        2
#> 5 chr1         104418619 149879545                    0                        1
#> 6 chr1         149885583 247812431                    0                        2

Created on 2020-08-24 by the reprex package (v0.3.0)

Eric
  • 2,699
  • 5
  • 17
  • 1
    You can specify how to fill in missing values using: `pivot_wider(names_from = sample, values_from = segVal, values_fill = 0)` – Dealec Aug 24 '20 at 18:41