0

I have a file that contains samples, chromosomes, and their frequencies:

 a
 sample   Chr_No   frequency
 sample-1  chr1:         0
 sample-1  chr2:         0
 sample-1  chr3:         0
 sample-1  chr4:         1
 sample-1  chr5:         0
 sample-1  chr6:         0
 sample-1  chr7:         0
 sample-1  chr8:         0
 sample-1  chr9:         1
 sample-1  chr10         0
 sample-1  chr11         0
 ......

I want to convert it to a wide data frame, so I am using this in R:

 # Reshape long -> wide: one row per sample, one column per chromosome.
 # Column names are case-sensitive, so they must match the data exactly
 # (`sample` and `frequency`, not `Sample` / `Frequency`).
 b <- dcast(a, sample ~ Chr_No, value.var = "frequency", fill = 0)

This command creates a data frame, but the chromosome columns are not in numeric order.

How can I remove the ":" from Chr_No and arrange the columns as chr1, chr2, chr3, ... in the data frame?

Ronak Shah
  • 377,200
  • 20
  • 156
  • 213

2 Answers

1

First remove the colon from the names and then use mixedsort to arrange names as chr1, chr2.

library(gtools)

# Drop the colon that trails some chromosome labels ("chr1:" -> "chr1").
names(b) <- sub(":", "", names(b))

# Keep the first (id) column in place and append the remaining columns
# in natural order (chr1, chr2, ..., chr10, chr11).
chr_cols <- b[-1]
cbind(b[1], chr_cols[mixedsort(names(chr_cols))])


#    sample chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11
#1 sample-1    0    0    0    1    0    0    0    0    1     0     0

Or we can stay in base R: after removing the colon, strip the letters from the names so that only the digits remain, and order the columns by those numbers.

# Order the chromosome columns by the number embedded in their names.
# "\\D" removes every non-digit character, so this also works when a name
# still carries a trailing ":" (stripping letters only would leave "1:",
# which as.numeric() turns into NA).
cbind(b[1], b[-1][order(as.numeric(gsub("\\D", "", names(b[-1]))))])


#    sample chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11
#1 sample-1    0    0    0    1    0    0    0    0    1     0     0
Ronak Shah
  • 377,200
  • 20
  • 156
  • 213
0

Another option is to fix the order before the dcast: remove the trailing ':' from 'Chr_No' and convert the column to a factor whose levels are given in the desired order.

library(data.table)

# Strip the trailing ":" so that "chr1:" and "chr10" share one label format.
setDT(a)[, Chr_No := sub(":$", "", Chr_No)]

# Derive the factor levels from the data instead of hard-coding chr1..chr11,
# so any number of chromosomes is handled. Distinct labels are ordered by
# their numeric part; labels without digits yield NA and are placed last.
chr_levels <- unique(a$Chr_No)
chr_levels <- chr_levels[order(as.numeric(gsub("\\D", "", chr_levels)))]
a[, Chr_No := factor(Chr_No, levels = chr_levels)]

Then, do the dcast

# Cast to wide format; the column order follows the factor levels of Chr_No.
dcast(
  data = a,
  formula = sample ~ Chr_No,
  fill = 0,
  value.var = "frequency"
)
#     sample chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11
#1: sample-1    0    0    0    1    0    0    0    0    1     0     0

data

# Example input: one row per (sample, chromosome) pair. The labels for
# chr1-chr9 carry a trailing ":" while chr10 and chr11 do not, exactly as in
# the question's data.
a <- data.frame(
  sample = rep("sample-1", 11),
  Chr_No = c(paste0("chr", 1:9, ":"), "chr10", "chr11"),
  frequency = c(0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 0L),
  stringsAsFactors = FALSE
)
akrun
  • 874,273
  • 37
  • 540
  • 662