I am using R and I want to automate this process, since I will be applying it to a large number of files. All the files have the same format (.vcf), and I want to convert each one into a small data.frame.
So far I have used:
library(dplyr)
library(tidyr)
# Read the tab-separated VCF body into a data.frame with columns V1, V2, ...
df <- read.table("DNA_rep1.vcf", sep = "\t")
# Chromosome label from the first column with "chr" removed
df[["chromosome"]] <- sub("chr", "", df[[1]])
# Split V8 into three pieces on non-alphanumeric characters, dropping the rest
df <- separate(df, V8, into = c("8.1", "8.2", "8.3"), extra = "drop")
# Keep chromosome (13), column 2, the second V8 piece (9) and column 6
df <- df[c(13, 2, 9, 6)]
# Prepend the (hard-coded) sample ID as the first column
df <- cbind(rep(29, nrow(df)), df)
Here, 29 is the sample ID; ideally it would be an argument of a function, so that I can change the sample ID each time I apply the function.
# Give the five columns their final names
names(df) <- c("sample", "chromosome", "start", "end", "segVal")
Also, it would be good to append the generated data.frame
to the previously curated data, for example, prev.df <- rbind(prev.df, df),
and so on for all the files.
QUESTION SOLVED
library(dplyr)
library(stringr)
library(tidyr)
#' Convert one copy-number VCF file into a small segment table
#'
#' Reads a tab-separated VCF file and keeps, for every record:
#' the chromosome (first column with "chr" removed), the start position
#' (column 2), the end position (second token of column 8 when that column is
#' split on runs of non-alphanumeric characters, e.g. "END=500;..." -> "500")
#' and the segment value (column 6). Every row is tagged with a sample ID
#' parsed from the file name, and the table is written to
#' "<ID>_DNA_CopyNumberVariants.csv" in the current working directory.
#'
#' The sample ID is made of all digits found in the file name (directories
#' are ignored) with the last digit dropped, e.g. "sample29_rep1.vcf" -> 29.
#'
#' @param x Path to a single VCF file.
#' @return The data.frame with columns sample, chromosome, start, end and
#'   segVal, returned invisibly so results can also be bound in memory.
vcf2cnv.df <- function(x) {
  # read.table() skips lines starting with "#" by default, so the VCF header
  # is ignored and the columns are named V1, V2, ...
  a <- read.table(x, sep = "\t")
  if (ncol(a) < 8L) {
    stop("'", x, "' has fewer than 8 tab-separated columns", call. = FALSE)
  }

  # Only show chromosome number, not "chr"
  chromosome <- sub(pattern = "chr", replacement = "", a[[1]])

  # Extract the end position: second piece of column 8. Columns are addressed
  # by their position in the raw file, so the result does not depend on how
  # many trailing (FORMAT/sample) columns the VCF has.
  info_parts <- strsplit(as.character(a[[8]]), "[^[:alnum:]]+")
  end <- vapply(
    info_parts,
    function(p) if (length(p) >= 2L) p[[2L]] else NA_character_,
    character(1)
  )

  # Extract the number from the file name to create an ID. basename() keeps
  # digits that appear in directory names out of the ID.
  digits <- gsub("[^0-9]", "", basename(x))
  y <- as.numeric(sub(".$", "", digits))
  if (is.na(y)) {
    warning(
      "could not derive a sample ID from '", basename(x),
      "' (need at least two digits in the file name)",
      call. = FALSE
    )
  }

  # Keep only the columns needed, already under their final names
  a <- data.frame(
    sample = rep(y, nrow(a)),
    chromosome = chromosome,
    start = a[[2]],
    end = end,
    segVal = a[[6]],
    stringsAsFactors = FALSE
  )

  # Save file in working directory. row.names = FALSE keeps the row index out
  # of the file, so reading it back does not create an extra "X" column.
  write.csv(a, file = paste0(y, "_DNA_CopyNumberVariants.csv"),
            row.names = FALSE)

  invisible(a)
}
## Now let's run this function on all files and combine them.
# Set the working directory; the VCF files are looked up here and
# vcf2cnv.df() writes its CSV output here as well
setwd("/my/working/directory")
# Apply the function to all the files.
# list.files() expects a regular expression, not a shell glob: "\\.vcf$"
# matches only names that end in ".vcf".
file_vcf <- list.files(pattern = "\\.vcf$", full.names = TRUE)
# Called for its side effect (one CSV per input file); invisible() keeps the
# list of return values from being auto-printed.
invisible(lapply(file_vcf, vcf2cnv.df))
# Bind all the results in a single data.frame

# Read every CSV in `files` (each with a header row) and stack them row-wise.
# Every file is read exactly once; returns NULL when `files` is empty.
bind_cnv_csv <- function(files) {
  do.call(rbind, lapply(files, read.csv, header = TRUE))
}

# Only pick up the per-sample files named "<ID>_DNA_CopyNumberVariants.csv";
# the pattern is a regular expression, not a shell glob, so unrelated CSV
# files in the directory are left alone.
file_csv <- list.files(pattern = "_DNA_CopyNumberVariants\\.csv$",
                       full.names = TRUE)
# Built from scratch on every run, so a Colon_cnv object left over from an
# earlier run is replaced rather than appended to.
Colon_cnv <- bind_cnv_csv(file_csv)