I have a list from
of data frames (dfA
and dfB
) with different number of rows:
# data frame A
IDA <- c("a", "a", "a")
Var1 <- c("1","4",".")
Var2 <- c("2"," ","8")
Var3 <- c("3","6","9")
# data frame B
IDB <- c("b", "b")
Var4 <- c("11","44")
Var5 <- c("22"," ")
Var6 <- c("33","66")
# Create data frames and check their structures
dfA <- data.frame(IDA, Var1, Var2, Var3)
is.data.frame(dfA)
dfB <- data.frame(IDB, Var4, Var5, Var6)
is.data.frame(dfB)
# Create a list of data frames
from <- list(dfA, dfB)
from
# Check its type
is.list(from)
# Read each elements of the list one by one
from[[1]]
from[[2]]
# Arrange only any single element of the list to get the desired structure:
trnsp.dfA <- t(c(t(from[[1]])))
trnsp.dfA
trnsp.dfB <- t(c(t(from[[2]])))
trnsp.dfB
But how to do this to each data frames in the list all at a time? If I understand correctly your code would return a list of rearranged data frames (in a "wide" format). Then I need to convert the list to a new data frame.
(Another issue is that all data frames in the list have similarly named variables (i.d. ID, Var1, Var2, Var3... for each df in the list). Here I can't reproduce this issue.)
Thank you.
My code is:
genSeq <- c('https://raw.githubusercontent.com/ANHIG/IMGTHLA/Latest/alignments/A_gen.txt')
# Read raw data as character vector
a <- readLines(genSeq)
# Some diagnostics
# is.vector(a)
# typeof(a)
# length(a)
# Convert vector a to data frame b
b <- as.data.frame(a, stringsAsFactors = FALSE)
# is.data.frame(b)
# typeof(a)
# length(a)
# Install some packages
install.packages("stringr")
install.packages("stringi")
install.packages("xlsx")
# Load the packages
library(stringr)
library(stringi)
library(xlsx)
# Read the lines with nucleotide sequences
bb <- b[c(9:19762),]
# Some diagnostics
# head(bb)
# tail(bb)
# length(bb)
# typeof(bb)
# is.vector(bb)
# Split lines
d <- strsplit(bb, split = "")
# Some diagnostics
# head(d)
# tail(d)
# length(d)
# typeof(d)
# is.vector(d)
# Count number of variables ( http://stackoverflow.com/a/15201478/1009306 )
max.length <- max(sapply(d, length))
# Add NA values to list elements when the lists are shorter than others
d <- lapply(d, function(x) {c(x, rep(NA, max.length-length(x)))})
# Combine all elements
do.call(rbind, d)
# Some diagnostics
# head(d)
# tail(d)
# length(d)
# typeof(d)
# is.vector(d)
# Transform matrix
dd <- t(matrix(unlist(d),ncol=length(d)))
# Some diagnostics
# head(dd)
# tail(dd)
# is.matrix(dd)
# Transform existing dd matrix into ddd data frame
ddd <- as.data.frame(dd)
# Some diagnostics
# head(ddd)
# tail(ddd)
# is.data.frame(ddd)
# typeof(ddd)
# length(ddd)
# class(ddd)
# str(ddd)
# names(ddd)
# nrow(ddd)
# ncol(ddd)
# summary(ddd)
# Add new variable allel by concatenating values in existing variables V1...v19
ddd <- transform(ddd, allel = paste0(ddd$V1, ddd$V2, ddd$V3, ddd$V4, ddd$V5, ddd$V6, ddd$V7, ddd$V8, ddd$V9, ddd$V10, ddd$V11, ddd$V12, ddd$V13, ddd$V14, ddd$V15, ddd$V16, ddd$V17, ddd$V18, ddd$V19, sep = " "))
# Some diagnostics
# names(ddd)
# Reorder variable allel to be the first
new_ordered <- ddd[c(length(ddd),c(1:(length(ddd)-1)))]
# Some diagnostics
# names(new_ordered)
# ncol(new_ordered)
# Remove unnecessary variables V1...V19
new_ordered$V1 <- NULL
new_ordered$V2 <- NULL
new_ordered$V3 <- NULL
new_ordered$V4 <- NULL
new_ordered$V5 <- NULL
new_ordered$V6 <- NULL
new_ordered$V7 <- NULL
new_ordered$V8 <- NULL
new_ordered$V9 <- NULL
new_ordered$V10 <- NULL
new_ordered$V11 <- NULL
new_ordered$V12 <- NULL
new_ordered$V13 <- NULL
new_ordered$V14 <- NULL
new_ordered$V15 <- NULL
new_ordered$V16 <- NULL
new_ordered$V17 <- NULL
new_ordered$V18 <- NULL
new_ordered$V19 <- NULL
# Some diagnostics
# ncol(new_ordered)
# nrow(new_ordered)
# Remove rows containing NA ( http://stackoverflow.com/q/8005154/1009306 )
new_ordered <- subset(new_ordered, !(V50 == "NA" & V100 == "NA"))
# Some diagnostics
# head(new_ordered)
# ncol(new_ordered)
# nrow(new_ordered)
# Shrink whitespaces in allel names with the help of library(stringr)'s function:
new_ordered$allel <- gsub(" ", "", new_ordered$allel)
# The list of unique allels accordingly to LL*NN:NN(NL) template
#####
# Sort new_ordered data frame in an ascending order by allel variable
new_odrd_srtd <- new_ordered[order(new_ordered$allel),]
# Some diagnostics
# head(new_odrd_srtd)
# typeof(new_odrd_srtd)
# is.data.frame(new_odrd_srtd)
# The list of unique allel names
unique.allels <- unique(new_odrd_srtd$allel)
# Let the list to be a character vector
unique.allels <- as.character(unique.allels)
# Show them:
# unique.allels
# Their number is:
# length(unique.allels)
# Export them into MS Excel workbook:
# write.xlsx(unique.allels, file="d:/hla.xlsx", sheetName="01 unique.allels", append=TRUE)
# Extract the part of an allel name considering specific HLA protein only: LL*NN:NN(NL).
# The final point for the pattern of interest is cleared at http://r.789695.n4.nabble.com/Extract-part-of-string-tp4683108p4683111.html
specific.HLA.protein <- unique(gsub("^.*(\\A\\*[0-9A-Za-z]*\\:[0-9A-Za-z]*).*$", "\\1", unique.allels))
# Show them:
# specific.HLA.protein
# Their number is:
# length(specific.HLA.protein)
# Export them into _the same_ MS Excel workbook
# write.xlsx(specific.HLA.protein, file="d:/hla.xlsx", sheetName="02 specific.HLA.protein", append=TRUE)
##################################################################################
# Plan
#
# convert multiple rows per subject into single row
# Create data frame with these long rows
# Concatenate values of each variable into corresponding single cells of a new row
#
#
# Example for http://stackoverflow.com/q/42711357
#####
# data frame A
IDA <- c("a", "a", "a")
Var1 <- c("1","4",".")
Var2 <- c("2"," ","8")
Var3 <- c("3","6","9")
# data frame B
IDB <- c("b", "b")
Var4 <- c("11","44")
Var5 <- c("22"," ")
Var6 <- c("33","66")
# Create data frames and check their structures
dfA <- data.frame(IDA, Var1, Var2, Var3)
is.data.frame(dfA)
dfB <- data.frame(IDB, Var4, Var5, Var6)
is.data.frame(dfB)
# Create a list of data frames
from <- list(dfA, dfB)
from
# Check its type
is.list(from)
# Read each elements of the list one by one
from[[1]]
from[[2]]
# Arrange only any single element of the list to get the desired structure:
trnsp.dfA <- t(c(t(from[[1]])))
trnsp.dfA
trnsp.dfB <- t(c(t(from[[2]])))
trnsp.dfB
l2 <- lapply(from, function(i) t(c(t(i))))
l2 <- lapply(l2, `length<-`, max(lengths(l2)))
new_df <- setNames(data.frame(do.call(rbind, l2)), c('ID', paste0('Var', seq(max(lengths(l2))-1))))
new_df
# Some diagnostics
diagnostic <- new_df
head(diagnostic)
tail(diagnostic)
is.data.frame(diagnostic)
typeof(diagnostic)
length(diagnostic)
class(diagnostic)
str(diagnostic)
names(diagnostic)
nrow(diagnostic)
ncol(diagnostic)
summary(diagnostic)
##################################################################################
# End of Example
# Select strings only for A*01:01:01:01 allel
new_odrd_srtd_sbst <- subset(new_odrd_srtd, grepl("A\\*01:01:01*\\:*[0-9A-Za-z]", allel) )
# A regular expression for the pattern with spaces plus extra info:
# new_odrd_srtd_sbst <- subset(new_odrd_srtd, grepl("^.*(\\A\\*[0-9A-Za-z]*\\:0[1-2]).*$", allel) )
head(new_odrd_srtd_sbst)
unique(new_odrd_srtd_sbst$allel)
# Add new vaiable allelGroup_specific.HLA.protein by copying values in existing variable allel
new_odrd_srtd_sbst <- transform(new_odrd_srtd_sbst, allelGroup_specific.HLA.protein = paste0(new_odrd_srtd_sbst$allel))
# Reorder variables
new_odrd_srtd_sbst_added_ordrd <- new_odrd_srtd_sbst[c(length(new_odrd_srtd_sbst), c(1:(length(new_odrd_srtd_sbst)-1)))]
# Extract the part of an allel name considering specific HLA protein only: A*NN:NN(NL).
# The final point for the pattern of interest is cleared here: http://r.789695.n4.nabble.com/Extract-part-of-string-tp4683108p4683111.html
new_odrd_srtd_sbst_added_ordrd$allelGroup_specific.HLA.protein <- gsub("^.*(\\A\\*[0-9A-Za-z]*\\:[0-9A-Za-z]*).*$", "\\1", new_odrd_srtd_sbst_added_ordrd$allelGroup_specific.HLA.protein)
# Diagnostic
is.data.frame(new_odrd_srtd_sbst_added_ordrd)
typeof(new_odrd_srtd_sbst_added_ordrd)
# Split dataframe into a list of data frames based on a value in allel variable
# http://stackoverflow.com/q/18527051
ndf <- split(new_odrd_srtd_sbst_added_ordrd, new_odrd_srtd_sbst_added_ordrd$allel)
ndf[[1]][1:36,1:25]
# Diagnostic
is.data.frame(ndf)
typeof(ndf)
class(ndf)
length(ndf)
# From this step I fail to step further...