3

I have a list `from` of data frames (dfA and dfB) with different numbers of rows:

# Column vectors for data frame A (three rows)
IDA <- rep("a", 3)
Var1 <- c("1", "4", ".")
Var2 <- c("2", " ", "8")
Var3 <- c("3", "6", "9")

# Column vectors for data frame B (two rows)
IDB <- rep("b", 2)
Var4 <- c("11", "44")
Var5 <- c("22", " ")
Var6 <- c("33", "66")

# Assemble both data frames and confirm that each one is a data frame
dfA <- data.frame(IDA = IDA, Var1 = Var1, Var2 = Var2, Var3 = Var3)
is.data.frame(dfA)
dfB <- data.frame(IDB = IDB, Var4 = Var4, Var5 = Var5, Var6 = Var6)
is.data.frame(dfB)

# Collect the data frames in a list and print it
from <- list(dfA, dfB)
from

# Confirm that the container is a list
is.list(from)

# Print the list elements one at a time
from[[1]]
from[[2]]

# Desired structure, produced for one element at a time:
# t() turns the data frame into a matrix with one column per original row, so
# filling a one-row matrix from it lists the cells of row 1, then row 2, ...
trnsp.dfA <- matrix(t(from[[1]]), nrow = 1)
trnsp.dfA
trnsp.dfB <- matrix(t(from[[2]]), nrow = 1)
trnsp.dfB

But how can I do this for all data frames in the list at once? If I understand correctly, your code would return a list of rearranged data frames (in a "wide" format). Then I need to convert the list to a new data frame.

(Another issue is that all data frames in the list have identically named variables (i.e. ID, Var1, Var2, Var3... for each df in the list). I can't reproduce this issue here.)

Thank you.

My code is:

# URL of the alignment file A_gen.txt in the ANHIG/IMGTHLA GitHub repository
genSeq <- 'https://raw.githubusercontent.com/ANHIG/IMGTHLA/Latest/alignments/A_gen.txt'

# Download the file; readLines() returns one string per line
a <- readLines(con = genSeq)

# Some diagnostics
# is.vector(a)
# typeof(a)
# length(a)

# Wrap the lines in a one-column data frame (column "a"), kept as character
b <- data.frame(a = a, stringsAsFactors = FALSE)
# is.data.frame(b)
# typeof(a)
# length(a)

# Install the required packages only when they are not available yet.
# An unconditional install.packages() would download and reinstall every
# package each time the script is run.
for (pkg in c("stringr", "stringi", "xlsx")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}

# Load the packages
library(stringr)
library(stringi)
library(xlsx)

# Read the lines with nucleotide sequences.
# b has a single column, so this row subset comes back as a plain vector.
# NOTE(review): the row range 9:19762 is hard-coded -- confirm that it still
# matches the length of the downloaded file.
bb <- b[9:19762, ]

# Some diagnostics
# head(bb)
# tail(bb)
# length(bb)
# typeof(bb)
# is.vector(bb)

# Split every line into single characters (one list element per line)
d <- strsplit(bb, split = "")

# Count number of variables ( http://stackoverflow.com/a/15201478/1009306 )
max.length <- max(lengths(d))

# Pad the shorter elements with NA so that every element has max.length entries
d <- lapply(d, function(x) c(x, rep(NA, max.length - length(x))))

# Some diagnostics
# head(d)
# tail(d)
# length(d)
# typeof(d)
# is.vector(d)

# Stack the padded vectors row-wise: one row per line, one column per
# character position. The result is assigned directly; previously the rbind
# result was thrown away (and printed in full) and the same matrix was then
# rebuilt through unlist() / matrix() / t().
dd <- do.call(rbind, d)

# Some diagnostics
# head(dd)
# tail(dd)
# is.matrix(dd)

# Transform existing dd matrix into ddd data frame
ddd <- as.data.frame(dd)

# Some diagnostics
# head(ddd)
# tail(ddd)
# is.data.frame(ddd)
# typeof(ddd)
# length(ddd)
# class(ddd)
# str(ddd)
# names(ddd)
# nrow(ddd)
# ncol(ddd)
# summary(ddd)

# Add new variable allel by concatenating the values of V1...V19 row by row.
# paste0() has no `sep` argument, so the former `sep = " "` was simply pasted
# on as one more string; it is dropped here. Blanks are removed from allel at
# the end of this section anyway, so the final values are unchanged.
allel_cols <- paste0("V", 1:19)
ddd$allel <- do.call(paste0, unname(as.list(ddd[allel_cols])))

# Some diagnostics
# names(ddd)

# Put allel first and leave out V1...V19, which are now redundant
kept_cols <- setdiff(names(ddd), c("allel", allel_cols))
new_ordered <- ddd[c("allel", kept_cols)]

# Some diagnostics
# ncol(new_ordered)
# nrow(new_ordered)

# Remove rows in which both V50 and V100 are missing
# ( http://stackoverflow.com/q/8005154/1009306 ).
# Missing cells are real NA values, not the string "NA", so they are tested
# with is.na(); a comparison such as V50 == "NA" yields NA for them.
new_ordered <- subset(new_ordered, !(is.na(V50) & is.na(V100)))

# Some diagnostics
# head(new_ordered)
# ncol(new_ordered)
# nrow(new_ordered)


# Remove all blanks from the allel names
new_ordered$allel <- gsub(" ", "", new_ordered$allel)




# The list of unique allels according to the LL*NN:NN(NL) template
#####

# new_ordered with its rows arranged in ascending order of allel
new_odrd_srtd <- new_ordered[with(new_ordered, order(allel)), ]

# Some diagnostics
# head(new_odrd_srtd)
# typeof(new_odrd_srtd)
# is.data.frame(new_odrd_srtd)

# Distinct allel names, as a character vector
unique.allels <- as.character(unique(new_odrd_srtd$allel))

# Show them:
# unique.allels

# Their number is:
# length(unique.allels)

# Export them into MS Excel workbook:
# write.xlsx(unique.allels, file="d:/hla.xlsx", sheetName="01 unique.allels", append=TRUE)

# Keep only the part of each allel name captured by the pattern below
# (specific HLA protein: LL*NN:NN(NL)), then drop duplicates.
# The final point for the pattern of interest is cleared at http://r.789695.n4.nabble.com/Extract-part-of-string-tp4683108p4683111.html
specific.HLA.protein <- unique(
  gsub(
    pattern = "^.*(\\A\\*[0-9A-Za-z]*\\:[0-9A-Za-z]*).*$",
    replacement = "\\1",
    x = unique.allels
  )
)

# Show them:
# specific.HLA.protein

# Their number is:
# length(specific.HLA.protein)

# Export  them into _the same_ MS Excel workbook
# write.xlsx(specific.HLA.protein, file="d:/hla.xlsx", sheetName="02 specific.HLA.protein", append=TRUE)













##################################################################################
# Plan
#
# convert multiple rows per subject into single row
# Create data frame with these long rows
# Concatenate values of each variable into corresponding single cells of a new row
#
#
# Example for http://stackoverflow.com/q/42711357
#####

# Column vectors for data frame A (three rows)
IDA <- rep("a", 3)
Var1 <- c("1", "4", ".")
Var2 <- c("2", " ", "8")
Var3 <- c("3", "6", "9")

# Column vectors for data frame B (two rows)
IDB <- rep("b", 2)
Var4 <- c("11", "44")
Var5 <- c("22", " ")
Var6 <- c("33", "66")

# Assemble both data frames and confirm that each one is a data frame
dfA <- data.frame(IDA = IDA, Var1 = Var1, Var2 = Var2, Var3 = Var3)
is.data.frame(dfA)
dfB <- data.frame(IDB = IDB, Var4 = Var4, Var5 = Var5, Var6 = Var6)
is.data.frame(dfB)

# Collect the data frames in a list and print it
from <- list(dfA, dfB)
from

# Confirm that the container is a list
is.list(from)

# Print the list elements one at a time
from[[1]]
from[[2]]

# Desired structure, produced for one element at a time:
# t() turns the data frame into a matrix with one column per original row, so
# filling a one-row matrix from it lists the cells of row 1, then row 2, ...
trnsp.dfA <- matrix(t(from[[1]]), nrow = 1)
trnsp.dfA
trnsp.dfB <- matrix(t(from[[2]]), nrow = 1)
trnsp.dfB


# The same flattening applied to every data frame in the list
l2 <- lapply(from, function(df) matrix(t(df), nrow = 1))

# Extend every flattened element to the length of the longest one (NA-padded)
n_cells <- max(lengths(l2))
l2 <- lapply(l2, function(cells) {
  length(cells) <- n_cells
  cells
})

# One row per original data frame; the first column is labelled ID and the
# remaining ones Var1, Var2, ...
new_df <- data.frame(do.call(rbind, l2))
names(new_df) <- c('ID', paste0('Var', seq(n_cells - 1)))
new_df


# Some diagnostics
diagnostic <- new_df
head(diagnostic)
tail(diagnostic)
is.data.frame(diagnostic)
typeof(diagnostic)
length(diagnostic)
class(diagnostic)
str(diagnostic)
names(diagnostic)
nrow(diagnostic)
ncol(diagnostic)
summary(diagnostic)


##################################################################################
# End of Example

# Keep only the rows whose allel matches the A*01:01:01... pattern
# NOTE(review): in the pattern, `01*` means "0 followed by any number of 1s",
# so names such as A*01:01:02 match as well -- confirm that this is intended.
new_odrd_srtd_sbst <- new_odrd_srtd[
  grepl("A\\*01:01:01*\\:*[0-9A-Za-z]", new_odrd_srtd$allel),
]
# A regular expression for the pattern with spaces plus extra info:
# new_odrd_srtd_sbst <- subset(new_odrd_srtd, grepl("^.*(\\A\\*[0-9A-Za-z]*\\:0[1-2]).*$", allel) )
head(new_odrd_srtd_sbst)

unique(new_odrd_srtd_sbst$allel)




# Add new variable allelGroup_specific.HLA.protein as a copy of allel
new_odrd_srtd_sbst <- transform(
  new_odrd_srtd_sbst,
  allelGroup_specific.HLA.protein = paste0(allel)
)

# Move the last column (the one just added) to the front
new_odrd_srtd_sbst_added_ordrd <- new_odrd_srtd_sbst[
  c(ncol(new_odrd_srtd_sbst), seq_len(ncol(new_odrd_srtd_sbst) - 1))
]

# Reduce the copied name to the part captured by the pattern below
# (specific HLA protein only: A*NN:NN(NL)).
# The final point for the pattern of interest is cleared here: http://r.789695.n4.nabble.com/Extract-part-of-string-tp4683108p4683111.html
new_odrd_srtd_sbst_added_ordrd[["allelGroup_specific.HLA.protein"]] <- gsub(
  pattern = "^.*(\\A\\*[0-9A-Za-z]*\\:[0-9A-Za-z]*).*$",
  replacement = "\\1",
  x = new_odrd_srtd_sbst_added_ordrd[["allelGroup_specific.HLA.protein"]]
)

# Diagnostic
is.data.frame(new_odrd_srtd_sbst_added_ordrd)
typeof(new_odrd_srtd_sbst_added_ordrd)


# Split the data frame into a list of data frames, one per distinct allel value
# http://stackoverflow.com/q/18527051
ndf <- split(
  new_odrd_srtd_sbst_added_ordrd,
  new_odrd_srtd_sbst_added_ordrd$allel
)
ndf[[1]][1:36, 1:25]

# Diagnostic
is.data.frame(ndf)
typeof(ndf)
class(ndf)
length(ndf)

# From this step I fail to step further...
abc
  • 167
  • 1
  • 4
  • 18
  • Can you please make your example reproducible? (Use `dput()`) – Sotos Mar 10 '17 at 09:31
  • I did. Thank you for your suggestion. – abc Mar 10 '17 at 11:08
  • 1
    So what is your desired output? You want a list back with one row dataframes or a single dataframe with a row per dataframe? And most importantly, why would you need such a strange data structure? – David Arenburg Mar 13 '17 at 21:33
  • Please add the expected output for given example. – Ronak Shah Mar 14 '17 at 05:59
  • 1
    Despite the multiple edits, it is not clear to me what the requirements of this question are. "if i understand correctly, your code ..." why your? who is the write talking to? "Here I can't reproduce this issue ..." what issue? And working our way back to the start, are you just looking for a way to output each element of Dfa and Dfb (dataframes) without specifying them all? And what is the desired output you are looking for particularly with the final section where you show two "single" examples? Again, are you just looking to do this for all of the indicies without listing them all? – TMWP Mar 17 '17 at 01:20
  • @Sotos, d.b, Sathish: all of your answers are super. But my 'ID' variables (in my real example I have two ID fields) are repeated every time a new line from the data frame being reorganized is appended to the desired single row of the new data frame. I've been trying to solve this problem for 24+ hours by myself but failed. Could you please fix this bug? I hate my coding. – abc Mar 19 '17 at 14:00

3 Answers3

3

Here is one possibility,

# For each data frame: the ID from its first cell, followed by the cells of the
# remaining columns taken row by row
l2 <- lapply(from, function(df) {
  c(as.character(df[1, 1]), as.vector(t(df[-1])))
})

# Extend every vector to the length of the longest one (NA-padded)
n_cells <- max(lengths(l2))
l2 <- lapply(l2, function(cells) {
  length(cells) <- n_cells
  cells
})

# One row per data frame, labelled ID, Var1, Var2, ...
new_df <- data.frame(do.call(rbind, l2))
names(new_df) <- c('ID', paste0('Var', seq(n_cells - 1)))

new_df
#  ID Var1 Var2 Var3 Var4 Var5 Var6 Var7 Var8 Var9
#1  a    1    2    3    4         6    .    8    9
#2  b   11   22   33   44        66 <NA> <NA> <NA>

You could of course avoid the concatenation with i[1,1], which is not in your requirements but rather an addition of mine that I thought could apply here. So, by avoiding this and keeping your original transpose function, you get

# Flatten every data frame (all columns, ID included) into a single row
l2 <- lapply(from, function(df) matrix(t(df), nrow = 1))

# Extend every flattened element to the length of the longest one (NA-padded)
n_cells <- max(lengths(l2))
l2 <- lapply(l2, function(cells) {
  length(cells) <- n_cells
  cells
})

# One row per data frame, labelled ID, Var1, Var2, ...
new_df <- data.frame(do.call(rbind, l2))
names(new_df) <- c('ID', paste0('Var', seq(n_cells - 1)))

new_df
#  ID Var1 Var2 Var3 Var4 Var5 Var6 Var7 Var8 Var9 Var10 Var11
#1  a    1    2    3    a    4         6    a    .     8     9
#2  b   11   22   33    b   44        66 <NA> <NA>  <NA>  <NA>

Try this in three steps.

First create your data frame without the IDs,

# Flatten every data frame without its first (ID) column into a single row
l3 <- lapply(from, function(df) matrix(t(df[-1]), nrow = 1))

# Extend every flattened element to the length of the longest one (NA-padded)
n_cells <- max(lengths(l3))
l3 <- lapply(l3, function(cells) {
  length(cells) <- n_cells
  cells
})

# One row per data frame, labelled Var1, Var2, ...
new_df1 <- data.frame(do.call(rbind, l3))
names(new_df1) <- paste0('Var', seq(n_cells))

new_df1
#  Var1 Var2 Var3 Var4 Var5 Var6 Var7 Var8 Var9
#1  1    2    3    4         6    .    8    9
#2 11   22   33   44        66 <NA> <NA> <NA>

Extract all the unique IDs,

# Distinct values of the first column of every data frame, as character
i1 <- sapply(from, function(df) as.character(unique(df[[1]])))
i1
#[1] "a" "b"

Bind them together,

# Put the IDs in front of the flattened values
final_df1 <- data.frame(IDs = i1, new_df1, check.names = FALSE)

final_df1
#  IDs Var1 Var2 Var3 Var4 Var5 Var6 Var7 Var8 Var9
#1   a  1    2    3    4         6    .    8    9
#2   b 11   22   33   44        66 <NA> <NA> <NA>
Sotos
  • 51,121
  • 6
  • 32
  • 66
  • your code works perfectly on the example list :), but an error `Error in i[1, 1] : incorrect number of dimensions` appears with real data. Any hint? Thank you – abc Mar 11 '17 at 16:28
  • maybe with `unique`?, i.e. `lapply(from, function(i) as.vector(c(unique(as.character(i[,1])), t(c(t(i[-1]))))))` – Sotos Mar 11 '17 at 16:52
  • hmm, it doesn't work either. Sotos, could you please explain step by step what does `function(i) as.vector(c(as.character(i[1,1]), t(c(t(i[-1])))))` do? – abc Mar 11 '17 at 17:08
  • This data.frames, they are stored in your drive or they are objects in R with a pattern in the name? – Mario GS Mar 14 '17 at 00:21
  • @Sotos, your 'new_df1' with my real data set return a list with IDs... When I check my real list with `typeof()` function it returns `[1] "list"`. – abc Mar 19 '17 at 16:29
3

Following your example:

# data.table supplies as.data.table() and rbindlist(), which are used further down
library(data.table)
# Bundle the two example data frames (dfA, dfB) into one list and print it
from <- list(dfA, dfB)
from
[[1]]
  IDA Var1 Var2 Var3
1   a    1    2    3
2   a    4         6
3   a    .    8    9

[[2]]
  IDB Var4 Var5 Var6
1   b   11   22   33
2   b   44        

# Flatten every data frame into a one-row data.table, then stack the rows;
# fill = TRUE pads the shorter rows with NA
    out <- rbindlist(
      lapply(from, function(df) as.data.table(matrix(t(df), nrow = 1))),
      fill = TRUE
    )
    out
       V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12
    1:  a  1  2  3  a  4     6  a   .   8   9
    2:  b 11 22 33  b 44    66 NA  NA  NA  NA

# If the files are stored on your drive, you can read them in bulk and then `rbindlist` them.
# `pattern` is a regular expression, so the dot is escaped and the match is
# anchored to the end of the file name; a bare ".csv" would also match names
# such as "mycsv.txt".

    files <- list.files(pattern = "\\.csv$")
    files <- lapply(files, fread)
Mario GS
  • 859
  • 8
  • 22
  • thank you for your answer. But when I run the `library(data.table)` an `Error in library(data.table) : there is no package called ‘data.table’` occurs, when I run `install.packages(data.table)` another one `Error in install.packages : object 'data.table' not found` appears. – abc Mar 19 '17 at 14:02
  • 1
    I don't know which version of R you are using; you can update your question and post it with `R.Version()$version.string`. But in general, when you install a package you need to put its name in quotes: `install.packages("data.table", dependencies = T)`. Please try it. – Mario GS Mar 19 '17 at 20:26
  • I tried. Thanks for the ""-suggestion. It worked, but, as with the other examples, I still see my IDs several times in the long line. – abc Mar 19 '17 at 20:38
  • @stan, I'm sorry, your question is so long that I get lost. Can you point where your real data is? – Mario GS Mar 20 '17 at 11:02
  • I don't think I'm getting what you need correctly. For instance, 1: a 1 2 3 a 4 6 a . 8 9, in this row, the id `a`, is repeated two times. For what I'm getting you only need it one time? For instance: 1: a 1 2 3 4 6 . 8 9? – Mario GS Mar 20 '17 at 11:08
1

I feel like you can just use lapply to iterate over all the data.frame in the list to do what you are already doing on each individual data.frame. Just make sure you subset each vector in such a way that the number of columns in the output is equal to the number of elements in the data.frame with the maximum number of elements. This maximum number (max_length in this example) can be obtained by unlisting each data.frame, obtaining the number of elements using lengths, and then using max to get the number of maximum elements.

# Number of cells in the largest data frame of the list
max_length <- max(vapply(from, function(df) length(unlist(df)), integer(1)))

# Flatten every data frame row by row, bring it to max_length entries
# (indexing past the end yields NA) and stack the results as matrix rows
do.call(rbind, lapply(from, function(df) {
  cells <- as.vector(t(df))
  cells[1:max_length]
}))
#     [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12]
#[1,] "a"  "1"  "2"  "3"  "a"  "4"  " "  "6"  "a"  "."   "8"   "9"  
#[2,] "b"  "11" "22" "33" "b"  "44" " "  "66" NA   NA    NA    NA

UPDATE

# As before, but each row starts with the ID taken from the first cell, followed
# by the cells of the remaining columns taken row by row
do.call(rbind, lapply(from, function(df) {
  cells <- c(as.character(df[1, 1]), as.vector(t(df[, -1])))
  cells[1:max_length]
}))
#     [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12]
#[1,] "a"  "1"  "2"  "3"  "4"  " "  "6"  "."  "8"  "9"   NA    NA   
#[2,] "b"  "11" "22" "33" "44" " "  "66" NA   NA   NA    NA    NA   
d.b
  • 32,245
  • 6
  • 36
  • 77
  • absolutely yes, but I want my IDA and IDB to occur only once in the rows (at their beginning). This code really does rearrange many lines into one, but unfortunately each ID is repeated many times in the new line. How can I correct the code? – abc Mar 19 '17 at 14:08