0

I want to append by columns a matrix to an existing file without binding the matrices previously in R. This is an example:

A <- data.table(matrix(1:9, 3))
names(A) <- as.character(1:3)

B <- data.table(matrix(10:18, 3))
names(B) <- as.character(4:6)

fwrite(A, file = "test.txt", sep = " ", row.names = FALSE, col.names = FALSE)

fwrite(B, file = "test.txt", sep = " ", row.names = FALSE, col.names = FALSE, append = T)

I tried to change column names without sucess. The result is the following:

> fread("test.txt")
   V1 V2 V3
1:  1  4  7
2:  2  5  8
3:  3  6  9
4: 10 13 16
5: 11 14 17
6: 12 15 18

And this is what I am looking for:

1 4 7 10 13 16
2 5 8 11 14 17
3 6 9 12 15 18

I know that in my example I could simply run AB <- cbind(A, B) and than just fwrite(AB), but in practice I cannot do that given that A and B are extremely large matrices and I don't have enough memory to allocate a combined matrix.

Notice that this may not be doable with fwrite() so I am open to other methods.


Edit
I found a temporary solution by transposing the matrices :

A <- data.table(t(matrix(1:9, 3)))
B <- data.table(t(matrix(10:18, 3)))

fwrite(A, file = "test.txt", sep = " ", row.names = FALSE, col.names = FALSE)
fwrite(B, file = "test.txt", sep = " ", row.names = FALSE, col.names = FALSE, append = T)

> t(fread("test.txt"))
   [,1] [,2] [,3] [,4] [,5] [,6]
V1    1    4    7   10   13   16
V2    2    5    8   11   14   17
V3    3    6    9   12   15   18

This solution is not ideal, so I am still looking forward if someone comes up with something better.

Cœur
  • 37,241
  • 25
  • 195
  • 267
mat
  • 2,412
  • 5
  • 31
  • 69
  • There is no fast solution, anything you do will be expensive. Think about what is happening on the disk, when you add new characters before the end of a line, you have to shift (i.e., read and re-write) all the characters that come after that line. Possible solutions: 1) use a different storage format (e.g., SQL, fst), 2) read in line by line and write to a new file -- this will be slow, but at least won't take up a lot of memory. – thc Oct 26 '18 at 03:58

1 Answers1

2

You could try SQL way.

Data

library(data.table)

A <- data.table(matrix(1:9, 3))
names(A) <- paste0("col", as.character(1:3))
A$id <- row.names(A)

B <- data.table(matrix(10:18, 3))
names(B) <- paste0("col", as.character(4:6))
B$id <- row.names(B)

fwrite(A, file = "A.txt", sep = " ", row.names = FALSE, col.names = TRUE)
fwrite(B, file = "B.txt", sep = " ", row.names = FALSE, col.names = TRUE)

Using RSQLite

library(RSQLite)
db <- dbConnect(SQLite(), dbname = "Test.sqlite")
dbWriteTable(conn = db, name = "tab1", value = "A.txt", row.names = FALSE, header = TRUE, sep = " ")
dbWriteTable(conn = db, name = "tab2", value = "B.txt", row.names = FALSE, header = TRUE, sep = " ")

# Retreive columns in a table (excluding id)
col1 <- dbListFields(db, "tab1")         
col2 <- dbListFields(db, "tab2")         
col1 <- col1[!col1 %in% "id"]
col2 <- col2[!col2 %in% "id"]

#Append Columns using Join by the id (row names) created
sql_q <- paste0('CREATE TABLE tab3 AS SELECT ', 
                 paste('t1.', col1, collapse = ", ", sep = ""), ', ', 
                 paste('t2.', col2, collapse = ", ", sep = ""), ' FROM tab1 t1 INNER JOIN tab2 t2 ON t1.id = t2.id')

> sql_q
[1] "CREATE TABLE tab3 AS SELECT t1.col1, t1.col2, t1.col3, t2.col4, t2.col5, t2.col6 FROM tab1 t1 INNER JOIN tab2 t2 ON t1.id = t2.id"

dbSendQuery(conn = db, statement = sql_q)
dbGetQuery(db, 'SELECT * FROM tab3')

> dbGetQuery(db, 'SELECT * FROM tab3')
  col1 col2 col3 col4 col5 col6
1    1    4    7   10   13   16
2    2    5    8   11   14   17
3    3    6    9   12   15   18
MKa
  • 2,248
  • 16
  • 22