1

I have a folder that is full of CSV files. Each file has genes as row names and samples as columns. I need them as TSV, so I saved each one of them as TSV and into a new path. This is the code:

folder_path <- "/original/folder/path"
csv_files <- list.files(path=folder_path, pattern="^tpm.*\\.csv$", full.names=TRUE)

new_TSV_files_path<- "/new/folder/path/"

lapply(csv_files , function(file) {
  data <- read.csv(file, stringsAsFactors=FALSE, fill=TRUE)
  
  filename <- basename(file)
  filename <- sub(".csv$", "", filename)
  
  new_file_path <- file.path(new_TSV_files_path , paste0(filename, ".txt"))
  
  write.table(data, 
              file=new_file_path, 
              sep="\t", 
              row.names=FALSE, 
              quote=FALSE)
})

The problem is after I save them as TSV, and I view them, the first column is the genes, named "X" and the row names is just numbers. I tired in many ways to keep the genes as the row names but it is not working. This is a code that I tried for example:

new_TSV_files_path <- "/tsv/files/path/"
file_list <- list.files(path=TSV_files_path , pattern="^tpm.*\\.tsv$", full.names=TRUE)

for (file_path in file_list) {
  dt <- fread(file_path, data.table=FALSE)
  rownames(dt) <- dt[, 1]
  
  dt <- dt[, -1]
  
  # Save the data back to the same tsv file
  fwrite(dt, file_path, sep="\t", row.names=TRUE)
}

I need the genes as row names and I also need to delete the X column (it is the first column) or else the web tool I'm using won't work. What can be done? Why is it that after the transition from CSV to TSV the row names become as a separate column ?

EDIT - a few line of one of the CSVs

c("\"\",\"Riaz_Nivolumab_2017-p001-ar-8813\",\"Riaz_Nivolumab_2017-p002-ar-8815\",\"Riaz_Nivolumab_2017-p017-ar-8890\",\"Riaz_Nivolumab_2017-p026-ar-8920\",\"Riaz_Nivolumab_2017-p034-ar-8929\",\"Riaz_Nivolumab_2017-p036-ar-8898\",\"Riaz_Nivolumab_2017-p037-ar-8900\",\"Riaz_Nivolumab_2017-p038-ar-8895\",\"Riaz_Nivolumab_2017-p039-ar-8819\",\"Riaz_Nivolumab_2017-p046-ar-8904\",\"Riaz_Nivolumab_2017-p047-ar-8836\",\"Riaz_Nivolumab_2017-p048-ar-8897\",\"Riaz_Nivolumab_2017-p049-ar-8824\",\"Riaz_Nivolumab_2017-p052-ar-8839\",\"Riaz_Nivolumab_2017-p065-ar-8857\",\"Riaz_Nivolumab_2017-p067-ar-8840\",\"Riaz_Nivolumab_2017-p078-ar-8864\",\"Riaz_Nivolumab_2017-p079-ar-8849\",\"Riaz_Nivolumab_2017-p082-ar-8822\",\"Riaz_Nivolumab_2017-p085-ar-8829\",\"Riaz_Nivolumab_2017-p090-ar-8866\",\"Riaz_Nivolumab_2017-p092-ar-8867\",\"Riaz_Nivolumab_2017-p098-ar-8853\",\"Riaz_Nivolumab_2017-p101-ar-8834\",\"Riaz_Nivolumab_2017-p103-ar-8872\",\"Riaz_Nivolumab_2017-p106-ar-8926\"", 
"\"A1BG\",1.0008835087938,-1.91493588495907,-0.620734484990514,-1.75775209550157,-1.23665217147917,-2.49624508855533,0.727023994594072,0.328722516597936,-0.108114351387649,-0.0836309292648272,-0.325627003998877,4.41269552131938,-0.229120400945704,3.7248031834164,-0.504990077186003,-2.28542936373545,-1.1031748886154,-0.0826211449622227,1.44674204021314,1.3682306825028,0.544742458853122,-2.21733388578132,0.231763591509675,0.0236879027615116,1.79591432017568,-0.638847949374409", 
"\"A1CF\",-0.543733010666938,-0.537513815032929,-0.481855746353474,-0.543733010666938,-0.543733010666938,-0.543733010666938,-0.543733010666938,-0.543733010666938,-0.543733010666938,-0.543733010666938,-0.543733010666938,5.49732418197924,-0.543733010666938,4.6852457132221,-0.543733010666938,0.117380356180772,-0.543733010666938,-0.543733010666938,1.53929625176185,-0.543733010666938,-0.543733010666938,-0.543733010666938,-0.48894973908573,-0.543733010666938,-0.543733010666938,-0.543733010666938", 
"\"A2M\",0.0979791979801679,-0.446028724661623,-0.465953763246008,-2.08487244739613,2.01411421717692,1.19837532976838,-0.745540508622708,0.356022989838996,-2.37641175531056,0.517622991895147,2.67515559703126,1.74369521852923,1.19154956592325,2.00377301045288,-0.130083699768657,-0.354493861811036,-3.736442059777,1.59429773965362,1.2081229778692,-2.95554013952561,-1.60581260283625,1.03219514758484,0.925657747621587,-2.02343920511453,-0.246282066712528,0.612339103457181", 
"\"A2ML1\",-0.363502635555151,-0.363502635555151,-0.298573188895895,-0.363502635555151,-0.363502635555151,-0.313253413224058,3.64328505792818,-0.363502635555151,-0.363502635555151,-0.363502635555151,0.0272101525164028,-0.363502635555151,-0.107062171894006,-0.363502635555151,-0.363502635555151,-0.363502635555151,-0.363502635555151,-0.212720613261966,-0.363502635555151,-0.363502635555151,3.46425626517263,-0.363502635555151,-0.197854673973831,-0.363502635555151,0.174257390070118,-0.363502635555151"
)
Programming Noob
  • 1,232
  • 3
  • 14
  • Care to provide a small sample from one of your csv-files? E.g. output from `readLines(csv_files[1])[1:3] |> dput()` ? I'd guess rownames are moved to column `X` when using `read.csv()` without setting `row.names` . And `wirte.table()` just stores what you have in a dataframe. You have also explicitly set `row.names = FALSE` in `write.table()`, meaning that names would be dropped if they were imported correctly. – margusl Jun 25 '23 at 12:59
  • @margusl `write.csv(., row names=FALSE)` will delete the row names, not a good idea. – jay.sf Jun 25 '23 at 15:41
  • @jay.sf, I'm just stating what's currently included in OP's code. – margusl Jun 25 '23 at 15:59
  • @margusl in `read.csv` I did set `row.names = 1` and in `write.table` I set `row.names = TRUE` and yet it yields the same result. – Programming Noob Jun 26 '23 at 09:31
  • @ProgrammingNoob, well , without having access to actual input data it's just a guesswork. To help others help you, perhaps consider including few bits from your actual input csv files and include output from `readLines(csv_files[1])[1:3] |> dput()` to your question. – margusl Jun 26 '23 at 09:55
  • 1
    @margusl I added the output of this line, see the edit to the post above. Thanks! – Programming Noob Jun 26 '23 at 11:10
  • @ProgrammingNoob Thx, but please just copy paste .csv from text editor, not after loading in R. – jay.sf Jun 26 '23 at 11:49

2 Answers2

1

Essentially you may want to set row.names=1 in read.csv to indicate which column contains the row names.

csv_files <- list.files(folder_path, full.names=TRUE)

lapply(csv_files, \(x) {
  read.csv(x, row.names=1) |>
    write.table(file=sprintf('%s/%s', new_tsv, sub('\\.csv$', '.txt', basename(x))), 
                sep='\t', quote=FALSE)
}) |> invisible()

The invisible avoids to clutter the console with NULLs.

Test:

read.table(sprintf('%s/%s', new_tsv, 'tpm_01.txt'))
#       sample1 sample2 sample3
# gene1      14       9      14
# gene2       8      14      17
# gene3      11       9       5
# gene4      12      16      14
# gene5      11       9      13

Data:

tdir <- tempdir()
folder_path <- paste0(tdir, '/old')
dir.create(folder_path)

set.seed(42)
for (i in 1:3) {
  write.csv(matrix(rpois(15, 10), 5, 3, dimnames=list(paste0('gene', 1:5), paste0('sample', 1:3))),
            sprintf('%s/tpm_%02d.csv', folder_path, i))
}

new_tsv <- paste0(tdir, '/new')
dir.create(new_tsv)
jay.sf
  • 60,139
  • 8
  • 53
  • 110
  • It didn't work. After saving the TSV file I View it just to check, like this: `View(fread("/data/my/path/for/file/FILE.txt", data.table = F))` and the result is just numbers as rownames, and the first column is called `V1`. My goal is to have the `V1` as the row names, and I don't want this column to be in my data frame.. – Programming Noob Jun 26 '23 at 09:43
  • @ProgrammingNoob please add an example of your .csv to your question, how it exactly looks in a text editor. Just a few rows and columns would be sufficient. – jay.sf Jun 26 '23 at 09:56
  • @ProgrammingNoob BTW, have you already tried `read.table("/data/my/path/for/file/FILE.txt")`? – jay.sf Jun 26 '23 at 10:11
  • @ProgrammingNoob Or `data.table::fread(file="/data/my/path/for/file/FILE.txt") |> as.matrix(rownames=1)`. It's an issues of `fread`, `data.table` doesn't care much about matrices, see [this post](https://stackoverflow.com/q/24425109/6574038). – jay.sf Jun 26 '23 at 10:19
  • Yes but the thing is, I am not planning to read those TSV files at all. All I'm going to do is use them in a web tool that asks that the genes would be as row names. When I `View` them using `fread` it is just for checking the rownames.. – Programming Noob Jun 26 '23 at 11:13
  • @ProgrammingNoob , how is that webtool detecting / parsing rownames? Number of defined columns being less than number of fields in next rows? First column with empty name (`""`)? A column with certain name (e.g. `"index"`)? Do you have an example file that is accepted by that tool? – margusl Jun 26 '23 at 11:28
  • You actually have genes as row names @ProgrammingNoob, don't understand why you care about `fread`. – jay.sf Jun 26 '23 at 11:48
  • I'd just add that `fread` handles rownames differently than `write.table()` / `read.table()`, and `freaad ` adding a `V1` column is expected behaviour for `write.table`-style files. It should also throw a warning: `Detected 26 column names but the data has 27 columns (i.e. invalid file). Added 1 extra default column name for the first column which is guessed to be row names or an index. Use setnames() afterwards if this guess is not correct, or fix the file write command that created the file to create a valid file.` – margusl Jun 26 '23 at 11:57
  • @margusl Actually `fread` is meant for `data.table`s and `data.frame`s and not for matrices as in our case, which is why it causes unnecessary trouble for us. This is actually the gist of the link I gave in my [comment above](https://stackoverflow.com/questions/76550373/saving-csv-files-as-tab-delimited-files-while-keeping-the-row-names/76551149?noredirect=1#comment134978775_76551149). – jay.sf Jun 26 '23 at 12:07
0

Check provided sample structure and prepare reprex:

# sample file content, truncated lines, rownames are in first column named "":
stringr::str_view(stringr::str_trunc(sample_lines, 80))
#> [1] │ "","Riaz_Nivolumab_2017-p001-ar-8813","Riaz_Nivolumab_2017-p002-ar-8815","Ria...
#> [2] │ "A1BG",1.0008835087938,-1.91493588495907,-0.620734484990514,-1.75775209550157...
#> [3] │ "A1CF",-0.543733010666938,-0.537513815032929,-0.481855746353474,-0.5437330106...
#> [4] │ "A2M",0.0979791979801679,-0.446028724661623,-0.465953763246008,-2.08487244739...
#> [5] │ "A2ML1",-0.363502635555151,-0.363502635555151,-0.298573188895895,-0.363502635...
writeLines(sample_lines, "tpm_sample.csv")

Modify original code to import row names (read.csv(..., row.names = 1)) and NOT to drop those on writing (remove row.names=FALSE from write.table() ):

folder_path <- "./"
csv_files <- list.files(path=folder_path, pattern="^tpm.*\\.csv$", full.names=TRUE)

new_TSV_files_path<- "./"
lapply(csv_files , function(file) {
  # enable reading rownames from 1st column:
  data <- read.csv(file, row.names = 1, stringsAsFactors=FALSE, fill=TRUE)
  
  filename <- basename(file)
  filename <- sub(".csv$", ".txt", filename)
  
  new_file_path <- file.path(new_TSV_files_path , filename)
  
  # do NOT diable rownmes with row.names=FALSE
  write.table(data, 
              file=new_file_path, 
              sep="\t",
              quote=FALSE
              )
})
#> [[1]]
#> NULL

Check resulting files:

tsv_files <- list.files(path=new_TSV_files_path, pattern="^tpm.*\\.txt$", full.names=TRUE)

# as long as read.table / write.table compatibe rownmes are expected, we are all set
# written tsv content, truncated lines:
stringr::str_view(stringr::str_trunc(readLines(tsv_files[1]), 80))
#> [1] │ Riaz_Nivolumab_2017.p001.ar.8813{\t}Riaz_Nivolumab_2017.p002.ar.8815{\t}Riaz_Nivolu...
#> [2] │ A1BG{\t}1.0008835087938{\t}-1.91493588495907{\t}-0.620734484990514{\t}-1.75775209550157{\t}-...
#> [3] │ A1CF{\t}-0.543733010666938{\t}-0.537513815032929{\t}-0.481855746353474{\t}-0.543733010666...
#> [4] │ A2M{\t}0.0979791979801679{\t}-0.446028724661623{\t}-0.465953763246008{\t}-2.0848724473961...
#> [5] │ A2ML1{\t}-0.363502635555151{\t}-0.363502635555151{\t}-0.298573188895895{\t}-0.36350263555...

# re-import with read.table(), check first columns and rownames attribute:
tsv_in <- read.table(tsv_files[1])
tsv_in[,1:2]
#>       Riaz_Nivolumab_2017.p001.ar.8813 Riaz_Nivolumab_2017.p002.ar.8815
#> A1BG                         1.0008835                       -1.9149359
#> A1CF                        -0.5437330                       -0.5375138
#> A2M                          0.0979792                       -0.4460287
#> A2ML1                       -0.3635026                       -0.3635026
rownames(tsv_in)
#> [1] "A1BG"  "A1CF"  "A2M"   "A2ML1"

# fread handles rownames differently and also throws a relevant warning, 
# rownmaes ending up in column "V1" is expected behaviour
dt_in <- data.table::fread(tsv_files[1])
#> Warning in data.table::fread(tsv_files[1]): Detected 26 column names but the
#> data has 27 columns (i.e. invalid file). Added 1 extra default column name for
#> the first column which is guessed to be row names or an index. Use setnames()
#> afterwards if this guess is not correct, or fix the file write command that
#> created the file to create a valid file.
dt_in[,1:3]
#>       V1 Riaz_Nivolumab_2017.p001.ar.8813 Riaz_Nivolumab_2017.p002.ar.8815
#> 1:  A1BG                        1.0008835                       -1.9149359
#> 2:  A1CF                       -0.5437330                       -0.5375138
#> 3:   A2M                        0.0979792                       -0.4460287
#> 4: A2ML1                       -0.3635026                       -0.3635026
rownames(dt_in)
#> [1] "1" "2" "3" "4"

If the issue remains, it has to do with mentioned webtool and it's format expectations.

Sample file lines:

sample_lines <- c("\"\",\"Riaz_Nivolumab_2017-p001-ar-8813\",\"Riaz_Nivolumab_2017-p002-ar-8815\",\"Riaz_Nivolumab_2017-p017-ar-8890\",\"Riaz_Nivolumab_2017-p026-ar-8920\",\"Riaz_Nivolumab_2017-p034-ar-8929\",\"Riaz_Nivolumab_2017-p036-ar-8898\",\"Riaz_Nivolumab_2017-p037-ar-8900\",\"Riaz_Nivolumab_2017-p038-ar-8895\",\"Riaz_Nivolumab_2017-p039-ar-8819\",\"Riaz_Nivolumab_2017-p046-ar-8904\",\"Riaz_Nivolumab_2017-p047-ar-8836\",\"Riaz_Nivolumab_2017-p048-ar-8897\",\"Riaz_Nivolumab_2017-p049-ar-8824\",\"Riaz_Nivolumab_2017-p052-ar-8839\",\"Riaz_Nivolumab_2017-p065-ar-8857\",\"Riaz_Nivolumab_2017-p067-ar-8840\",\"Riaz_Nivolumab_2017-p078-ar-8864\",\"Riaz_Nivolumab_2017-p079-ar-8849\",\"Riaz_Nivolumab_2017-p082-ar-8822\",\"Riaz_Nivolumab_2017-p085-ar-8829\",\"Riaz_Nivolumab_2017-p090-ar-8866\",\"Riaz_Nivolumab_2017-p092-ar-8867\",\"Riaz_Nivolumab_2017-p098-ar-8853\",\"Riaz_Nivolumab_2017-p101-ar-8834\",\"Riaz_Nivolumab_2017-p103-ar-8872\",\"Riaz_Nivolumab_2017-p106-ar-8926\"", 
  "\"A1BG\",1.0008835087938,-1.91493588495907,-0.620734484990514,-1.75775209550157,-1.23665217147917,-2.49624508855533,0.727023994594072,0.328722516597936,-0.108114351387649,-0.0836309292648272,-0.325627003998877,4.41269552131938,-0.229120400945704,3.7248031834164,-0.504990077186003,-2.28542936373545,-1.1031748886154,-0.0826211449622227,1.44674204021314,1.3682306825028,0.544742458853122,-2.21733388578132,0.231763591509675,0.0236879027615116,1.79591432017568,-0.638847949374409", 
  "\"A1CF\",-0.543733010666938,-0.537513815032929,-0.481855746353474,-0.543733010666938,-0.543733010666938,-0.543733010666938,-0.543733010666938,-0.543733010666938,-0.543733010666938,-0.543733010666938,-0.543733010666938,5.49732418197924,-0.543733010666938,4.6852457132221,-0.543733010666938,0.117380356180772,-0.543733010666938,-0.543733010666938,1.53929625176185,-0.543733010666938,-0.543733010666938,-0.543733010666938,-0.48894973908573,-0.543733010666938,-0.543733010666938,-0.543733010666938", 
  "\"A2M\",0.0979791979801679,-0.446028724661623,-0.465953763246008,-2.08487244739613,2.01411421717692,1.19837532976838,-0.745540508622708,0.356022989838996,-2.37641175531056,0.517622991895147,2.67515559703126,1.74369521852923,1.19154956592325,2.00377301045288,-0.130083699768657,-0.354493861811036,-3.736442059777,1.59429773965362,1.2081229778692,-2.95554013952561,-1.60581260283625,1.03219514758484,0.925657747621587,-2.02343920511453,-0.246282066712528,0.612339103457181", 
  "\"A2ML1\",-0.363502635555151,-0.363502635555151,-0.298573188895895,-0.363502635555151,-0.363502635555151,-0.313253413224058,3.64328505792818,-0.363502635555151,-0.363502635555151,-0.363502635555151,0.0272101525164028,-0.363502635555151,-0.107062171894006,-0.363502635555151,-0.363502635555151,-0.363502635555151,-0.363502635555151,-0.212720613261966,-0.363502635555151,-0.363502635555151,3.46425626517263,-0.363502635555151,-0.197854673973831,-0.363502635555151,0.174257390070118,-0.363502635555151"
)

Created on 2023-06-26 with reprex v2.0.2

margusl
  • 7,804
  • 2
  • 16
  • 20