0

This was my original code, very messy and untidy: Link to my previous question

library(dplyr); library(plyr)
library(magrittr); library(stringr) 
library(ExclusionTable)
library(lubridate)
library(tidyverse); library(tidyr)
library(janitor)
library(survival)
library(ggsurvfit); library(gtsummary)
library(zoo)
library(tidycmprsk)

# AA cohort (2 of 3)
## as

# Baseline wave (i = 1): read every CSV in folder <data_dir>1, join them into
# one table and store the results in dynamically named globals
# (flnames1, as1_list, as1) that the follow-up loop reads back with get().
i=1
num_fu = c(1,2,3,4,5,6,7,8,9)
as <- data.frame()
df <- data.frame()
dfs <- data.frame()
data_dir <- 'C:/Users/thepr/Documents/data/as'

# flnames1: full paths of the CSV files found in <data_dir>1.
assign(paste0("flnames", i), list.files(path = paste0(data_dir, i), pattern = "\\.csv", full.names = TRUE))
# as1_list: one data frame per file, in the same order as flnames1.
assign(paste0("as", i, "_list"), lapply(get(paste0("flnames", i)),
                                        function(x){base::as.data.frame(read.csv(x))}))
# nm: first six characters of each file's base name, with ".csv" removed.
nm <- gsub(".csv", "", basename(eval(parse(text = paste0("flnames", i))))) %>% str_sub(., 1,6)
assign(paste0("as", i, "_list"), setNames(get(paste0("as", i, "_list")), nm))
# Fold the list into a single table by full-joining the data frames left to right.
df <- Reduce(full_join, get(paste0("as", i, "_list")))
# as1: df without the columns whose values duplicate an earlier column
# (as.list(df) is the list of columns, so duplicated() compares whole columns).
assign(paste0("as",i), df[!duplicated(base::as.list(df))])
# dfs starts from the joined baseline table, before duplicate columns are dropped.
dfs <- df

# Follow-up waves 2..length(num_fu): load each wave from folder <data_dir><i>,
# left-merge it onto the running table `dfs` by RID, and create the per-wave
# globals fu_<i-1>, fu_loss_<i-1> and fu_rate_<i-1>.
for (i in 2:length(num_fu)){
      # Read every CSV of wave i; name each data frame after the first six
      # characters of its file name (".csv" removed).
      assign(paste0("flnames", i), list.files(path = paste0(data_dir, i), pattern = "\\.csv", full.names = TRUE))
      assign(paste0("as", i, "_list"), lapply(get(paste0("flnames", i)),
                                              function(x){base::as.data.frame(read.csv(x))}))
      nm <- gsub(".csv", "", basename(get(paste0("flnames", i)))) %>% str_sub(., 1,6)
      assign(paste0("as", i, "_list"), setNames(get(paste0("as", i, "_list")), nm))
      # Full-join the wave's tables into one, then store it as as<i> without
      # the columns whose values duplicate an earlier column.
      df <- Reduce(full_join, get(paste0("as", i, "_list")))
      assign(paste0("as",i), df[!duplicated(base::as.list(df))])

      # Computed only after as<i> has been assigned: get() on a name that does
      # not exist yet stops the loop with "object not found".
      # NOTE(review): this mask has one entry per row of as1 but is used below
      # to index rows of as<i>; confirm whether
      # `get(paste0("as", i))$RID %in% as1$RID` was intended.
      RID_common <- as1$RID %in% get(paste0("as", i))$RID

      # Keep every row of dfs (all.x = TRUE) and attach wave i's columns by RID.
      dfs <- merge(dfs, df, by = "RID", all.x = TRUE)
      dfs <- dfs[!duplicated(base::as.list(dfs))]
            # The per-wave value column is AS<i>_AREA when present, otherwise
            # AS<i>_DATA_CLASS; if neither exists nothing is created for this wave.
            if(paste0("AS", i, "_AREA") %in% colnames(get(paste0("as", i)))){
              assign(paste0("fu_",i-1), get(paste0("as", i))[RID_common, c("RID", paste0("AS", i, "_AREA"))])
              assign(paste0("fu_loss_",i-1), get(paste0("as", i))[!RID_common, c("RID", paste0("AS", i, "_AREA"))])
            # FU rate: rows in wave i relative to rows in the baseline table as1
              assign(paste0("fu_rate_", i-1), nrow(get(paste0("as", i)))/nrow(as1))
            }
            else if(paste0("AS", i, "_DATA_CLASS") %in% colnames(get(paste0("as", i)))){
              assign(paste0("fu_",i-1), get(paste0("as", i))[RID_common, c("RID", paste0("AS", i, "_DATA_CLASS"))])
              assign(paste0("fu_loss_",i-1), get(paste0("as", i))[!RID_common, c("RID", paste0("AS", i, "_DATA_CLASS"))])
            # FU rate: rows in wave i relative to rows in the baseline table as1
              assign(paste0("fu_rate_", i-1), nrow(get(paste0("as", i)))/nrow(as1))
            }
}

After helpful comments by @Gregor Thomas @joran, I read previous posts and finally started using lists and vectors. Here is what I have tried so far:

library(tidyverse) #Includes: dplyr, stringr, tidyr
library(magrittr)
library(lubridate)
library(ExclusionTable)
library(janitor)
library(survival)
library(ggsurvfit); library(gtsummary)
library(zoo)
library(tidycmprsk)

# AA cohort (2 of 3)
## as
# Second attempt: build all folder paths at once, list their CSV files and try
# to read them into a list. This is the snippet that raises the error quoted
# below it.
i=1
data_dir = c("C:/Users/thepr/Documents/data/as")
num_fu = c(1,2,3,4,5,6,7,8,9)
dirs <- paste0(data_dir, num_fu) # character vector: <data_dir>1 ... <data_dir>9
as <- data.frame()
df <- data.frame()
dfs <- data.frame()

# list.files() is given all folders at once, so flnames is one flat character
# vector of file paths rather than one entry per folder.
flnames <- list.files(path = dirs, pattern = "\\.csv", full.names = TRUE)
# `[[` is indexed with the whole length-9 vector num_fu here; flnames[[num_fu]]
# is the expression named in the error message.
# NOTE(review): as_list is not created anywhere earlier in this snippet.
as_list[[num_fu]] <- lapply(flnames[[num_fu]],
       function(x){base::as.data.frame(read.csv(x))})
# Intended names: first six characters of each file's base name, ".csv" removed.
names(as_list) <- gsub(".csv", "", basename(flnames[[num_fu]])) %>% str_sub(., 1,6)
# Full-join the data frames left to right into a single table.
df <- Reduce(full_join, as_list)
# Drop columns whose values duplicate an earlier column.
df <- df[!duplicated(base::as.list(df))]

Somehow I keep getting this error message: `Error in flnames[[num_fu]] : attempt to select more than one element in vectorIndex`

Based on How do I make a list of data frames?, I think I am headed in the right direction. Please give some insights and thoughts. Any help will be appreciated, thanks.

HJ WHY
  • 23
  • 8
  • 1
    The code `function(x){base::as.data.frame(read.csv(x))}` is a lot more convoluted than necessary: (1) you don't need the `{}` since the function body is a single expression. (2) you almost certainly don't need `base::`: there should be nothing overriding `as.data.frame`. (3) You don't need `as.data.frame`, since `read.csv` already returns a data frame. (4) You don't need to wrap `read.csv` into a function. — Taken together, you can replace that entire code snippet with just `read.csv`. – Konrad Rudolph Jun 06 '23 at 07:29
  • @Konrad Rudolph thanks. how do you recommend making `dir` as a list? For example `dir[[1]]` is paste0("data_dir", num_fu[[1]]) ? and etc... – HJ WHY Jun 06 '23 at 07:40
  • `num_fu` is the index after the folder name: `as1`, `as2`, `as3`, ... and I want to keep flnames in a list, with names using `num_fu` – HJ WHY Jun 06 '23 at 07:47
  • Sorry, I don't really understand these comments. However, see the update to my answer, maybe that helps. – Konrad Rudolph Jun 06 '23 at 08:07

1 Answers1

1

[[ always selects exactly one element. This is what the error message is trying to tell you.

What you want to do (based on your num_fu value) is to slice the list to extract a subset. You need to use [ to do that.

Furthermore, remove the indexing in the assignment target. That is, write:

# `[` (unlike `[[`) can select several elements at once; the assignment target
# carries no index.
as_list = lapply(flnames[num_fu], read.csv)

However, it's unclear that you actually want the indexing here at all, since it does not correspond to anything that I understand. Don't you simply want the following instead?

# No indexing at all: read every file listed in flnames.
as_list = lapply(flnames, read.csv)

Alternatively, if you want the entire thing in a nested list, where each subdirectory has its own sub-list, there are several solutions. One would be the following (note the nested invocation of lapply):

# One character vector of CSV paths per follow-up folder.
flnames = lapply(dirs, list.files, pattern = "\\.csv", full.names = TRUE)

# Read each folder's files into its own sub-list; every data frame is named by
# the first six characters of its file's base name (extension removed).
as_list = lapply(flnames, function (paths) {
  frames = lapply(paths, read.csv)
  names(frames) = str_sub(sub("\\.csv$", "", basename(paths)), 1, 6)
  frames
})

etc.

However, I would instead recommend keeping the structure flat. And, especially if you want to subsequently merge the entire thing into one single data.frame, you can drastically simplify the entire code as follows:

# Folders are named <data_dir>1 ... <data_dir>9; collect every CSV inside them
# into one flat character vector of paths.
data_dir = "C:/Users/thepr/Documents/data/as"
num_fu = 1 : 9
dirs = paste0(data_dir, num_fu)
files = list.files(dirs, pattern = "\\.csv$", full.names = TRUE)

# `.id` records the *names* of the input; for an unnamed vector it would record
# the positions "1", "2", ... instead of the paths. Name each path after itself
# so that Filename can be reduced to the first six characters of the file's
# base name (extension removed).
df = purrr::map_dfr(setNames(files, files), read.csv, .id = "Filename") %>%
  mutate(Filename = str_sub(sub("\\.csv$", "", basename(Filename)), 1, 6))

(Incidentally, avoid mixing = and <- for assignment, it makes the code messy; you can use either one, just don’t mix them.)

Konrad Rudolph
  • 530,221
  • 131
  • 937
  • 1,214
  • I have tried your latest code, but this appears: `In file(file, "rt") : cannot open file 'C:/Users/thepr/Documents/data/as1': Permission denied`. I think the reason is that `dirs` contain only path to a folder, not individual files. – HJ WHY Jun 06 '23 at 08:28
  • On the other hand, `flnames = lapply(dirs, list.files, pattern = "\\.csv", full.names = TRUE)` works perfectly, as I wanted flnames to be lists of 9 folders, each having their files as nested lists – HJ WHY Jun 06 '23 at 08:32
  • Oops, yes, I had forgotten the `list.files` function call in that code. I added it now. – Konrad Rudolph Jun 06 '23 at 08:55