Run a R script for all files in a directory, and store the outputs in one common data frame

Question

I have a script that works fine for one file. It takes the information from a json file, extracts a list and a sublist of it (A), and then another list B with the third element of list A. It creates a data frame with list B and compares it with a master file. Finally, it provides two numbers: the number of elements in the list B and the number of matching elements of that list when comparing with the master file.

However, I have 180 different json files in a folder and I need to run the script for all of them, and build a data frame with the results for each file. So the final result should be something like this (note that the last line's figures are correct, the first two are fictitious):

The code I have so far is the following:

library(rjson)
library(dplyr)
library(tidyverse)

        #load data from file
file <- "./raw_data/whf.json"
json_data <- fromJSON(file = file)
org_name <- json_data$id

        # extract lists and the sublist
usernames <- json_data$twitter   
following <- usernames$following 

        # create empty vector to populate
longitud = length(following)
names <- vector(length = longitud)

        # loop to populate the empty vector with third element of the sub-list
for(i in 1:longitud){
    names[i] <- following[[i]][3]
}

        # create a data frame and change column name
names_list <- data.frame(sapply(names, c))
colnames(names_list) <- "usernames"

        # create a data frame with the correct formatting ready to comparison
org_handles <- data.frame(paste("@", names_list$usernames, sep=""))
colnames(org_handles) <- "Twitter"

        # load master file and select the needed columns
psa_handles <- read_csv(file = "./raw_data/psa_handles.csv") %>%
    select(Name, AKA, Twitter)

        # merge data frames and present the results
org_list <- inner_join(psa_handles, org_handles)
length(org_list$Twitter)
length(usernames$following)

My first attempt is to include this code at the beginning:

files <- list.files()
for(f in files){

json_data <- fromJSON(file = f)

# the rest of the script for one file here

}

but I do not know how to write the code for the data frame or even how to integrate both ideas -the working script and the loop for the file names. I took the idea from here.

The new code after Alvaro Morales' answer is the following

library(rjson)
library(dplyr)
library(tidyverse)

archivos <- list.files("./raw_data/")
calculate_accounts <- function(archivos){

        #load data from file
path <- paste("./raw_data/", archivos, sep = "")
json_data <- fromJSON(file = path)
org_name <- json_data$id

        # extract lists and the sublist
usernames <- json_data$twitter   
following <- usernames$following 

        # create empty vector to populate
longitud = length(following)
names <- vector(length = longitud)

        # loop to populate the empty vector with third element of the sub-list
for(i in 1:longitud){
    names[i] <- following[[i]][3]
}

        # create a data frame and change column name
names_list <- data.frame(sapply(names, c))
colnames(names_list) <- "usernames"

        # create a data frame with the correct formatting ready to comparison
org_handles <- data.frame(paste("@", names_list$usernames, sep=""))
colnames(org_handles) <- "Twitter"

        # load master file and select the needed columns
psa_handles <- read_csv(file = "./psa_handles.csv") %>%
    select(Name, AKA, Twitter)

        # merge data frames and present the results
org_list <- inner_join(psa_handles, org_handles)

accounts_db_org <- length(org_list$Twitter)
accounts_total_org <- length(usernames$following)
}

table_psa <- map_dfr(archivos, calculate_accounts)

However, now there is an error when Joining, by = "Twitter", it says subindex out of limits.

Links to 3 test files to put together in raw_data folder:

https://drive.google.com/file/d/1ilUHwLjgtZCzh0LneIJEhTryrGumDF1V/view?usp=sharing

https://drive.google.com/file/d/1KM3hRZ8DzgPMEsMFmwBdmMNHrPCttuaB/view?usp=sharing

https://drive.google.com/file/d/17cWXJ9ltGXZ6izkgJv0uyNwStrE95_OA/view?usp=sharing

Link to the master file to compare:

https://drive.google.com/file/d/11fOpYFFfHijhZl_CuWHKvkrI7edkpUNQ/view?usp=sharing

<<<<< UPDATE >>>>>>

I am trying to find the solution and I did the code work and provide a valide output (a 180x3 data frame), but the columns that should be filled with the values of the objects accounts_db_org and accounts_total_org are showing NA. When checking the value stored in those objects, the values are correct (for the last iteration). So the output now is in its right format, but with NA instead of numbers.

I am really close, but I am not being able to make the code to show the right numbers. My last attempt is:

library(rjson)
library(dplyr)
library(tidyverse)


archivos <- list.files("./raw_data", pattern = "json", full.names = TRUE)
psa_handles <- read_csv(file = "./raw_data/psa_handles.csv", show_col_types = FALSE) %>%
  select(Name, AKA, Twitter)

nr_archivos <- length(archivos)
psa_result <- matrix(nrow = nr_archivos, ncol = 3)

# loop for working with all files, one by one
for(f in 1:nr_archivos){

# load file
    json_data <- fromJSON(file = archivos[f])
    org_name <- json_data$id

# extract lists and the sublist
    usernames <- json_data$twitter
    following <- usernames$following

# empty vector
    longitud = length(following)
    names <- vector(length = longitud)

# loop to populate with the third element of each i item of the sublist
    for(i in 1:longitud){
        names[i] <- following[[i]][3]
    }

# convert the list into a data frame
    names_list <- data.frame(sapply(names, c))
    colnames(names_list) <- "usernames"

# applying some format prior to comparison
    org_handles <- data.frame(paste("@", names_list$usernames, sep=""))
    colnames(org_handles) <- "Twitter"
                                        
# merge tables and calculate the results for each iteration
    org_list <- inner_join(psa_handles, org_handles)
    accounts_db_org <- length(org_list$Twitter)
    accounts_total_org <- length(usernames$following)

# populate the matrix row by row
psa_result[f] <- c(org_name, accounts_db_org, accounts_total_org)
}

# create a data frame from the matrix and save the result
psa_result <- data.frame(psa_result)
write_csv(psa_result, file = "./outputs/cuentas_seguidas_en_psa.csv")

The subscript out of bounds error was caused by a json file with 0 records. That was fixed deleting the file.

Alvaro Morales · Answer 1 · 2022-01-22T12:57:04.487

You can do it with purrr::map or purrr::map_dfr.

Is this what you looking for?

archivos <- list.files("./raw_data", pattern = "json", full.names = TRUE)

# load master file and select the needed columns. This needs to be out of "calculate_accounts" because you only read it once. 
psa_handles <- read_csv(file = "./raw_data/psa_handles.csv") %>%
  select(Name, AKA, Twitter)

# calculate accounts
calculate_accounts <- function(archivo){
  
   json_data <- rjson::fromJSON(file = archivo)
  
  org_handles <- json_data %>%
    pluck("twitter", "following") %>%
    map_chr("username") %>% 
    as_tibble() %>% 
    rename(usernames = value) %>% 
    mutate(Twitter = str_c("@", usernames)) %>% 
    select(Twitter)
  
  org_list <- inner_join(psa_handles, org_handles)
  
  org_list %>% 
    mutate(accounts_db_org = length(Twitter),
           accounts_total_org = nrow(org_handles)) %>% 
    select(-Twitter)
}

table_psa <- map_dfr(archivos, calculate_accounts)

#output:
# A tibble: 53 x 4
   Name                                                                       AKA   accounts_db_org accounts_total_org
   <chr>                                                                      <chr>           <int>              <int>
 1 Association of American Medical Colleges                                   AAMC               20               2924
 2 American College of Cardiology                                             ACC                20               2924
 3 American Heart Association                                                 AHA                20               2924
 4 British Association of Dermatologists                                      BAD                20               2924
 5 Canadian Psoriasis Network                                                 CPN                20               2924
 6 Canadian Skin Patient Alliance                                             CSPA               20               2924
 7 European Academy of Dermatology and Venereology                            EADV               20               2924
 8 European Society for Dermatological Research                               ESDR               20               2924
 9 US Department of Health and Human Service                                  HHS                20               2924
10 International Alliance of Dermatology Patients Organisations (Global Skin) IADPO              20               2924
# ... with 43 more rows

I updated the code and provided links to test files, just in case you would like to try. Thanks! — Javi, Jan 20 '22 at 17:38
Thanks for your help, @alvaro-morales, but the results are not what I was expecting. What I need is a data frame with the numbers my code was providing at the end, hence the final lines with `length()` function. I added a screenshot to make the question clearer (note that the last two figures (those related to WHF) are real and that is the expected result). Also, the code works for the first dozen of files, and then stops with an `Error in following[[i]] : subindex out of limits`, but I guess this could be an issue with one of the files I should face later. — Javi, Jan 21 '22 at 08:50
I understand. I've been analyzing the code and the structure of the data and I will redo the code to make it simpler. — Alvaro Morales, Jan 21 '22 at 15:37
Sorry Álvaro, but the code is not working as expected. Finally I found the solution I needed, please look at my answer. — Javi, Jan 22 '22 at 17:57

Javi · Accepted Answer · 2022-01-24T17:24:33.867

Unfortunately, the answer provided by Álvaro does not work as expected, since the output repeats the same number with different organisation names, making it really difficult to read. Actually, the number 20 is repeated 20 times, the number 11, 11 times, and so on. The information is there, but it is not accessible without further data treatment.

I was doing my own research in the meantime and I got to the following code. Finally I made it to work, but the data format was "matrix" "array", really confusing. Fortunately, I wrote the last lines to transpose the data, unlist the array and convert in a matrix, which is able to be converted in a data frame and manipulated as usual.

Maybe my explanation is not very useful, and since I am a newbie, I am sure the code is far from being elegant and optimised. Anyway, please review the code below:

library(purrr)
library(rjson)
library(dplyr)
library(tidyverse)
setwd("~/documentos/varios/proyectos/programacion/R/psa_twitter")

                                        # Load data from files.
archivos <- list.files("./raw_data/json_files",
                       pattern = ".json",
                       full.names = TRUE)
psa_handles <- read_csv(file = "./raw_data/psa_handles.csv") %>%
    select(Name, AKA, Twitter)

nr_archivos <- length(archivos)

calcula_cuentas <- function(a){
                                        # Extract lists
    json_data <- fromJSON(file = a)
    org_aka <- json_data$id
    org_meta <- json_data$metadata
    org_name <- org_meta$company

    twitter <- json_data$twitter
    following <- twitter$following
                                    # create an empty vector to populate
    longitud = length(following)
    names <- vector(length = longitud)
    
# loop to populate the empty vector with third element of the sub-list
    for(i in 1:longitud){
        names[i] <- following[[i]][3]
    }
                            # create a data frame and change column name
    names_list <- data.frame(sapply(names, c))
    colnames(names_list) <- "usernames"

# Create a data frame with the correct formatting ready to comparison
    org_handles <- data.frame(paste("@",
                                    names_list$usernames,
                                    sep="")
                              )
    colnames(org_handles) <- "Twitter"
    
                                        # merge tables
    org_list <- inner_join(psa_handles, org_handles)
    cuentas_db_org <- length(org_list$Twitter)
    cuentas_total_org <- length(twitter$following)
    results <- data.frame(Name = org_name,
                            AKA = org_aka,
                            Cuentas_db = cuentas_db_org,
                            Total = cuentas_total_org)
    results
}

        # apply function to list of files and unlist the result
psa <- sapply(archivos, calcula_cuentas)
psa1 <- t(as.data.frame(psa))
psa2 <- matrix(unlist(psa1), ncol = 4) %>%
    as.data.frame()
colnames(psa2) <- c("Name", "AKA", "tw_int_outbound", "tw_ext_outbound")

         # Save the results.
saveRDS(psa2, file = "rda/psa.RDS")

Run a R script for all files in a directory, and store the outputs in one common data frame

2 Answers2