1

I have a network matrix for each participant in separate excel sheets. However, these matrices are not formatted correctly as they are currently asymmetrical.

Can the additional rows and columns be removed to create symmetric matrices? I.e., here is how the data is right now:

df <- data.frame(
  Receiver = c("Receiver1", "Receiver2","Receiver3", "Receiver4",
              "Receiver5", "Receiver6","Receiver7", "Receiver8",
              "Receiver9", "Receiver10"),
  Sender1 = c(0,1,0,0,0,1,0,0,0,0),
  Sender2 = c(0,0,0,1,0,0,0,0,0,0),
  Sender3 = c(0,0,0,0,0,0,0,0,0,0),
  Sender4 = c(0,1,0,0,0,0,1,0,0,0),
  Sender5 = c(1,0,1,0,0,1,0,0,0,0),
  Sender6 = c(0,1,0,0,0,0,1,0,0,0),
  Sender7 = c(0,0,1,0,1,0,0,0,0,0),
  Sender8 = c(NA,NA,NA,NA,NA,NA,NA,NA,NA,NA),
  Sender9 = c(NA,NA,NA,NA,NA,NA,NA,NA,NA,NA),
  Sender10 = c(NA,NA,NA,NA,NA,NA,NA,NA,NA,NA)
)

This is what I am looking for:


df_clean <- data.frame(
  Receiver = c("Receiver1", "Receiver2","Receiver3", "Receiver4",
              "Receiver5", "Receiver6","Receiver7"),
  Sender1 = c(0,1,0,0,0,1,0),
  Sender2 = c(0,0,0,1,0,0,0),
  Sender3 = c(0,0,0,0,0,0,0),
  Sender4 = c(0,1,0,0,0,0,1),
  Sender5 = c(1,0,1,0,0,1,0),
  Sender6 = c(0,1,0,0,0,0,1),
  Sender7 = c(0,0,1,0,1,0,0))
  1. Is there a for loop I can create so that it does this for every participant in each separate excel sheet I import into R?

I have 200 participants to do this for, so I am looking for an automated method as opposed to doing this one by one.

Thank you in advance!

Paige Kemp
  • 13
  • 3

2 Answers2

1
df1 <- Filter(\(x)!all(is.na(x)), df)
subset(df1, Receiver %in%sub("Sender", "Receiver", names(df1)[-1]))

   Receiver Sender1 Sender2 Sender3 Sender4 Sender5 Sender6 Sender7
1 Receiver1       0       0       0       0       1       0       0
2 Receiver2       1       0       0       1       0       1       0
3 Receiver3       0       0       0       0       1       0       1
4 Receiver4       0       1       0       0       0       0       0
5 Receiver5       0       0       0       0       0       0       1
6 Receiver6       1       0       0       0       1       0       0
7 Receiver7       0       0       0       1       0       1       0
Onyambu
  • 67,392
  • 3
  • 24
  • 53
0

If I've understood correctly, there are many possible approaches to this problem (e.g. some options for removing 'all NA' columns here: Remove columns from dataframe where ALL values are NA).

One potential solution using functions from the dplyr package is:

library(dplyr)
df_clean <- df %>%
  select(where(~any(!is.na(.)))) %>%
  slice_head(n = ncol(.) - 1)

And for the loop, you could make a list of dataframes and cycle through them, e.g.

Example data:

df1 <- data.frame(
  Receiver = c("Receiver1", "Receiver2","Receiver3", "Receiver4",
               "Receiver5", "Receiver6","Receiver7", "Receiver8",
               "Receiver9", "Receiver10"),
  Sender1 = c(0,1,0,0,0,1,0,0,0,0),
  Sender2 = c(0,0,0,1,0,0,0,0,0,0),
  Sender3 = c(0,0,0,0,0,0,0,0,0,0),
  Sender4 = c(0,1,0,0,0,0,1,0,0,0),
  Sender5 = c(1,0,1,0,0,1,0,0,0,0),
  Sender6 = c(0,1,0,0,0,0,1,0,0,0),
  Sender7 = c(0,0,1,0,1,0,0,0,0,0),
  Sender8 = c(NA,NA,NA,NA,NA,NA,NA,NA,NA,NA),
  Sender9 = c(NA,NA,NA,NA,NA,NA,NA,NA,NA,NA),
  Sender10 = c(NA,NA,NA,NA,NA,NA,NA,NA,NA,NA)
)

# make df2 different to df1 (add more NAs)
df2 <- data.frame(
  Receiver = c("Receiver1", "Receiver2","Receiver3", "Receiver4",
               "Receiver5", "Receiver6","Receiver7", "Receiver8",
               "Receiver9", "Receiver10"),
  Sender1 = c(0,1,0,0,0,1,0,0,0,NA),
  Sender2 = c(0,0,0,1,0,0,0,0,0,NA),
  Sender3 = c(0,0,0,0,0,0,0,0,0,NA),
  Sender4 = c(0,1,0,0,0,0,1,0,0,NA),
  Sender5 = c(1,0,1,0,0,1,0,0,0,NA),
  Sender6 = c(0,1,0,0,0,0,1,0,0,NA),
  Sender7 = c(NA,NA,NA,NA,NA,NA,NA,NA,NA,NA),
  Sender8 = c(NA,NA,NA,NA,NA,NA,NA,NA,NA,NA),
  Sender9 = c(NA,NA,NA,NA,NA,NA,NA,NA,NA,NA),
  Sender10 = c(NA,NA,NA,NA,NA,NA,NA,NA,NA,NA)
)

Example for-loop code:

library(dplyr, warn = FALSE)

# Collate all dataframes you want to 'clean' into a list
list_of_dfs <- mget(grep("^df.*", names(which(unlist(eapply(.GlobalEnv,is.data.frame)))), 
                         value = TRUE))

# create a list to store the results
clean_dfs <- list()

# run the for-loop
for (i in seq_along(list_of_dfs)) {
  clean_dfs[[i]] <- list_of_dfs[[i]] %>%
    select(where(~any(!is.na(.)))) %>%
    slice_head(n = ncol(.) - 1)
}

# Name each cleaned dataframe with the prefix "clean_"
names(clean_dfs) <- paste0("clean_", names(list_of_dfs))

# Export them to your work space
list2env(clean_dfs, envir = .GlobalEnv)
#> <environment: R_GlobalEnv>

# View them
print(clean_df1)
#>    Receiver Sender1 Sender2 Sender3 Sender4 Sender5 Sender6 Sender7
#> 1 Receiver1       0       0       0       0       1       0       0
#> 2 Receiver2       1       0       0       1       0       1       0
#> 3 Receiver3       0       0       0       0       1       0       1
#> 4 Receiver4       0       1       0       0       0       0       0
#> 5 Receiver5       0       0       0       0       0       0       1
#> 6 Receiver6       1       0       0       0       1       0       0
#> 7 Receiver7       0       0       0       1       0       1       0
print(clean_df2)
#>    Receiver Sender1 Sender2 Sender3 Sender4 Sender5 Sender6
#> 1 Receiver1       0       0       0       0       1       0
#> 2 Receiver2       1       0       0       1       0       1
#> 3 Receiver3       0       0       0       0       1       0
#> 4 Receiver4       0       1       0       0       0       0
#> 5 Receiver5       0       0       0       0       0       0
#> 6 Receiver6       1       0       0       0       1       0

Created on 2023-07-28 with reprex v2.0.2

Edit

I just re-read your question and saw you wanted to do this 200 times; if you have 200 individual xlsx files (1 sheet per file) you could read in each sheet, 'clean' it, then write it as a csv file:

library(tidyverse)
library(readxl)
library(fs)

path <- dir_ls(path = "~/Desktop", glob = "*.xlsx")
list_of_dfs <- map(path, read_excel)

for (i in seq_along(list_of_dfs)) {
  list_of_dfs[[i]] %>%
    select(where(~any(!. == "NA"))) %>%
    slice_head(n = ncol(.) - 1) %>%
    write_csv(., paste0(names(list_of_dfs)[[i]], ".csv"))
}

dir_ls(path = "~/Desktop/", glob = "*.xlsx.csv")
#> /Users/jared/Desktop/df1.xlsx.csv /Users/jared/Desktop/df2.xlsx.csv

Created on 2023-07-28 with reprex v2.0.2

Not sure if this helps with your actual use-case, but wanted to include it anyway, just in case it's useful to you. Good luck!

jared_mamrot
  • 22,354
  • 4
  • 21
  • 46