I am new to R programming and most of my experience thus far is with using highly structured rectangular data from a .csv or .xlsx. But now I've been handed about 30 spreadsheets of budget data that look like this:
And in order to work with them, I'd like to get them into a more friendly format (not exactly tidy b/c of the Q1 to Q4 could/should be a single variable -- but I can fix that later with pivot_longer), like this:
Searching SO, the closest problem/solution I found was this: R importing semi-unstructured data CSV, but that example contains a series of structured tables that do not require the modification mine does, plus, it is a text file converting to character vectors, and I have Excel workbooks with multiple worksheets (I only need 1 of the sheets).
Here's what I've tried so far:
library(tidyverse)
library(xlsx)
# Straight read in the worksheet as-is
df <- read_xlsx(path = "filename.xlsx", sheet = "worksheet", col_names = FALSE)
# Get the location name into its own column, then delete row 1 since it's not needed
df <- df %>%
mutate(location = df[[1,1]])
df <- df[-c(1),]
# Add a column and initialize it to "empty"
df <- df %>%
add_column(budget_type = "empty")
# Now loop through the dataframe in Column 1, search for the keyword(s) and place
# them in the last "budget_type" column
for (row in 1:nrow(df)){
print(df[[row,1]])
if (df[[row,1]] %in% c("Baseline","Scope Changes")){
budget_type <- df[[row,1]]
}
if (!is.na(df[[row,1]])){
if (str_detect(df[[row,1]], "[0-9]{4}") == TRUE){
df[[row, "budget_type"]] <- budget_type
}
}
}
# ...and from here I could write another loop going from bottom to top seeking
# the categories and placing them in another created column, and finally delete the rows
# that are empty, total rows, or unnecessary header rows.
My question is: Is there an obviously better way to do this in R in place of the loops and the general approach I'm describing to get my data in a tidy format?
Thank you in advance.
EDIT 6/7/2021:
It appears I cannot attach the Excel file, but if I'm following the "minimal reproducible example" guidelines correctly, here is the unprocessed data from dput() after reading in the data from Excel:
structure(list(...1 = c("Mehoopany", NA, "CLASS CODE", "Baseline", NA, "0201", "0300", "0301", NA, NA, "5500", "8245", "8260", NA, NA, "5710", "8224", "8235", NA, NA, NA, NA, "CLASS CODE", "Scope Changes", NA, "0201", "0300", "0301", NA, NA, "5500", "8245", "8260", NA, NA, "5710", "8224", "8235", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), ...2 = c(NA, NA, "Classification", NA, NA, "Cleaning, Spills, and Trash / Recycle Bin Pickup", "Specialized Cleaning", "Window Cleaning", "Cleaning", NA, "Gen Office Exp", "Wellness / Fitness Centers", "Meeting Rooms", "Employee Convenience", NA, "Reception", "Photographic Services", "Mail Room Services", "Mail Room & Reception", NA, "Total", NA, "Classification", NA, NA, "Cleaning, Spills, and Trash / Recycle Bin Pickup", "Specialized Cleaning", "Window Cleaning", "Cleaning", NA, "Gen Office Exp", "Wellness / Fitness Centers", "Meeting Rooms", "Employee Convenience", NA, "Reception", "Photographic Services", "Mail Room Services", "Mail Room & Reception", NA, "Total", NA, NA, NA, NA, NA, NA, NA, NA, NA), ...3 = c(NA, NA, "FY2021 Phasing", NA, "Q1", "1205", "0", "0", "1205", NA, "0", "0", "174", "174", NA, "0", "0", "1453.625", "1453.625", NA, "2832.625", NA, "FY2021 Phasing", NA, "Q1", "25", "0", "0", "25", NA, "0", "0", "37", "37", NA, "0", "17", "0", "17", NA, "79", NA, NA, NA, NA, NA, NA, NA, NA, NA), ...4 = c(NA, NA, NA, NA, "Q2", "1205", "0", "0", "1205", NA, "0", "0", "174", "174", NA, "0", "0", "1453.625", "1453.625", NA, "2832.625", NA, NA, NA, "Q2", "25", "0", "0", "25", NA, "0", "0", "37", "37", NA, "0", "17", "0", "17", NA, "79", NA, NA, NA, NA, NA, NA, NA, NA, NA), ...5 = c(NA, NA, NA, NA, "Q3", "1205", "0", "0", "1205", NA, "0", "0", "174", "174", NA, "0", "0", "1453.625", "1453.625", NA, "2832.625", NA, NA, NA, "Q3", "25", "0", "0", "25", NA, "0", "0", "37", "37", NA, "0", "17", "0", "17", NA, "79", NA, NA, NA, NA, NA, NA, NA, NA, NA), ...6 = c(NA, NA, NA, NA, "Q4", "1205", "0", "0", "1205", NA, "0", "0", "174", "174", NA, "0", "0", "1453.625", "1453.625", NA, "2832.625", NA, NA, NA, "Q4", "25", "0", "0", "25", NA, "0", "0", "37", "37", NA, "0", "17", "0", "17", NA, "79", NA, NA, NA, NA, NA, NA, NA, NA, NA), ...7 = c(NA, NA, NA, NA, "Total", "4820", "0", "0", "4820", NA, "0", "0", "696", "696", NA, "0", "0", "5814.5", "5814.5", NA, "11330.5", NA, NA, NA, "Total", "100", "0", "0", "100", NA, "0", "0", "148", "148", NA, "0", "68", "0", "68", NA, "316", NA, NA, NA, NA, NA, NA, NA, NA, NA)), row.names = c(NA, -50L), class = c("tbl_df", "tbl", "data.frame"))