Below is the code I am currently using to summarize my data and it is working. My question is that I want to actually take the average of the "CE100" column vs the sum. How can I manipulate the below code to do this?
library(data.table, warn.conflicts = FALSE)
library(magrittr) ### MODIFIED
# library(lubridate, warn.conflicts = FALSE) ### MODIFIED
################
## PARAMETERS ##
################
# Set path of major source folder for raw transaction data
in_directory <- "C:/Users/NAME/Documents/Raw Data/"
# List names of sub-folders (currently grouped by first two characters of CUST_ID)
in_subfolders <- list("AA-CA", "CB-HZ", "IA-IL", "IM-KZ", "LA-MI", "MJ-MS",
"MT-NV", "NW-OH", "OI-PZ", "QA-TN", "TO-UZ",
"VA-WA", "WB-ZZ")
# Set location for output
out_directory <- "C:/Users/NAME/Documents/YTD Master/"
out_filename <- "OUTPUT.csv"
# Set beginning and end of date range to be collected - year-month-day format
date_range <- c("2018-01-01", "2018-06-30") ### MODIFIED
# Enable or disable filtering of raw files to only grab items bought within certain months to save space.
# If false, all files will be scanned for unique items, which will take longer and be a larger file.
# date_filter <- TRUE ### MODIFIED
##########
## CODE ##
##########
starttime <- Sys.time()
# create vector of filenames to be processed
in_filenames <- list.files(
file.path(in_directory, in_subfolders),
pattern = "\\.txt$",
full.names = TRUE,
recursive = TRUE)
# filter filenames, only
selected_in_filenames <-
seq(as.Date(date_range[1]),
as.Date(date_range[2]), by = "1 month") %>%
format("%Y-%m") %>%
lapply(function(x) stringr::str_subset(in_filenames, x)) %>%
unlist()
# read and aggregate each file separetely
mastertable <- rbindlist(
lapply(selected_in_filenames, function(fn) {
message("Processing file: ", fn)
temptable <- fread(fn,
colClasses = c(CUSTOMER_TIER = "character"),
na.strings = "")
{ # Add columns
print(paste0("Adding columns - ", subfolder, " (", j," of ", length(in_subfolders), ")"))
print(Sys.time()-starttime)
temptable[, ':='(CustPart = paste0(CUST_ID, INV_ITEM_ID))]}
# aggregate file but filtered for date_range
temptable[INVOICE_DT %between% date_range,
lapply(.SD, sum), by = .(CustPart, QTR = quarter(INVOICE_DT), YEAR = year(INVOICE_DT)),
.SDcols = c("Ext Sale", "CE100")]
})
)[
# second aggregation overall
, lapply(.SD, sum), by = .(CustPart, QTR, YEAR), .SDcols = c("Ext Sale", "CE100")]
# Save Final table
print("Saving master table")
fwrite(mastertable, file.path(out_directory, out_filename))
# rm(mastertable) ### MODIFIED
print(Sys.time()-starttime)
mastertable
I have included all my code to show how I read my data in. If any other details will be necessary like some sample data to work with let me know.