I am trying to summarize by a certain column but am having issues with it not returning distinct values.
Below I have provided how I read in the data and the code I am using:
library(data.table, warn.conflicts = FALSE)
library(lubridate, warn.conflicts = FALSE)
################
## PARAMETERS ##
################
# Set path of major source folder for raw transaction data
in_directory <- "C:/Users/NAME/Documents/Raw Data/"
# List names of sub-folders (currently grouped by first two characters of CUST_ID)
in_subfolders <- list("AA-CA", "CB-HZ", "IA-IL", "IM-KZ", "LA-MI", "MJ-MS",
"MT-NV", "NW-OH", "OI-PZ", "QA-TN", "TO-UZ",
"VA-WA", "WB-ZZ")
# Set location for output
out_directory <- "C:/Users/NAME/Documents/YTD Master/"
out_filename <- "TEST2.csv"
# Set beginning and end of date range to be collected - year-month-day format
date_range <- interval(as.Date("2018-01-01"), as.Date("2018-05-31"))
# Enable or disable filtering of raw files to only grab items bought within certain months to save space.
# If false, all files will be scanned for unique items, which will take longer and be a larger file.
date_filter <- TRUE
##########
## CODE ##
##########
starttime <- Sys.time()
mastertable <- NULL
for (j in 1:length(in_subfolders)) {
subfolder <- in_subfolders[j]
sub_directory <- paste0(in_directory, subfolder, "/")
## IMPORT DATA
in_filenames <- dir(sub_directory, pattern =".txt")
for (i in 1:length(in_filenames)) {
# Default value provided for when fast filtering is disabled.
read_this_file <- TRUE
# To fast filter the data, we choose to include or exclude an entire file based on the date of its first line.
# WARNING: This is only a valid method if filtering by entire months, since that is the amount of data housed in each file.
if (date_filter) {
temptable <- fread(paste0(sub_directory, in_filenames[i]), colClasses=c(CUSTOMER_TIER = "character"),
na.strings = "", nrows = 1)
temptable[, INVOICE_DT := as.Date(INVOICE_DT)]
# If date matches, set read flag to TRUE. If date does not match, set read flag to FALSE.
read_this_file <- temptable[, INVOICE_DT] %within% date_range
}
if (read_this_file) {
print(Sys.time()-starttime)
print(paste0("Reading in ", in_filenames[i]))
temptable <- fread(paste0(sub_directory, in_filenames[i]), colClasses=c(CUSTOMER_TIER = "character"),
na.strings = "")
temptable <- temptable[, lapply(.SD, sum), by = INV_ITEM_ID,
.SDcols = c("Ext Sale")]
# Combine into full list
mastertable <- rbindlist(list(mastertable, temptable), use.names = TRUE)
# Release unneeded memory
rm(temptable)
}
}
}
# Save Final table
print("Saving master table")
fwrite(mastertable, paste0(out_directory, out_filename))
rm(mastertable)
print(Sys.time()-starttime)
The below code which is included above is the main focus I believe
temptable <- temptable[, lapply(.SD, sum), by = INV_ITEM_ID,
.SDcols = c("Ext Sale")]
After running this i expect all sales to be combined by INV_ITEM_ID.
The problem is that it gives me about 5 million rows with only about 900,000 distinct values.
The sales column is not summarizing like I want it to.
Below is a subset of the Data I am using.
dput(head(datatable,30))
structure(list(INVOICE_DT = c("2016-01-29", "2016-01-29", "2016-01-29",
"2016-01-29", "2016-01-29", "2016-01-29", "2016-01-29", "2016-01-29",
"2016-01-29", "2016-01-29", "2016-01-29", "2016-01-29", "2016-01-29",
"2016-01-29", "2016-01-29", "2016-01-29", "2016-01-29", "2016-01-29",
"2016-01-29", "2016-01-29", "2016-01-29", "2016-01-29", "2016-01-29",
"2016-01-29", "2016-01-29", "2016-01-29", "2016-01-29", "2016-01-29",
"2016-01-29", "2016-01-29"), BRANCH_CODE = c("AK001", "AK001",
"AK001", "AK001", "AK001", "AK001", "AK001", "AK001", "AK001",
"AK001", "AK001", "AK001", "AK001", "AK001", "AK001", "AK001",
"AK001", "AK001", "AK001", "AK001", "AK001", "AK001", "AK001",
"AK001", "AK001", "AK001", "AK001", "AK001", "AK001", "AK001"
), INVOICE_NO = c("AK0015522", "AK0015529", "AK0015515", "AK0015529",
"AK0015514", "AK0015513", "AK0015519", "AK0015519", "AK0015520",
"AK0015520", "AK0015520", "AK0015522", "AK0015520", "AK0015520",
"AK0015520", "AK0015518", "AK0015525", "AK0015520", "AK0015519",
"AK0015520", "AK0015525", "AK0015518", "AK0015529", "AK0015519",
"AK0015525", "AK0015519", "AK0015519", "AK0015522", "AK0015522",
"AK0015529"), INV_SEQ_NO = c(6L, 6L, 3L, 3L, 3L, 3L, 148L, 149L,
91L, 45L, 162L, 34L, 125L, 39L, 159L, 56L, 64L, 127L, 160L, 93L,
45L, 34L, 5L, 138L, 46L, 38L, 161L, 35L, 28L, 4L), INV_ITEM_ID = c("11239190",
"11239190", "55692", "400050-131280", "0700535", "0800758", "69970",
"0445050", "0819092", "0115160", "0115160", "0703316", "55692",
"0115157", "0115157", "0115157", "440704-131280", "55618", "0703318",
"0205428", "0702119", "0115157", "465278", "465278", "0702121",
"15166", "0703315", "0703315", "0715718", "0243945"), ITEM_DESCR = c("Au-Ve-Co# 20898",
"Au-Ve-Co# 20898", "MM HSS T TAP 16X1.5", "3/16\"\"\"\"H05703-100R Hs",
"18/2 BLK SOOW", "3x1x1/4 180G FW", "BF SPEC SER HV1-6C", "1/4MP External Seat",
"1-1/2xNHConditonDisc", "HCS 7/16-14x1 3/4 YZ8", "HCS 7/16-14x1 3/4 YZ8",
"25AMP MINI AUTO FUSE", "MM HSS T TAP 16X1.5", "HCS 7/16-14x1.25 YZ8",
"HCS 7/16-14x1.25 YZ8", "HCS 7/16-14x1.25 YZ8", "BPOLY MELBW 1/4TX1/8",
"MM HSS P TAP 5 X 0.8", "MAXI-BLADE 20 AMP", "1-1/2\"\"\"\" T3 VF SURF",
"16ga Red Wire/PPF", "HCS 7/16-14x1.25 YZ8", "80I 45 ELBOW 1/4 BLK",
"80I 45 ELBOW 1/4 BLK", "16ga Blk Wire/PPF", "HCS 7/16-14x3.25 YZ8",
"20AMP MINI AUTO FUSE", "20AMP MINI AUTO FUSE", "3/0-3/8\"\"\"\"HvyWalCprLug",
"97053 Pink Marker"), STD_ITEM = c("Y", "Y", "Y", "Y", "Y", "Y",
"Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y",
"Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y"), PRIVATE_LABEL = c("N",
"N", "Y", "N", "N", "N", "N", "N", "N", "N", "N", "N", "Y", "N",
"N", "N", "N", "Y", "N", "N", "N", "N", "Y", "Y", "N", "N", "N",
"N", "N", "N"), CATEGORY_PATH1 = c("08.5-Fleet & Automotive",
"08.5-Fleet & Automotive", "07-Cutting Tools & Metalworking",
"08-Hydraulics & Pneumatics", "09-Electrical", "05-Abrasives",
"08-Hydraulics & Pneumatics", "07.3-Plumbing", "05-Abrasives",
"01-Fasteners", "01-Fasteners", "08.5-Fleet & Automotive", "07-Cutting Tools & Metalworking",
"01-Fasteners", "01-Fasteners", "01-Fasteners", "08-Hydraulics & Pneumatics",
"07-Cutting Tools & Metalworking", "08.5-Fleet & Automotive",
"05-Abrasives", "08.5-Fleet & Automotive", "01-Fasteners", "07.3-Plumbing",
"07.3-Plumbing", "08.5-Fleet & Automotive", "01-Fasteners", "08.5-Fleet & Automotive",
"08.5-Fleet & Automotive", "08.5-Fleet & Automotive", "11-Chemicals & Paints"
), CATEGORY_PATH2 = c("03-Batteries & Accessories", "03-Batteries & Accessories",
"04-Threading and Tapping", "02-Hose and Hose Reels", "05-Electrical Wire and Accessories",
"03-Coated and Non-Woven Abrasives", "07-Valves & Accessories",
"04-Valves & Valve Accessories", "03-Coated and Non-Woven Abrasives",
"01-Bolts", "01-Bolts", "02-Fleet Electrical Products", "04-Threading and Tapping",
"01-Bolts", "01-Bolts", "01-Bolts", "01-Fittings", "04-Threading and Tapping",
"02-Fleet Electrical Products", "03-Coated and Non-Woven Abrasives",
"02-Fleet Electrical Products", "01-Bolts", "01-Pipe Fittings",
"01-Pipe Fittings", "02-Fleet Electrical Products", "01-Bolts",
"02-Fleet Electrical Products", "02-Fleet Electrical Products",
"02-Fleet Electrical Products", "11-Paints, Marking, and Accessories"
), CUST_ID = c("AK0010001", "AK0010001", "AK0010001", "AK0010001",
"AK0010001", "AK0010001", "AK0010001", "AK0010001", "AK0010001",
"AK0010001", "AK0010001", "AK0010001", "AK0010001", "AK0010001",
"AK0010001", "AK0010001", "AK0010001", "AK0010001", "AK0010001",
"AK0010001", "AK0010001", "AK0010001", "AK0010001", "AK0010001",
"AK0010001", "AK0010001", "AK0010001", "AK0010001", "AK0010001",
"AK0010001"), CUSTOMER_TIER = c("Tier 3", "Tier 3", "Tier 3",
"Tier 3", "Tier 3", "Tier 3", "Tier 3", "Tier 3", "Tier 3", "Tier 3",
"Tier 3", "Tier 3", "Tier 3", "Tier 3", "Tier 3", "Tier 3", "Tier 3",
"Tier 3", "Tier 3", "Tier 3", "Tier 3", "Tier 3", "Tier 3", "Tier 3",
"Tier 3", "Tier 3", "Tier 3", "Tier 3", "Tier 3", "Tier 3"),
IS_VENDING = c("No", "No", "No", "No", "No", "No", "No",
"No", "No", "No", "No", "No", "No", "No", "No", "No", "No",
"No", "No", "No", "No", "No", "No", "No", "No", "No", "No",
"No", "No", "No"), SALE_PRICE = c(10.53, 10.53, 23.0175,
2.0251, 0.4755, 6.4725, 15.8235, 1.468, 1.0275, 0.3425, 0.3425,
0.3726, 23.0175, 0.2513, 0.2513, 0.2513, 2.7412, 5.7675,
2.226, 1.5579, 0.1418, 0.2513, 12.0835, 12.0835, 0.1297,
0.625, 0.3726, 0.3726, 3.514, 4.1112), TOTAL_COST = c(6.877449,
6.877449, 9.399562, 0.924873, 0.245128, 2.953287, 6.227787,
0.703494, 0.52035, 0.083454, 0.083454, 0.134285, 9.399562,
0.062686, 0.062686, 0.062686, 1.3493, 2.386274, 0.77503,
0.524996, 0.051595, 0.062686, 4.47782, 4.47782, 0.051628,
0.164998, 0.125893, 0.125893, 2.183983, 1.469989), POS_COST = c(6.877449,
6.877449, 9.399562, 0.924873, 0.245128, 2.953287, 6.227787,
0.703494, 0.52035, 0.083454, 0.083454, 0.134285, 9.399562,
0.062686, 0.062686, 0.062686, 1.3493, 2.386274, 0.77503,
0.524996, 0.051595, 0.062686, 4.47782, 4.47782, 0.051628,
0.164998, 0.125893, 0.125893, 2.183983, 1.469989), CE100 = c(6.55,
6.55, 9.25, 0.95, 0.23785, 2.81, 5.93127, 1.17, 0.508, 0.074946,
0.074946, 0.12, 9.25, 0.058626, 0.058626, 0.058626, 1.31,
2.35, 0.61, 0.5, 0.05172, 0.058626, 4.2844, 4.2844, 0.05172,
0.12255, 0.12, 0.12, 2.08, 1.43), CE110 = c(NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_), CE120 = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), CE200 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0), CORP_PRICE = c(NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_), QTY_SOLD = c(15L,
10L, 6L, 50L, 250L, 10L, 10L, 20L, 50L, 25L, 25L, 50L, 6L,
25L, 25L, 25L, 10L, 6L, 10L, 30L, 100L, 25L, 15L, 10L, 100L,
20L, 50L, 50L, 4L, 36L), PACKSLIP_WHSL = c(NA, NA, 30.69,
2.63, 0.7315, 8.63, 20.55, 1.96, 1.37, 0.685, 0.685, 0.5323,
30.69, 0.5025, 0.5025, 0.5025, 3.56, 7.69, 3.18, 1.82, 0.2145,
0.5025, 18.59, 18.59, 0.1961, 1.25, 0.5323, 0.5323, 5.02,
5.71), PRICING_GROUP = c("PGB", "PGB", "PGB", "PGB", "PGB",
"PGB", "PGG", "PGB", "PGB", "PGB", "PGB", "PGB", "PGB", "PGB",
"PGB", "PGB", "PGB", "PGB", "PGB", "PGB", "PGB", "PGB", "PGB",
"PGB", "PGB", "PGB", "PGB", "PGB", "PGB", "PGB"), PGG_MIN_PRICE = c(257.9043375,
171.936225, 184.14, 131.5, 175.56, 86.3, 154.125, 39.2, 68.5,
15.07, 15.07, 26.615, 184.14, 11.055, 11.055, 11.055, 35.6,
46.14, 31.8, 54.6, 21.45, 11.055, 253.7535, 185.9, 19.61,
22, 26.615, 26.615, 20.08, 205.56), PGY_MIN_PRICE = c(206.32347,
137.54898, 171.2502, 128.87, 170.07375, 80.259, 147.96, 36.456,
63.705, 13.3575, 13.3575, 24.75195, 171.2502, 9.79875, 9.79875,
9.79875, 33.108, 45.2172, 29.574, 50.778, 19.9485, 9.79875,
245.388, 163.592, 18.2373, 19.5, 24.75195, 24.75195, 18.6744,
191.1708), PGR_MIN_PRICE = c(171.936225, 114.62415, 162.0432,
124.925, 166.41625, 75.944, 141.795, 35.672, 60.28, 9.9325,
9.9325, 24.21965, 162.0432, 7.28625, 7.28625, 7.28625, 32.396,
43.833, 28.938, 48.048, 19.5195, 7.28625, 239.811, 159.874,
17.8451, 14.5, 24.21965, 24.21965, 18.2728, 180.8928), `Ext Sale` = c(157.95,
105.3, 138.105, 101.255, 118.875, 64.725, 158.235, 29.36,
51.375, 8.5625, 8.5625, 18.63, 138.105, 6.2825, 6.2825, 6.2825,
27.412, 34.605, 22.26, 46.737, 14.18, 6.2825, 181.2525, 120.835,
12.97, 12.5, 18.63, 18.63, 14.056, 148.0032), `Ext Total Cost` = c(103.161735,
68.77449, 56.397372, 46.24365, 61.282, 29.53287, 62.27787,
14.06988, 26.0175, 2.08635, 2.08635, 6.71425, 56.397372,
1.56715, 1.56715, 1.56715, 13.493, 14.317644, 7.7503, 15.74988,
5.1595, 1.56715, 67.1673, 44.7782, 5.1628, 3.29996, 6.29465,
6.29465, 8.735932, 52.919604), cust_part = c("AK001000111239190",
"AK001000111239190", "AK001000155692", "AK0010001400050-131280",
"AK00100010700535", "AK00100010800758", "AK001000169970",
"AK00100010445050", "AK00100010819092", "AK00100010115160",
"AK00100010115160", "AK00100010703316", "AK001000155692",
"AK00100010115157", "AK00100010115157", "AK00100010115157",
"AK0010001440704-131280", "AK001000155618", "AK00100010703318",
"AK00100010205428", "AK00100010702119", "AK00100010115157",
"AK0010001465278", "AK0010001465278", "AK00100010702121",
"AK001000115166", "AK00100010703315", "AK00100010703315",
"AK00100010715718", "AK00100010243945")), .Names = c("INVOICE_DT",
"BRANCH_CODE", "INVOICE_NO", "INV_SEQ_NO", "INV_ITEM_ID", "ITEM_DESCR",
"STD_ITEM", "PRIVATE_LABEL", "CATEGORY_PATH1", "CATEGORY_PATH2",
"CUST_ID", "CUSTOMER_TIER", "IS_VENDING", "SALE_PRICE", "TOTAL_COST",
"POS_COST", "CE100", "CE110", "CE120", "CE200", "CORP_PRICE",
"QTY_SOLD", "PACKSLIP_WHSL", "PRICING_GROUP", "PGG_MIN_PRICE",
"PGY_MIN_PRICE", "PGR_MIN_PRICE", "Ext Sale", "Ext Total Cost",
"cust_part"), .internal.selfref = <pointer: (nil)>, row.names = c(NA,
30L), class = c("data.table", "data.frame"))