0

I am trying to summarize by a certain column but am having issues with it not returning distinct values.

Below I have provided how I read in the data and the code I am using:

library(data.table, warn.conflicts = FALSE)
library(lubridate, warn.conflicts = FALSE)

################
## PARAMETERS ##
################

# Set path of major source folder for raw transaction data
in_directory <- "C:/Users/NAME/Documents/Raw Data/"

# List names of sub-folders (currently grouped by first two characters of CUST_ID)
in_subfolders <- list("AA-CA", "CB-HZ", "IA-IL", "IM-KZ", "LA-MI", "MJ-MS",
                      "MT-NV", "NW-OH", "OI-PZ", "QA-TN", "TO-UZ",
                      "VA-WA", "WB-ZZ")

# Set location for output
out_directory <- "C:/Users/NAME/Documents/YTD Master/"
out_filename <- "TEST2.csv"

# Set beginning and end of date range to be collected - year-month-day format
date_range <- interval(as.Date("2018-01-01"), as.Date("2018-05-31"))

# Enable or disable filtering of raw files to only grab items bought within certain months to save space.
# If false, all files will be scanned for unique items, which will take longer and be a larger file.
date_filter <- TRUE


##########
## CODE ##
##########

starttime <- Sys.time()
mastertable <- NULL

for (j in 1:length(in_subfolders)) {
  subfolder <- in_subfolders[j]
  sub_directory <- paste0(in_directory, subfolder, "/")

  ## IMPORT DATA
  in_filenames <- dir(sub_directory, pattern =".txt")

  for (i in 1:length(in_filenames)) {

    # Default value provided for when fast filtering is disabled.
    read_this_file <- TRUE

    # To fast filter the data, we choose to include or exclude an entire file based on the date of its first line.
    # WARNING: This is only a valid method if filtering by entire months, since that is the amount of data housed in each file.
    if (date_filter) {
      temptable <- fread(paste0(sub_directory, in_filenames[i]), colClasses=c(CUSTOMER_TIER = "character"),
                         na.strings = "", nrows = 1)
      temptable[, INVOICE_DT := as.Date(INVOICE_DT)]

      # If date matches, set read flag to TRUE.  If date does not match, set read flag to FALSE.
      read_this_file <- temptable[, INVOICE_DT] %within% date_range
    }


    if (read_this_file) {
      print(Sys.time()-starttime)
      print(paste0("Reading in ", in_filenames[i]))
      temptable <- fread(paste0(sub_directory, in_filenames[i]), colClasses=c(CUSTOMER_TIER = "character"),
                         na.strings = "")


          temptable <- temptable[, lapply(.SD, sum), by = INV_ITEM_ID,
                             .SDcols = c("Ext Sale")]

      # Combine into full list
      mastertable <- rbindlist(list(mastertable, temptable), use.names = TRUE)
      # Release unneeded memory
      rm(temptable)


    }

  }

}

# Save Final table
print("Saving master table")
fwrite(mastertable, paste0(out_directory, out_filename))
rm(mastertable)

print(Sys.time()-starttime)

The below code which is included above is the main focus I believe

 temptable <- temptable[, lapply(.SD, sum), by = INV_ITEM_ID,
                         .SDcols = c("Ext Sale")]

After running this i expect all sales to be combined by INV_ITEM_ID.

The problem is that it gives me about 5 million rows with only about 900,000 distinct values.

The sales column is not summarizing like I want it to.

Below is a subset of the Data I am using.

dput(head(datatable,30))

structure(list(INVOICE_DT = c("2016-01-29", "2016-01-29", "2016-01-29", 
"2016-01-29", "2016-01-29", "2016-01-29", "2016-01-29", "2016-01-29", 
"2016-01-29", "2016-01-29", "2016-01-29", "2016-01-29", "2016-01-29", 
"2016-01-29", "2016-01-29", "2016-01-29", "2016-01-29", "2016-01-29", 
"2016-01-29", "2016-01-29", "2016-01-29", "2016-01-29", "2016-01-29", 
"2016-01-29", "2016-01-29", "2016-01-29", "2016-01-29", "2016-01-29", 
"2016-01-29", "2016-01-29"), BRANCH_CODE = c("AK001", "AK001", 
"AK001", "AK001", "AK001", "AK001", "AK001", "AK001", "AK001", 
"AK001", "AK001", "AK001", "AK001", "AK001", "AK001", "AK001", 
"AK001", "AK001", "AK001", "AK001", "AK001", "AK001", "AK001", 
"AK001", "AK001", "AK001", "AK001", "AK001", "AK001", "AK001"
), INVOICE_NO = c("AK0015522", "AK0015529", "AK0015515", "AK0015529", 
"AK0015514", "AK0015513", "AK0015519", "AK0015519", "AK0015520", 
"AK0015520", "AK0015520", "AK0015522", "AK0015520", "AK0015520", 
"AK0015520", "AK0015518", "AK0015525", "AK0015520", "AK0015519", 
"AK0015520", "AK0015525", "AK0015518", "AK0015529", "AK0015519", 
"AK0015525", "AK0015519", "AK0015519", "AK0015522", "AK0015522", 
"AK0015529"), INV_SEQ_NO = c(6L, 6L, 3L, 3L, 3L, 3L, 148L, 149L, 
91L, 45L, 162L, 34L, 125L, 39L, 159L, 56L, 64L, 127L, 160L, 93L, 
45L, 34L, 5L, 138L, 46L, 38L, 161L, 35L, 28L, 4L), INV_ITEM_ID = c("11239190", 
"11239190", "55692", "400050-131280", "0700535", "0800758", "69970", 
"0445050", "0819092", "0115160", "0115160", "0703316", "55692", 
"0115157", "0115157", "0115157", "440704-131280", "55618", "0703318", 
"0205428", "0702119", "0115157", "465278", "465278", "0702121", 
"15166", "0703315", "0703315", "0715718", "0243945"), ITEM_DESCR = c("Au-Ve-Co# 20898", 
"Au-Ve-Co# 20898", "MM HSS T TAP 16X1.5", "3/16\"\"\"\"H05703-100R Hs", 
"18/2 BLK SOOW", "3x1x1/4 180G FW", "BF SPEC SER HV1-6C", "1/4MP External Seat", 
"1-1/2xNHConditonDisc", "HCS 7/16-14x1 3/4 YZ8", "HCS 7/16-14x1 3/4 YZ8", 
"25AMP MINI AUTO FUSE", "MM HSS T TAP 16X1.5", "HCS 7/16-14x1.25 YZ8", 
"HCS 7/16-14x1.25 YZ8", "HCS 7/16-14x1.25 YZ8", "BPOLY MELBW 1/4TX1/8", 
"MM HSS P TAP 5 X 0.8", "MAXI-BLADE 20 AMP", "1-1/2\"\"\"\" T3 VF SURF", 
"16ga Red Wire/PPF", "HCS 7/16-14x1.25 YZ8", "80I 45 ELBOW 1/4 BLK", 
"80I 45 ELBOW 1/4 BLK", "16ga Blk Wire/PPF", "HCS 7/16-14x3.25 YZ8", 
"20AMP MINI AUTO FUSE", "20AMP MINI AUTO FUSE", "3/0-3/8\"\"\"\"HvyWalCprLug", 
"97053 Pink Marker"), STD_ITEM = c("Y", "Y", "Y", "Y", "Y", "Y", 
"Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", 
"Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y"), PRIVATE_LABEL = c("N", 
"N", "Y", "N", "N", "N", "N", "N", "N", "N", "N", "N", "Y", "N", 
"N", "N", "N", "Y", "N", "N", "N", "N", "Y", "Y", "N", "N", "N", 
"N", "N", "N"), CATEGORY_PATH1 = c("08.5-Fleet & Automotive", 
"08.5-Fleet & Automotive", "07-Cutting Tools & Metalworking", 
"08-Hydraulics & Pneumatics", "09-Electrical", "05-Abrasives", 
"08-Hydraulics & Pneumatics", "07.3-Plumbing", "05-Abrasives", 
"01-Fasteners", "01-Fasteners", "08.5-Fleet & Automotive", "07-Cutting Tools & Metalworking", 
"01-Fasteners", "01-Fasteners", "01-Fasteners", "08-Hydraulics & Pneumatics", 
"07-Cutting Tools & Metalworking", "08.5-Fleet & Automotive", 
"05-Abrasives", "08.5-Fleet & Automotive", "01-Fasteners", "07.3-Plumbing", 
"07.3-Plumbing", "08.5-Fleet & Automotive", "01-Fasteners", "08.5-Fleet & Automotive", 
"08.5-Fleet & Automotive", "08.5-Fleet & Automotive", "11-Chemicals & Paints"
), CATEGORY_PATH2 = c("03-Batteries & Accessories", "03-Batteries & Accessories", 
"04-Threading and Tapping", "02-Hose and Hose Reels", "05-Electrical Wire and Accessories", 
"03-Coated and Non-Woven Abrasives", "07-Valves & Accessories", 
"04-Valves & Valve Accessories", "03-Coated and Non-Woven Abrasives", 
"01-Bolts", "01-Bolts", "02-Fleet Electrical Products", "04-Threading and Tapping", 
"01-Bolts", "01-Bolts", "01-Bolts", "01-Fittings", "04-Threading and Tapping", 
"02-Fleet Electrical Products", "03-Coated and Non-Woven Abrasives", 
"02-Fleet Electrical Products", "01-Bolts", "01-Pipe Fittings", 
"01-Pipe Fittings", "02-Fleet Electrical Products", "01-Bolts", 
"02-Fleet Electrical Products", "02-Fleet Electrical Products", 
"02-Fleet Electrical Products", "11-Paints, Marking, and Accessories"
), CUST_ID = c("AK0010001", "AK0010001", "AK0010001", "AK0010001", 
"AK0010001", "AK0010001", "AK0010001", "AK0010001", "AK0010001", 
"AK0010001", "AK0010001", "AK0010001", "AK0010001", "AK0010001", 
"AK0010001", "AK0010001", "AK0010001", "AK0010001", "AK0010001", 
"AK0010001", "AK0010001", "AK0010001", "AK0010001", "AK0010001", 
"AK0010001", "AK0010001", "AK0010001", "AK0010001", "AK0010001", 
"AK0010001"), CUSTOMER_TIER = c("Tier 3", "Tier 3", "Tier 3", 
"Tier 3", "Tier 3", "Tier 3", "Tier 3", "Tier 3", "Tier 3", "Tier 3", 
"Tier 3", "Tier 3", "Tier 3", "Tier 3", "Tier 3", "Tier 3", "Tier 3", 
"Tier 3", "Tier 3", "Tier 3", "Tier 3", "Tier 3", "Tier 3", "Tier 3", 
"Tier 3", "Tier 3", "Tier 3", "Tier 3", "Tier 3", "Tier 3"), 
    IS_VENDING = c("No", "No", "No", "No", "No", "No", "No", 
    "No", "No", "No", "No", "No", "No", "No", "No", "No", "No", 
    "No", "No", "No", "No", "No", "No", "No", "No", "No", "No", 
    "No", "No", "No"), SALE_PRICE = c(10.53, 10.53, 23.0175, 
    2.0251, 0.4755, 6.4725, 15.8235, 1.468, 1.0275, 0.3425, 0.3425, 
    0.3726, 23.0175, 0.2513, 0.2513, 0.2513, 2.7412, 5.7675, 
    2.226, 1.5579, 0.1418, 0.2513, 12.0835, 12.0835, 0.1297, 
    0.625, 0.3726, 0.3726, 3.514, 4.1112), TOTAL_COST = c(6.877449, 
    6.877449, 9.399562, 0.924873, 0.245128, 2.953287, 6.227787, 
    0.703494, 0.52035, 0.083454, 0.083454, 0.134285, 9.399562, 
    0.062686, 0.062686, 0.062686, 1.3493, 2.386274, 0.77503, 
    0.524996, 0.051595, 0.062686, 4.47782, 4.47782, 0.051628, 
    0.164998, 0.125893, 0.125893, 2.183983, 1.469989), POS_COST = c(6.877449, 
    6.877449, 9.399562, 0.924873, 0.245128, 2.953287, 6.227787, 
    0.703494, 0.52035, 0.083454, 0.083454, 0.134285, 9.399562, 
    0.062686, 0.062686, 0.062686, 1.3493, 2.386274, 0.77503, 
    0.524996, 0.051595, 0.062686, 4.47782, 4.47782, 0.051628, 
    0.164998, 0.125893, 0.125893, 2.183983, 1.469989), CE100 = c(6.55, 
    6.55, 9.25, 0.95, 0.23785, 2.81, 5.93127, 1.17, 0.508, 0.074946, 
    0.074946, 0.12, 9.25, 0.058626, 0.058626, 0.058626, 1.31, 
    2.35, 0.61, 0.5, 0.05172, 0.058626, 4.2844, 4.2844, 0.05172, 
    0.12255, 0.12, 0.12, 2.08, 1.43), CE110 = c(NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_), CE120 = c(NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), CE200 = c(0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0), CORP_PRICE = c(NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_), QTY_SOLD = c(15L, 
    10L, 6L, 50L, 250L, 10L, 10L, 20L, 50L, 25L, 25L, 50L, 6L, 
    25L, 25L, 25L, 10L, 6L, 10L, 30L, 100L, 25L, 15L, 10L, 100L, 
    20L, 50L, 50L, 4L, 36L), PACKSLIP_WHSL = c(NA, NA, 30.69, 
    2.63, 0.7315, 8.63, 20.55, 1.96, 1.37, 0.685, 0.685, 0.5323, 
    30.69, 0.5025, 0.5025, 0.5025, 3.56, 7.69, 3.18, 1.82, 0.2145, 
    0.5025, 18.59, 18.59, 0.1961, 1.25, 0.5323, 0.5323, 5.02, 
    5.71), PRICING_GROUP = c("PGB", "PGB", "PGB", "PGB", "PGB", 
    "PGB", "PGG", "PGB", "PGB", "PGB", "PGB", "PGB", "PGB", "PGB", 
    "PGB", "PGB", "PGB", "PGB", "PGB", "PGB", "PGB", "PGB", "PGB", 
    "PGB", "PGB", "PGB", "PGB", "PGB", "PGB", "PGB"), PGG_MIN_PRICE = c(257.9043375, 
    171.936225, 184.14, 131.5, 175.56, 86.3, 154.125, 39.2, 68.5, 
    15.07, 15.07, 26.615, 184.14, 11.055, 11.055, 11.055, 35.6, 
    46.14, 31.8, 54.6, 21.45, 11.055, 253.7535, 185.9, 19.61, 
    22, 26.615, 26.615, 20.08, 205.56), PGY_MIN_PRICE = c(206.32347, 
    137.54898, 171.2502, 128.87, 170.07375, 80.259, 147.96, 36.456, 
    63.705, 13.3575, 13.3575, 24.75195, 171.2502, 9.79875, 9.79875, 
    9.79875, 33.108, 45.2172, 29.574, 50.778, 19.9485, 9.79875, 
    245.388, 163.592, 18.2373, 19.5, 24.75195, 24.75195, 18.6744, 
    191.1708), PGR_MIN_PRICE = c(171.936225, 114.62415, 162.0432, 
    124.925, 166.41625, 75.944, 141.795, 35.672, 60.28, 9.9325, 
    9.9325, 24.21965, 162.0432, 7.28625, 7.28625, 7.28625, 32.396, 
    43.833, 28.938, 48.048, 19.5195, 7.28625, 239.811, 159.874, 
    17.8451, 14.5, 24.21965, 24.21965, 18.2728, 180.8928), `Ext Sale` = c(157.95, 
    105.3, 138.105, 101.255, 118.875, 64.725, 158.235, 29.36, 
    51.375, 8.5625, 8.5625, 18.63, 138.105, 6.2825, 6.2825, 6.2825, 
    27.412, 34.605, 22.26, 46.737, 14.18, 6.2825, 181.2525, 120.835, 
    12.97, 12.5, 18.63, 18.63, 14.056, 148.0032), `Ext Total Cost` = c(103.161735, 
    68.77449, 56.397372, 46.24365, 61.282, 29.53287, 62.27787, 
    14.06988, 26.0175, 2.08635, 2.08635, 6.71425, 56.397372, 
    1.56715, 1.56715, 1.56715, 13.493, 14.317644, 7.7503, 15.74988, 
    5.1595, 1.56715, 67.1673, 44.7782, 5.1628, 3.29996, 6.29465, 
    6.29465, 8.735932, 52.919604), cust_part = c("AK001000111239190", 
    "AK001000111239190", "AK001000155692", "AK0010001400050-131280", 
    "AK00100010700535", "AK00100010800758", "AK001000169970", 
    "AK00100010445050", "AK00100010819092", "AK00100010115160", 
    "AK00100010115160", "AK00100010703316", "AK001000155692", 
    "AK00100010115157", "AK00100010115157", "AK00100010115157", 
    "AK0010001440704-131280", "AK001000155618", "AK00100010703318", 
    "AK00100010205428", "AK00100010702119", "AK00100010115157", 
    "AK0010001465278", "AK0010001465278", "AK00100010702121", 
    "AK001000115166", "AK00100010703315", "AK00100010703315", 
    "AK00100010715718", "AK00100010243945")), .Names = c("INVOICE_DT", 
"BRANCH_CODE", "INVOICE_NO", "INV_SEQ_NO", "INV_ITEM_ID", "ITEM_DESCR", 
"STD_ITEM", "PRIVATE_LABEL", "CATEGORY_PATH1", "CATEGORY_PATH2", 
"CUST_ID", "CUSTOMER_TIER", "IS_VENDING", "SALE_PRICE", "TOTAL_COST", 
"POS_COST", "CE100", "CE110", "CE120", "CE200", "CORP_PRICE", 
"QTY_SOLD", "PACKSLIP_WHSL", "PRICING_GROUP", "PGG_MIN_PRICE", 
"PGY_MIN_PRICE", "PGR_MIN_PRICE", "Ext Sale", "Ext Total Cost", 
"cust_part"), .internal.selfref = <pointer: (nil)>, row.names = c(NA, 
30L), class = c("data.table", "data.frame"))
J fast
  • 53
  • 8
  • 2
    It would help if you could provide a minimal reproducible example https://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example. Any subset with >1 group should work. Maybe something like `dput(df[, head(.SD, 10), INV_ITEM_ID])` – IceCreamToucan Jun 29 '18 at 15:42
  • 1
    you should also share the sample data, how can we just solve by looking at code? – user5249203 Jun 29 '18 at 15:42
  • I have edited the question to include this. – J fast Jun 29 '18 at 15:54
  • Please use the output of `dput(datatable)`instead of `str()`. – SeGa Jun 29 '18 at 16:22
  • SeGa, I used str() to show the data types. Below that i used subset() for the data. I have millions of rows so dput() was not the best option. – J fast Jun 29 '18 at 16:27
  • @Jfast: you don't need to share all the data. Using `dput(head(datatable, 30))` should do it – Tung Jun 29 '18 at 18:23
  • Tung 11 i have replaced it with that. – J fast Jun 29 '18 at 18:35

0 Answers0