0

I am wanting to summarize my data ultimately by quarter. which i am able to do with the following code but since INV_ITEM_ID in the code below is not numeric type it does not allow me to add it to the list.

temptable[ ,lapply(.SD, sum), by = quarter(INVOICE_DT), 
    .SDcols = c("INV_ITEM_ID","Ext Sale", "Ext Total Cost", "CE100", "CE110","CE120","QTY_SOLD","PACKSLIP_WHSL")]

When running the script i get the following error

Error in gsum(INV_ITEM_ID) : 
  Type 'character' not supported by GForce sum (gsum). Either add the prefix base::sum(.) or turn off GForce optimization using options(datatable.optimize=1)

Below I have provided a subset of my data (first 30 rows).

library(data.table)
temptable = setDT(structure(list(INVOICE_DT = c("2016-01-29", "2016-01-29", "2016-01-29", 
"2016-01-29", "2016-01-29", "2016-01-29", "2016-01-29", "2016-01-29", 
"2016-01-29", "2016-01-29", "2016-01-29", "2016-01-29", "2016-01-29", 
"2016-01-29", "2016-01-29", "2016-01-29", "2016-01-29", "2016-01-29", 
"2016-01-29", "2016-01-29", "2016-01-29", "2016-01-29", "2016-01-29", 
"2016-01-29", "2016-01-29", "2016-01-29", "2016-01-29", "2016-01-29", 
"2016-01-29", "2016-01-29"), BRANCH_CODE = c("AK001", "AK001", 
"AK001", "AK001", "AK001", "AK001", "AK001", "AK001", "AK001", 
"AK001", "AK001", "AK001", "AK001", "AK001", "AK001", "AK001", 
"AK001", "AK001", "AK001", "AK001", "AK001", "AK001", "AK001", 
"AK001", "AK001", "AK001", "AK001", "AK001", "AK001", "AK001"
), INVOICE_NO = c("AK0015522", "AK0015529", "AK0015515", "AK0015529", 
"AK0015514", "AK0015513", "AK0015519", "AK0015519", "AK0015520", 
"AK0015520", "AK0015520", "AK0015522", "AK0015520", "AK0015520", 
"AK0015520", "AK0015518", "AK0015525", "AK0015520", "AK0015519", 
"AK0015520", "AK0015525", "AK0015518", "AK0015529", "AK0015519", 
"AK0015525", "AK0015519", "AK0015519", "AK0015522", "AK0015522", 
"AK0015529"), INV_SEQ_NO = c(6L, 6L, 3L, 3L, 3L, 3L, 148L, 149L, 
91L, 45L, 162L, 34L, 125L, 39L, 159L, 56L, 64L, 127L, 160L, 93L, 
45L, 34L, 5L, 138L, 46L, 38L, 161L, 35L, 28L, 4L), INV_ITEM_ID = c("11239190", 
"11239190", "55692", "400050-131280", "0700535", "0800758", "69970", 
"0445050", "0819092", "0115160", "0115160", "0703316", "55692", 
"0115157", "0115157", "0115157", "440704-131280", "55618", "0703318", 
"0205428", "0702119", "0115157", "465278", "465278", "0702121", 
"15166", "0703315", "0703315", "0715718", "0243945"), ITEM_DESCR = c("Au-Ve-Co# 20898", 
"Au-Ve-Co# 20898", "MM HSS T TAP 16X1.5", "3/16\"\"\"\"H05703-100R Hs", 
"18/2 BLK SOOW", "3x1x1/4 180G FW", "BF SPEC SER HV1-6C", "1/4MP External Seat", 
"1-1/2xNHConditonDisc", "HCS 7/16-14x1 3/4 YZ8", "HCS 7/16-14x1 3/4 YZ8", 
"25AMP MINI AUTO FUSE", "MM HSS T TAP 16X1.5", "HCS 7/16-14x1.25 YZ8", 
"HCS 7/16-14x1.25 YZ8", "HCS 7/16-14x1.25 YZ8", "BPOLY MELBW 1/4TX1/8", 
"MM HSS P TAP 5 X 0.8", "MAXI-BLADE 20 AMP", "1-1/2\"\"\"\" T3 VF SURF", 
"16ga Red Wire/PPF", "HCS 7/16-14x1.25 YZ8", "80I 45 ELBOW 1/4 BLK", 
"80I 45 ELBOW 1/4 BLK", "16ga Blk Wire/PPF", "HCS 7/16-14x3.25 YZ8", 
"20AMP MINI AUTO FUSE", "20AMP MINI AUTO FUSE", "3/0-3/8\"\"\"\"HvyWalCprLug", 
"97053 Pink Marker"), STD_ITEM = c("Y", "Y", "Y", "Y", "Y", "Y", 
"Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", 
"Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y"), PRIVATE_LABEL = c("N", 
"N", "Y", "N", "N", "N", "N", "N", "N", "N", "N", "N", "Y", "N", 
"N", "N", "N", "Y", "N", "N", "N", "N", "Y", "Y", "N", "N", "N", 
"N", "N", "N"), CATEGORY_PATH1 = c("08.5-Fleet & Automotive", 
"08.5-Fleet & Automotive", "07-Cutting Tools & Metalworking", 
"08-Hydraulics & Pneumatics", "09-Electrical", "05-Abrasives", 
"08-Hydraulics & Pneumatics", "07.3-Plumbing", "05-Abrasives", 
"01-Fasteners", "01-Fasteners", "08.5-Fleet & Automotive", "07-Cutting Tools & Metalworking", 
"01-Fasteners", "01-Fasteners", "01-Fasteners", "08-Hydraulics & Pneumatics", 
"07-Cutting Tools & Metalworking", "08.5-Fleet & Automotive", 
"05-Abrasives", "08.5-Fleet & Automotive", "01-Fasteners", "07.3-Plumbing", 
"07.3-Plumbing", "08.5-Fleet & Automotive", "01-Fasteners", "08.5-Fleet & Automotive", 
"08.5-Fleet & Automotive", "08.5-Fleet & Automotive", "11-Chemicals & Paints"
), CATEGORY_PATH2 = c("03-Batteries & Accessories", "03-Batteries & Accessories", 
"04-Threading and Tapping", "02-Hose and Hose Reels", "05-Electrical Wire and Accessories", 
"03-Coated and Non-Woven Abrasives", "07-Valves & Accessories", 
"04-Valves & Valve Accessories", "03-Coated and Non-Woven Abrasives", 
"01-Bolts", "01-Bolts", "02-Fleet Electrical Products", "04-Threading and Tapping", 
"01-Bolts", "01-Bolts", "01-Bolts", "01-Fittings", "04-Threading and Tapping", 
"02-Fleet Electrical Products", "03-Coated and Non-Woven Abrasives", 
"02-Fleet Electrical Products", "01-Bolts", "01-Pipe Fittings", 
"01-Pipe Fittings", "02-Fleet Electrical Products", "01-Bolts", 
"02-Fleet Electrical Products", "02-Fleet Electrical Products", 
"02-Fleet Electrical Products", "11-Paints, Marking, and Accessories"
), CUST_ID = c("AK0010001", "AK0010001", "AK0010001", "AK0010001", 
"AK0010001", "AK0010001", "AK0010001", "AK0010001", "AK0010001", 
"AK0010001", "AK0010001", "AK0010001", "AK0010001", "AK0010001", 
"AK0010001", "AK0010001", "AK0010001", "AK0010001", "AK0010001", 
"AK0010001", "AK0010001", "AK0010001", "AK0010001", "AK0010001", 
"AK0010001", "AK0010001", "AK0010001", "AK0010001", "AK0010001", 
"AK0010001"), CUSTOMER_TIER = c("Tier 3", "Tier 3", "Tier 3", 
"Tier 3", "Tier 3", "Tier 3", "Tier 3", "Tier 3", "Tier 3", "Tier 3", 
"Tier 3", "Tier 3", "Tier 3", "Tier 3", "Tier 3", "Tier 3", "Tier 3", 
"Tier 3", "Tier 3", "Tier 3", "Tier 3", "Tier 3", "Tier 3", "Tier 3", 
"Tier 3", "Tier 3", "Tier 3", "Tier 3", "Tier 3", "Tier 3"), 
    IS_VENDING = c("No", "No", "No", "No", "No", "No", "No", 
    "No", "No", "No", "No", "No", "No", "No", "No", "No", "No", 
    "No", "No", "No", "No", "No", "No", "No", "No", "No", "No", 
    "No", "No", "No"), SALE_PRICE = c(10.53, 10.53, 23.0175, 
    2.0251, 0.4755, 6.4725, 15.8235, 1.468, 1.0275, 0.3425, 0.3425, 
    0.3726, 23.0175, 0.2513, 0.2513, 0.2513, 2.7412, 5.7675, 
    2.226, 1.5579, 0.1418, 0.2513, 12.0835, 12.0835, 0.1297, 
    0.625, 0.3726, 0.3726, 3.514, 4.1112), TOTAL_COST = c(6.877449, 
    6.877449, 9.399562, 0.924873, 0.245128, 2.953287, 6.227787, 
    0.703494, 0.52035, 0.083454, 0.083454, 0.134285, 9.399562, 
    0.062686, 0.062686, 0.062686, 1.3493, 2.386274, 0.77503, 
    0.524996, 0.051595, 0.062686, 4.47782, 4.47782, 0.051628, 
    0.164998, 0.125893, 0.125893, 2.183983, 1.469989), POS_COST = c(6.877449, 
    6.877449, 9.399562, 0.924873, 0.245128, 2.953287, 6.227787, 
    0.703494, 0.52035, 0.083454, 0.083454, 0.134285, 9.399562, 
    0.062686, 0.062686, 0.062686, 1.3493, 2.386274, 0.77503, 
    0.524996, 0.051595, 0.062686, 4.47782, 4.47782, 0.051628, 
    0.164998, 0.125893, 0.125893, 2.183983, 1.469989), CE100 = c(6.55, 
    6.55, 9.25, 0.95, 0.23785, 2.81, 5.93127, 1.17, 0.508, 0.074946, 
    0.074946, 0.12, 9.25, 0.058626, 0.058626, 0.058626, 1.31, 
    2.35, 0.61, 0.5, 0.05172, 0.058626, 4.2844, 4.2844, 0.05172, 
    0.12255, 0.12, 0.12, 2.08, 1.43), CE110 = c(NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_), CE120 = c(NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), CE200 = c(0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0), CORP_PRICE = c(NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_), QTY_SOLD = c(15L, 
    10L, 6L, 50L, 250L, 10L, 10L, 20L, 50L, 25L, 25L, 50L, 6L, 
    25L, 25L, 25L, 10L, 6L, 10L, 30L, 100L, 25L, 15L, 10L, 100L, 
    20L, 50L, 50L, 4L, 36L), PACKSLIP_WHSL = c(NA, NA, 30.69, 
    2.63, 0.7315, 8.63, 20.55, 1.96, 1.37, 0.685, 0.685, 0.5323, 
    30.69, 0.5025, 0.5025, 0.5025, 3.56, 7.69, 3.18, 1.82, 0.2145, 
    0.5025, 18.59, 18.59, 0.1961, 1.25, 0.5323, 0.5323, 5.02, 
    5.71), PRICING_GROUP = c("PGB", "PGB", "PGB", "PGB", "PGB", 
    "PGB", "PGG", "PGB", "PGB", "PGB", "PGB", "PGB", "PGB", "PGB", 
    "PGB", "PGB", "PGB", "PGB", "PGB", "PGB", "PGB", "PGB", "PGB", 
    "PGB", "PGB", "PGB", "PGB", "PGB", "PGB", "PGB"), PGG_MIN_PRICE = c(257.9043375, 
    171.936225, 184.14, 131.5, 175.56, 86.3, 154.125, 39.2, 68.5, 
    15.07, 15.07, 26.615, 184.14, 11.055, 11.055, 11.055, 35.6, 
    46.14, 31.8, 54.6, 21.45, 11.055, 253.7535, 185.9, 19.61, 
    22, 26.615, 26.615, 20.08, 205.56), PGY_MIN_PRICE = c(206.32347, 
    137.54898, 171.2502, 128.87, 170.07375, 80.259, 147.96, 36.456, 
    63.705, 13.3575, 13.3575, 24.75195, 171.2502, 9.79875, 9.79875, 
    9.79875, 33.108, 45.2172, 29.574, 50.778, 19.9485, 9.79875, 
    245.388, 163.592, 18.2373, 19.5, 24.75195, 24.75195, 18.6744, 
    191.1708), PGR_MIN_PRICE = c(171.936225, 114.62415, 162.0432, 
    124.925, 166.41625, 75.944, 141.795, 35.672, 60.28, 9.9325, 
    9.9325, 24.21965, 162.0432, 7.28625, 7.28625, 7.28625, 32.396, 
    43.833, 28.938, 48.048, 19.5195, 7.28625, 239.811, 159.874, 
    17.8451, 14.5, 24.21965, 24.21965, 18.2728, 180.8928), `Ext Sale` = c(157.95, 
    105.3, 138.105, 101.255, 118.875, 64.725, 158.235, 29.36, 
    51.375, 8.5625, 8.5625, 18.63, 138.105, 6.2825, 6.2825, 6.2825, 
    27.412, 34.605, 22.26, 46.737, 14.18, 6.2825, 181.2525, 120.835, 
    12.97, 12.5, 18.63, 18.63, 14.056, 148.0032), `Ext Total Cost` = c(103.161735, 
    68.77449, 56.397372, 46.24365, 61.282, 29.53287, 62.27787, 
    14.06988, 26.0175, 2.08635, 2.08635, 6.71425, 56.397372, 
    1.56715, 1.56715, 1.56715, 13.493, 14.317644, 7.7503, 15.74988, 
    5.1595, 1.56715, 67.1673, 44.7782, 5.1628, 3.29996, 6.29465, 
    6.29465, 8.735932, 52.919604), cust_part = c("AK001000111239190", 
    "AK001000111239190", "AK001000155692", "AK0010001400050-131280", 
    "AK00100010700535", "AK00100010800758", "AK001000169970", 
    "AK00100010445050", "AK00100010819092", "AK00100010115160", 
    "AK00100010115160", "AK00100010703316", "AK001000155692", 
    "AK00100010115157", "AK00100010115157", "AK00100010115157", 
    "AK0010001440704-131280", "AK001000155618", "AK00100010703318", 
    "AK00100010205428", "AK00100010702119", "AK00100010115157", 
    "AK0010001465278", "AK0010001465278", "AK00100010702121", 
    "AK001000115166", "AK00100010703315", "AK00100010703315", 
    "AK00100010715718", "AK00100010243945")), .Names = c("INVOICE_DT", 
"BRANCH_CODE", "INVOICE_NO", "INV_SEQ_NO", "INV_ITEM_ID", "ITEM_DESCR", 
"STD_ITEM", "PRIVATE_LABEL", "CATEGORY_PATH1", "CATEGORY_PATH2", 
"CUST_ID", "CUSTOMER_TIER", "IS_VENDING", "SALE_PRICE", "TOTAL_COST", 
"POS_COST", "CE100", "CE110", "CE120", "CE200", "CORP_PRICE", 
"QTY_SOLD", "PACKSLIP_WHSL", "PRICING_GROUP", "PGG_MIN_PRICE", 
"PGY_MIN_PRICE", "PGR_MIN_PRICE", "Ext Sale", "Ext Total Cost", 
"cust_part"), row.names = c(NA, 
30L), class = c("data.table", "data.frame")))

I have also tried running the script by INV_ITEM_ID as well but the script does not give back only distinct values.

I have been unable to summarize by INV_ITEM_ID.

Frank
  • 66,179
  • 8
  • 96
  • 180
J fast
  • 53
  • 8
  • 1
    Next time, please write a minimal example -- we probably don't need to see your real script or data and could get a better sense of the problem if you made a new reproducible example. Anyway, the error says it cannot sum INV_ITEM_ID, which makes sense. To summarise by multiple vars, you can do `by=.(var1,var2)` like `temptable[ , lapply(.SD, sum), by = .(INV_ITEM_ID, quarter(INVOICE_DT)), .SDcols = c("Ext Sale", "Ext Total Cost", "CE100", "CE110","CE120","QTY_SOLD","PACKSLIP_WHSL")]` not sure if that answers it. – Frank Jun 29 '18 at 18:58
  • Frank, that actually solves the first of my issues. But the biggest problem I am having is that when it returns the INV_ITEM_ID values it is not actually summarized. It gives me 5,135,153 rows with 933,049 distinct values. What I am looking to do is have it summarized down to the 933,049 rows, and then have that summarized by quarter – J fast Jun 29 '18 at 19:33
  • Ok, if you want each distinct value on a row and quarters + vars on columns (?), you can take the result of the last command and pass it through `dcast(res, INV_ITEM_ID ~ quarter, value.var=c("Ext Sale", "Ext Total Cost", "CE100", "CE110","CE120","QTY_SOLD","PACKSLIP_WHSL"))` though I guess that might be quite unreadable. A shorter version: `dcast(temptable, INV_ITEM_ID ~ quarter(INVOICE_DT), fun = sum, value.var = val_cols)` – Frank Jun 29 '18 at 19:49
  • Frank, I guess I am not sure I fully understand that. I am wondering why since I would by summarizing by INV_ITEM_ID that it is not combining sales info into that grouping of each one. I don't know why it shows multiple rows instead of summing them together – J fast Jun 29 '18 at 19:54
  • Ok, you might need to clarify what output you expect, corresponding to the 30-row example provided. Some guidance: https://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example/28481250#28481250 – Frank Jun 29 '18 at 19:57
  • For an example say I would have 3 rows of the same INV_ITEM_ID # Let's say 1234 for example and I have 3 different Ext Sale Values of 2, 4, 6. Why is my output not combining those number and giving me 1234 = 12 instead of displaying it 3 different times. – J fast Jun 29 '18 at 20:01
  • I'd suggest working from that example -- `DT = data.table(ID = c(2, 2, 2), value = c(2, 4, 6)); DT[, lapply(.SD, sum), by=ID]` -- which behaves as expected, up to your true use case (by making more complicated examples until you see the problem). Along the way, you might figure it out yourself, and if not, it would make a good question. (Btw, you can edit your question if you want to change it at some point.) – Frank Jun 29 '18 at 20:13
  • Hey Frank I really appreciate the help. Do you think maybe I would maybe need to take the distinct values from that column and create a new one and use that? – J fast Jun 29 '18 at 20:26
  • Frank after working with this a bit i am thinking that i cannot get them to sum correctly because its still pulling from the original data source and then summarizing and then just pulling those columns out. I am wondering if you could help me manipulate my code with using the above so that I can use a new table – J fast Jul 12 '18 at 17:58
  • Sorry, I don't really understand the nature of the problem. I would need to see a concrete example, though maybe others will be able to figure it out from what you've said already. – Frank Jul 12 '18 at 19:20
  • Frank using your above example "DT = data.table(ID = c(2, 2, 2), value = c(2, 4, 6)); DT[, lapply(.SD, sum), by=ID]" how would i do this from a table with multiple variables and only wanted to do them by specifics – J fast Jul 13 '18 at 16:57

1 Answers1

0

I would suggest that you first create another column in your data to represent the quarter of INVOICE_DT:

temptable[,INVOICE_Q := quarter(INVOICE_DT)]

Next run the summation operation by INVOICE_Q and INV_ITEM_ID:

temptable[,lapply(X = .SD,FUN = sum), by = c("INV_ITEM_ID","INVOICE_Q"), 
           .SDcols = c("Ext Sale", "Ext Total Cost", "CE100", "CE110","CE120","QTY_SOLD","PACKSLIP_WHSL")]

You can then remove the created INVOICE_Q column if required:

temptable[,INVOICE_Q := NULL]

Hope that helps!

Rage
  • 323
  • 1
  • 13
  • Error in rbindlist(list(mastertable, temptable), use.names = TRUE) : Item 2 has 36 columns, inconsistent with item 1 which has 30 columns. If instead you need to fill missing columns, use set argument 'fill' to TRUE. – J fast Jul 10 '18 at 16:48
  • OK let me recheck it. – Rage Jul 11 '18 at 06:03
  • I ran the same code for your data and had no errors. Are you using the same dataset you posted in your question? I think you are trying to bind another dataset to temptable which is causing this issue. I don't see any mastertable mentioned on your post. – Rage Jul 11 '18 at 06:12
  • Yes the temptable is a created table from another that the data pulls from – J fast Jul 11 '18 at 12:45
  • Is there a way to work with that? I can show you my full code if that helps – J fast Jul 11 '18 at 12:45
  • Rage, i have just been manipulating code that was already created from someone else. I am thinking the mastertable was so that we could pull by date and then the temptable was for taking only columns wanted. That may or may not make sense. I am wondering if it is not summarizing correctly for me because of how I have the ColClasses set up – J fast Jul 11 '18 at 15:18
  • if (read_this_file) { print(Sys.time()-starttime) print(paste0("Reading in ", in_filenames[i])) temptable <- fread(paste0(sub_directory, in_filenames[i]), colClasses = c(CUSTOMER_TIER = "character"), na.strings = "") – J fast Jul 11 '18 at 15:18
  • Well in that case you can pull the data from your master table separately and create the temptable. Just make sure that the data structure of temptable is the same as the example in your post and the colnames should be exactly the same. In case that doesn't help, just share the structure of the mastertable as well. Will give me a better idea of what you're trying to do. – Rage Jul 13 '18 at 05:24