0

My current data is in the below format

ID  ID_2    Item ID Final Amount
001 111 1111    34623
001 111 1112    42567
001 112 1113    1254
001 112 1114    45237
001 112 1115    42913
001 112 1116    28117
001 113 1117    10312
008 222 1118    27367
008 222 1119    24714
008 223 1120    30949
011 333 1121    49529
012 444 1122    29762
012 444 1123    2743
012 444 1124    21357
012 444 1125    16256
012 444 1126    18376
017 555 1127    7877
017 555 1128    10684
017 555 1129    25281

but i want it as sort of a matrix

ID  ID_2    Item_1  Item_2  Item_3  Item_4  Item_5
1   111 34623   42567           
1   112 1254    45237   42913   28117   
1   113 10312               
8   222 27367   24714           
8   223 30949               
11  333 49529               
12  444 29762   2743    21357   16256   18376
17  555 7877    10684   25281       

Can anyone help me with this?

SBista
  • 7,479
  • 1
  • 27
  • 58
  • In base R, you can use `reshape` after creating an item count variable like this: `df1$times <- ave(df1$ItemID, df1$ID, df1$ID_2, FUN=seq_along)` and then reshape wide `reshape(df1, direction="wide", idvar=c("ID", "ID_2"), drop="ItemID", v.names="times")`. – lmo Jun 02 '17 at 12:34
  • Error: cannot allocate vector of size 2.9 Gb – dinesh khemani Jun 06 '17 at 08:10

1 Answers1

1

We can do this by creating a sequence column by 'ID', 'ID_2', which is easier with dcast from data.table

library(data.table)
dcast(setDT(df1), ID + ID_2 ~paste0("Item", rowid(ID, ID_2)), value.var = "FinalAmount")
#    ID ID_2 Item1 Item2 Item3 Item4 Item5
#1:  1  111 34623 42567    NA    NA    NA
#2:  1  112  1254 45237 42913 28117    NA
#3:  1  113 10312    NA    NA    NA    NA
#4:  8  222 27367 24714    NA    NA    NA
#5:  8  223 30949    NA    NA    NA    NA
#6: 11  333 49529    NA    NA    NA    NA
#7: 12  444 29762  2743 21357 16256 18376
#8: 17  555  7877 10684 25281    NA    NA

Or this can be done with tidyverse

library(tidyverse)
df1 %>% 
   group_by(ID, ID_2) %>% 
   mutate(Seq = paste0("Item", row_number())) %>%
   select(-ItemID) %>%
   spread(Seq, FinalAmount)
# A tibble: 8 x 7
# Groups: ID, ID_2 [8]
#     ID  ID_2 Item1 Item2 Item3 Item4 Item5
#* <int> <int> <int> <int> <int> <int> <int>
#1     1   111 34623 42567    NA    NA    NA
#2     1   112  1254 45237 42913 28117    NA
#3     1   113 10312    NA    NA    NA    NA
#4     8   222 27367 24714    NA    NA    NA
#5     8   223 30949    NA    NA    NA    NA
#6    11   333 49529    NA    NA    NA    NA
#7    12   444 29762  2743 21357 16256 18376
#8    17   555  7877 10684 25281    NA    NA

data

df1 <- structure(list(ID = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 8L, 8L, 8L, 
11L, 12L, 12L, 12L, 12L, 12L, 17L, 17L, 17L), ID_2 = c(111L, 
111L, 112L, 112L, 112L, 112L, 113L, 222L, 222L, 223L, 333L, 444L, 
444L, 444L, 444L, 444L, 555L, 555L, 555L), ItemID = 1111:1129, 
FinalAmount = c(34623L, 42567L, 1254L, 45237L, 42913L, 28117L, 
10312L, 27367L, 24714L, 30949L, 49529L, 29762L, 2743L, 21357L, 
16256L, 18376L, 7877L, 10684L, 25281L)), .Names = c("ID", 
"ID_2", "ItemID", "FinalAmount"), class = "data.frame", row.names = c(NA, 
 -19L))
akrun
  • 874,273
  • 37
  • 540
  • 662
  • no need to include `ID` in `rowid` imo; using `dcast(setDT(df1), ID + ID_2 ~ rowid(ID_2, prefix = 'Item'), value.var = "FinalAmount")` works just as well – Jaap Jun 02 '17 at 07:38
  • @Jaap It could work, but it may be required in the full data – akrun Jun 02 '17 at 07:40
  • My data has over 9L rows, so it gives an error Error: cannot allocate vector of size 58.4 Gb. – dinesh khemani Jun 06 '17 at 07:35
  • @dineshkhemani It is because your system doesn't have memory for performing this. You may have to do it on a server if memory is an issue – akrun Jun 06 '17 at 07:38
  • i dont have an option to do it on a server. Can you suggest a solution which can be performed on system memory? Thanks – dinesh khemani Jun 06 '17 at 07:45
  • @dineshkhemani It is difficult as R works on memory. Other options include RevoScaleR, which is again a commercial product – akrun Jun 06 '17 at 07:52