3

I have a dataset containing purchases made by different households across different retailers. For eg

Example Dataset

Using dput()

structure(list(household_code = c(76, 76, 76, 76, 76, 76, 76, 
76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 126, 
126, 126, 126, 126, 126, 126, 126, 126), trip_code_uc = c(1032497498L, 
1025776063L, 1029419047L, 1030418100L, 1029502602L, 1034153056L, 
1027035051L, 1027533991L, 1033515804L, 1032998207L, 1032066227L, 
1028192785L, 1033419039L, 1028730296L, 1027388499L, 1030652869L, 
1025638394L, 1034032718L, 1034032718L, 1025678520L, 1029490031L, 
1029898838L, 1028024134L, 1030324171L, 1031983761L, 1031983761L, 
1033767148L, 1023953965L, 1030954113L, 1030954113L, 1027392968L
), purchase_date = structure(c(1L, 2L, 23L, 50L, 52L, 74L, 77L, 
94L, 148L, 158L, 176L, 179L, 196L, 211L, 224L, 246L, 271L, 286L, 
286L, 309L, 329L, 346L, 2L, 9L, 46L, 46L, 50L, 58L, 66L, 66L, 
68L), .Label = c("2012-01-01", "2012-01-02", "2012-01-03", "2012-01-04", 
"2012-01-05", "2012-01-06", "2012-01-07", "2012-01-08", "2012-01-09", 
"2012-01-10", "2012-01-11", "2012-01-12", "2012-01-13", "2012-01-14", 
"2012-01-15", "2012-01-16", "2012-01-17", "2012-01-18", "2012-01-19", 
"2012-01-20", "2012-01-21", "2012-01-22", "2012-01-23", "2012-01-24", 
"2012-01-25", "2012-01-26", "2012-01-27", "2012-01-28", "2012-01-29", 
"2012-01-30", "2012-01-31", "2012-02-01", "2012-02-02", "2012-02-03", 
"2012-02-04", "2012-02-05", "2012-02-06", "2012-02-07", "2012-02-08", 
"2012-02-09", "2012-02-10", "2012-02-11", "2012-02-12", "2012-02-13", 
"2012-02-14", "2012-02-15", "2012-02-16", "2012-02-17", "2012-02-18", 
"2012-02-19", "2012-02-20", "2012-02-21", "2012-02-22", "2012-02-23", 
"2012-02-24", "2012-02-25", "2012-02-26", "2012-02-27", "2012-02-28", 
"2012-02-29", "2012-03-01", "2012-03-02", "2012-03-03", "2012-03-04", 
"2012-03-05", "2012-03-06", "2012-03-07", "2012-03-08", "2012-03-09", 
"2012-03-10", "2012-03-11", "2012-03-12", "2012-03-13", "2012-03-14", 
"2012-03-15", "2012-03-16", "2012-03-17", "2012-03-18", "2012-03-19", 
"2012-03-20", "2012-03-21", "2012-03-22", "2012-03-23", "2012-03-24", 
"2012-03-25", "2012-03-26", "2012-03-27", "2012-03-28", "2012-03-29", 
"2012-03-30", "2012-03-31", "2012-04-01", "2012-04-02", "2012-04-03", 
"2012-04-04", "2012-04-05", "2012-04-06", "2012-04-07", "2012-04-08", 
"2012-04-09", "2012-04-10", "2012-04-11", "2012-04-12", "2012-04-13", 
"2012-04-14", "2012-04-15", "2012-04-16", "2012-04-17", "2012-04-18", 
"2012-04-19", "2012-04-20", "2012-04-21", "2012-04-22", "2012-04-23", 
"2012-04-24", "2012-04-25", "2012-04-26", "2012-04-27", "2012-04-28", 
"2012-04-29", "2012-04-30", "2012-05-01", "2012-05-02", "2012-05-03", 
"2012-05-04", "2012-05-05", "2012-05-06", "2012-05-07", "2012-05-08", 
"2012-05-09", "2012-05-10", "2012-05-11", "2012-05-12", "2012-05-13", 
"2012-05-14", "2012-05-15", "2012-05-16", "2012-05-17", "2012-05-18", 
"2012-05-19", "2012-05-20", "2012-05-21", "2012-05-22", "2012-05-23", 
"2012-05-24", "2012-05-25", "2012-05-26", "2012-05-27", "2012-05-28", 
"2012-05-29", "2012-05-30", "2012-05-31", "2012-06-01", "2012-06-02", 
"2012-06-03", "2012-06-04", "2012-06-05", "2012-06-06", "2012-06-07", 
"2012-06-08", "2012-06-09", "2012-06-10", "2012-06-11", "2012-06-12", 
"2012-06-13", "2012-06-14", "2012-06-15", "2012-06-16", "2012-06-17", 
"2012-06-18", "2012-06-19", "2012-06-20", "2012-06-21", "2012-06-22", 
"2012-06-23", "2012-06-24", "2012-06-25", "2012-06-26", "2012-06-27", 
"2012-06-28", "2012-06-29", "2012-06-30", "2012-07-01", "2012-07-02", 
"2012-07-03", "2012-07-04", "2012-07-05", "2012-07-06", "2012-07-07", 
"2012-07-08", "2012-07-09", "2012-07-10", "2012-07-11", "2012-07-12", 
"2012-07-13", "2012-07-14", "2012-07-15", "2012-07-16", "2012-07-17", 
"2012-07-18", "2012-07-19", "2012-07-20", "2012-07-21", "2012-07-22", 
"2012-07-23", "2012-07-24", "2012-07-25", "2012-07-26", "2012-07-27", 
"2012-07-28", "2012-07-29", "2012-07-30", "2012-07-31", "2012-08-01", 
"2012-08-02", "2012-08-03", "2012-08-04", "2012-08-05", "2012-08-06", 
"2012-08-07", "2012-08-08", "2012-08-09", "2012-08-10", "2012-08-11", 
"2012-08-12", "2012-08-13", "2012-08-14", "2012-08-15", "2012-08-16", 
"2012-08-17", "2012-08-18", "2012-08-19", "2012-08-20", "2012-08-21", 
"2012-08-22", "2012-08-23", "2012-08-24", "2012-08-25", "2012-08-26", 
"2012-08-27", "2012-08-28", "2012-08-29", "2012-08-30", "2012-08-31", 
"2012-09-01", "2012-09-02", "2012-09-03", "2012-09-04", "2012-09-05", 
"2012-09-06", "2012-09-07", "2012-09-08", "2012-09-09", "2012-09-10", 
"2012-09-11", "2012-09-12", "2012-09-13", "2012-09-14", "2012-09-15", 
"2012-09-16", "2012-09-17", "2012-09-18", "2012-09-19", "2012-09-20", 
"2012-09-21", "2012-09-22", "2012-09-23", "2012-09-24", "2012-09-25", 
"2012-09-26", "2012-09-27", "2012-09-28", "2012-09-29", "2012-09-30", 
"2012-10-01", "2012-10-02", "2012-10-03", "2012-10-04", "2012-10-05", 
"2012-10-06", "2012-10-07", "2012-10-08", "2012-10-09", "2012-10-10", 
"2012-10-11", "2012-10-12", "2012-10-13", "2012-10-14", "2012-10-15", 
"2012-10-16", "2012-10-17", "2012-10-18", "2012-10-19", "2012-10-20", 
"2012-10-21", "2012-10-22", "2012-10-23", "2012-10-24", "2012-10-25", 
"2012-10-26", "2012-10-27", "2012-10-28", "2012-10-29", "2012-10-30", 
"2012-10-31", "2012-11-01", "2012-11-02", "2012-11-03", "2012-11-04", 
"2012-11-05", "2012-11-06", "2012-11-07", "2012-11-08", "2012-11-09", 
"2012-11-10", "2012-11-11", "2012-11-12", "2012-11-13", "2012-11-14", 
"2012-11-15", "2012-11-16", "2012-11-17", "2012-11-18", "2012-11-19", 
"2012-11-20", "2012-11-21", "2012-11-22", "2012-11-23", "2012-11-24", 
"2012-11-25", "2012-11-26", "2012-11-27", "2012-11-28", "2012-11-29", 
"2012-11-30", "2012-12-01", "2012-12-02", "2012-12-03", "2012-12-04", 
"2012-12-05", "2012-12-06", "2012-12-07", "2012-12-08", "2012-12-09", 
"2012-12-10", "2012-12-11", "2012-12-12", "2012-12-13", "2012-12-14", 
"2012-12-15", "2012-12-16", "2012-12-17", "2012-12-18", "2012-12-19", 
"2012-12-20", "2012-12-21", "2012-12-22", "2012-12-23", "2012-12-24", 
"2012-12-25", "2012-12-26", "2012-12-27", "2012-12-28", "2012-12-29"
), class = "factor"), retailer_code = c(11024, 11024, 11024, 
11024, 11024, 11024, 11024, 11024, 11024, 11024, 11024, 11024, 
11024, 11024, 11024, 11024, 2353, 11024, 11024, 2353, 11024, 
11024, 63882, 650, 89960, 89960, 650, 89960, 89960, 89960, 650
), Overall_Brand = structure(c(19L, 74L, 19L, 48L, 19L, 48L, 
19L, 19L, 19L, 48L, 48L, 31L, 46L, 31L, 31L, 48L, 74L, 31L, 74L, 
19L, 31L, 19L, 48L, 48L, 31L, 31L, 48L, 31L, 31L, 48L, 48L), .Label = c("ABUNDANCE", 
"ALPEN", "AMERICAN BREAKFAST ", "ANNIE'S HOMEGROWN", "ARWHD MLS", 
"BARBARA'S", "BEAR NAKED", "BEAR RIVER", "BOB'S RED MILL", "BOKOMO COUNTRY", 
"BREAKFAST CHOICE", "BREAKFAST ZONE", "BROOKFARM MACADAMIA ", 
"BRUGGEN", "BUCKEYE HEROES", "CADIA", "CASCADIAN FARM ", "CHOCOLATE SPOONERS", 
"CTL BR", "DORSET", "ENJOY LIFE PERKY'S CRUNCHY FLX", "EREWHON", 
"F-FACTOR", "FAMILIA", "FIELD DAY", "FINAX", "FLEURY FLAKES MARC ANDRE", 
"FOOD FOR LIFE EZEKIEL 50", "FORRELLI ", "GEFEN KING", "GENERAL MILLS", 
"GERONIMO PEYTON HILLIS REDZONE", "GLUCERNA", "GLUTINO", "GOLDEN FOODS", 
"GRANVITA PUFFY'S", "GREENBRIER INT INC-NBL CRN FLK", "HEALTH VALLEY", 
"HODGSON MILL", "HOME FAVORITE", "HOSTESS TOASTED OATS", "HSP", 
"ISABEL'S WAY ", "JASPER", "JUSTIN VERLANDER'S FASTBLL FLK", 
"KASHI", "KAY'S NATURALS BETTER BALANCE", "Kellogg", "KIND", 
"KOZY SHACK READY GRAINS", "KRETSCHMER", "LADY LIBERTY", "LIEBER'S", 
"LIVING INTENTIONS SPRFD CRL", "LOVE GROWN FOODS", "MAIZORO", 
"MANISCHEWITZ", "MILL SELECT ", "MOTHER'S", "MULTIGRAIN SPNRS", 
"NASH BROTHERS", "NATURE'S PATH", "NESTLE", "NEW ENGLAND NATURALS", 
"NEWMAN'S OWN SWEET ENGH WHT PF", "NUTRISYSTEM NOURISH", "NUTRITIOUS LIVING", 
"PAMPA ", "PASKESZ CHOCO RIOS", "PBLC LB BR-NBL HY NT TSD OT DM", 
"PEACE", "POST", "PROTEIN PLUS CORN FLAKE", "QUAKER", "RALSTON", 
"RLSTN", "SALLY'S ", "SCRUMPTIOUS SPELNDID CRLS CBB", "SEITENBACHER MUESLI", 
"SIMPLY FIBER", "SKINNER'S RAISIN BRAN", "SORIANA WHOLE BRAN", 
"STREIT'S FRUIT & NUT MUESLI", "SUNBELT", "SWAD", "T. ABRAHAM'S", 
"TAANUG CORN FLAKES", "TASTY", "TEMMY'S", "THE", "THREE SISTERS", 
"TIKISS SWEETENED PUFFD WHL WHT", "TRU ROOTS", "VOTTO'S", "WEETABIX", 
"WHOLESOME GOODNESS", "WILD ROOTS ANCIENT ORIGINS", "WONDER CORN FLAKES", 
"YOG ACTIVE"), class = "factor")), row.names = c(90609L, 222436L, 
90606L, 688592L, 90607L, 688593L, 90605L, 90604L, 90608L, 668330L, 
321377L, 567447L, 945385L, 567445L, 567443L, 892854L, 583186L, 
567446L, 583185L, 168138L, 567444L, 60086L, 698120L, 698127L, 
3933L, 809409L, 698129L, 15286L, 15284L, 698116L, 319455L), class = "data.frame")

I have a separate variable for Household, trip ID for a particular purchase instance, retailer, and Brand they purchased. One household may purchase more than 1 brand in 1 trip. I wanted to calculate the repeat purchases in 2 consecutive trips. For example, if a household purchase General Mills and Kellogg both in trip 1 and only General mill in 2nd trip, for general mills, it will be a repeat.

Example output:

Example Output

Now, it is easy when I have only 1 purchase per trip. I do it by the following code using rle()

e1 = transform(e1, brand_last_dum = ave(as.character(Brand), rleid(Household, Brand), FUN = seq_along))

However, when there is more than 1 purchase, it doesn't work. Data is arranged by the purchase date and more than 1 purchase of the same brand in a trip maybe considered as a different purchase for calculating repeat in the next row. Please Help

3 Answers3

3

2020-11-29 UPDATE: Adjusted code to count second+ purchases of same brand in same shopping trip as repeat purchases. Given the additional requirement I was able to eliminate one of the joins which had been previously added to treat all purchases of a given brand within a trip consistently.

library(dplyr)
data %>%
        arrange(.,household_code,purchase_date,trip_code_uc) %>% 
        group_by(household_code,purchase_date) %>%
        distinct(trip_code_uc) %>% 
        group_by(household_code) %>% 
        mutate(trip_seq = seq_along(household_code)) %>% 
        ungroup() %>% 
        left_join(data,.) %>%
        arrange(household_code,Overall_Brand,trip_seq) %>%
        mutate(repeat_purchase = if_else(household_code == lag(household_code) &
                                                 Overall_Brand == lag(Overall_Brand) & 
                                                 (trip_seq == lag(trip_seq) + 1 | trip_seq == lag(trip_seq)),
                                         TRUE,FALSE,missing=FALSE)) %>%
        arrange(.,household_code,purchase_date,Overall_Brand) -> result

print(as.data.frame(result[,c(1,3,5,6,7)]))

...and the output:

> print(as.data.frame(result[,c(1,3,5,6,7)]))
   household_code purchase_date Overall_Brand trip_seq repeat_purchase
1              76    2012-01-01        CTL BR        1           FALSE
2              76    2012-01-02        QUAKER        2           FALSE
3              76    2012-01-23        CTL BR        3           FALSE
4              76    2012-02-19       Kellogg        4           FALSE
5              76    2012-02-21        CTL BR        5           FALSE
6              76    2012-03-14       Kellogg        6           FALSE
7              76    2012-03-17        CTL BR        7           FALSE
8              76    2012-04-03        CTL BR        8            TRUE
9              76    2012-05-27        CTL BR        9            TRUE
10             76    2012-06-06       Kellogg       10           FALSE
11             76    2012-06-24       Kellogg       11            TRUE
12             76    2012-06-27 GENERAL MILLS       12           FALSE
13             76    2012-07-14         KASHI       13           FALSE
14             76    2012-07-29 GENERAL MILLS       14           FALSE
15             76    2012-08-11 GENERAL MILLS       15            TRUE
16             76    2012-09-02       Kellogg       16           FALSE
17             76    2012-09-27        QUAKER       17           FALSE
18             76    2012-10-12 GENERAL MILLS       18           FALSE
19             76    2012-10-12        QUAKER       18            TRUE
20             76    2012-11-04        CTL BR       19           FALSE
21             76    2012-11-24 GENERAL MILLS       20           FALSE
22             76    2012-12-11        CTL BR       21           FALSE
23            126    2012-01-02       Kellogg        1           FALSE
24            126    2012-01-09       Kellogg        2            TRUE
25            126    2012-02-15 GENERAL MILLS        3           FALSE
26            126    2012-02-15 GENERAL MILLS        3            TRUE
27            126    2012-02-19       Kellogg        4           FALSE
28            126    2012-02-27 GENERAL MILLS        5           FALSE
29            126    2012-03-06 GENERAL MILLS        6            TRUE
30            126    2012-03-06       Kellogg        6           FALSE
31            126    2012-03-08       Kellogg        7            TRUE
> 

Prior version included below to maintain relevance of comments

2020-11-28 UPDATE: After receiving the updated data that includes purchase_date, we altered our solution to use this information to generate trip_seq. We assume that if there are multiple trip_code_uc values in a single day, the shopping experiences occur in ascending order of trip_code_uc.

We use dplyr to define distinct trip identifiers for each household and assign a trip sequence. We then use the trip sequence to evaluate whether the same brand was purchased in two consecutive shopping trips after merging it with the original data.

library(dplyr)
data %>%
     arrange(.,household_code,purchase_date,trip_code_uc) %>% 
     group_by(household_code,purchase_date) %>%
     distinct(trip_code_uc) %>% 
     group_by(household_code) %>% 
     mutate(trip_seq = seq_along(household_code)) %>% 
     ungroup() %>% 
     left_join(data,.) %>% 
     group_by(household_code,purchase_date,trip_seq) %>%
     distinct(Overall_Brand) %>%
     ungroup() %>%
     arrange(household_code,Overall_Brand,purchase_date,trip_seq) %>%
     mutate(repeat_purchase = if_else(household_code == lag(household_code) &
                                           Overall_Brand == lag(Overall_Brand) & 
                                           trip_seq == lag(trip_seq) + 1,
                                      TRUE,FALSE,missing=FALSE)) %>%
     left_join(data,.) -> result
result <- arrange(result,household_code,purchase_date,Overall_Brand)
print(as.data.frame(result[,c(1,3,5,6,7)]))

...and the output:

> print(as.data.frame(result[,c(1,3,5,6,7)]))
   household_code purchase_date Overall_Brand trip_seq repeat_purchase
1              76    2012-01-01        CTL BR        1           FALSE
2              76    2012-01-02        QUAKER        2           FALSE
3              76    2012-01-23        CTL BR        3           FALSE
4              76    2012-02-19       Kellogg        4           FALSE
5              76    2012-02-21        CTL BR        5           FALSE
6              76    2012-03-14       Kellogg        6           FALSE
7              76    2012-03-17        CTL BR        7           FALSE
8              76    2012-04-03        CTL BR        8            TRUE
9              76    2012-05-27        CTL BR        9            TRUE
10             76    2012-06-06       Kellogg       10           FALSE
11             76    2012-06-24       Kellogg       11            TRUE
12             76    2012-06-27 GENERAL MILLS       12           FALSE
13             76    2012-07-14         KASHI       13           FALSE
14             76    2012-07-29 GENERAL MILLS       14           FALSE
15             76    2012-08-11 GENERAL MILLS       15            TRUE
16             76    2012-09-02       Kellogg       16           FALSE
17             76    2012-09-27        QUAKER       17           FALSE
18             76    2012-10-12 GENERAL MILLS       18           FALSE
19             76    2012-10-12        QUAKER       18            TRUE
20             76    2012-11-04        CTL BR       19           FALSE
21             76    2012-11-24 GENERAL MILLS       20           FALSE
22             76    2012-12-11        CTL BR       21           FALSE
23            126    2012-01-02       Kellogg        1           FALSE
24            126    2012-01-09       Kellogg        2            TRUE
25            126    2012-02-15 GENERAL MILLS        3           FALSE
26            126    2012-02-15 GENERAL MILLS        3           FALSE
27            126    2012-02-19       Kellogg        4           FALSE
28            126    2012-02-27 GENERAL MILLS        5           FALSE
29            126    2012-03-06 GENERAL MILLS        6            TRUE
30            126    2012-03-06       Kellogg        6           FALSE
31            126    2012-03-08       Kellogg        7            TRUE

The sequence number is important because if we simply sort by Household, Brand and Trip ID, we can't tell whether the next trip number is truly the "next" purchase, as illustrated by Household 126 purchases of the Kellog Brand, where it is purchased on trip sequences 1, 2, 4, 6, and 7. Only purchases on trips 2 and 7 should be counted as repeat purchases in consecutive trips, per the request in the OP.

Len Greski
  • 10,505
  • 2
  • 22
  • 33
  • Hi @Len, there is a slight confusion I think in the sequence of the trips. For eg, Household 126, didn't buy Kellogg on 2,3,4,5, and 7 rather on 1,2,5,8,9. I think by mistake, you have ordered the data by trip ID. You were right in saying that sequence no is important and it should be the sequence in which data is arranged in the example. Also, if a Brand is purchased twice on the same trip, we may consider it as different purchase – Rajan Mishra Nov 29 '20 at 00:24
  • @RajanMishra - hello Rajan. I sorted the data by `Household` and `Trip ID` in order to create the `trip_seq` column, so ordering the data by `Trip ID` was deliberate, and not a mistake. Are you saying there is some other mechanism to determine which trips happened in order? It's not clear from the information provided in your question why ascending order of `Trip ID` is not a valid way to sort the trips. Please add context to your question and I can adjust my answer accordingly. – Len Greski Nov 29 '20 at 00:27
  • Hi @Len, actually the data is already sorted by the purchase dates and Trip ID is thus not a valid ID to sort rather it is the purchase dates. It was my mistake to not add the purchase dates column. I have added it to the data example. I have added it now in the example data to make it clearer. And, also, thank you so much for trying to help me out – Rajan Mishra Nov 29 '20 at 00:46
  • Thanks @RajanMishra. I'll change my code to use `purchase_date` and `trip_code_uc` to create the sequential trip IDs. What should happen when there are two shopping trips on the same day? In that case is it OK to sort by ascending `trip_code_uc`? – Len Greski Nov 29 '20 at 02:12
  • @RajanMishra - my answer has been updated, and it correctly classifies row 19 as a repeat purchase. – Len Greski Nov 29 '20 at 03:12
  • Hi @Len, I think it is almost there. The Only issue that remained is instances when households made 2 purchases of the same brand. More than 1 purchase of the same brand on a trip is to be considered as a different purchase for calculating repeat. For eg in row number 25 and 26, 2 instances of GM is there, so, one of them would be considered as a repeat. – Rajan Mishra Nov 29 '20 at 07:13
  • @RajanMishra - based on the rules you provided in the question, a brand is only to be counted as a repeat purchase if it was purchased in the prior shopping trip. Therefore, row 26 is not a repeat, since the purchase happened during the same shopping trip as row 25. – Len Greski Nov 29 '20 at 19:20
  • HI @Len, actually I mentioned that in the description that "more than 1 purchase of the same brand in a trip maybe considered as a different purchase for calculating repeat". However, I missed that in the output image. IF you could please consider that, it would be great. – Rajan Mishra Nov 29 '20 at 19:33
  • Let us [continue this discussion in chat](https://chat.stackoverflow.com/rooms/225295/discussion-between-len-greski-and-rajan-mishra). – Len Greski Nov 29 '20 at 20:13
2

On second thought, I think things can be much easier if we introduce a function like this

conditional_count <- function(x, fcond) {
  fcond <- rlang::as_function(fcond)
  x <- fcond(x)
  x[x] <- with(rle(x), sequence(lengths[values]))
  x
}

fcond needs to be a function that applies to x and returns only TRUE or FALSE. The output of this function is a sequence for each streak of TRUEs and 0 otherwise. For example,

> conditional_count(c(1,1,2,3,3,3,5), duplicated)
[1] 0 1 0 0 1 2 0

We can then achieve what you want in two steps. First, generate a rleid for each trip. Then, for each Household and Brand, we count each trip id that is exactly the same as or just one plus the id for the previous trip. The first condition indicates that one trip has two purchases of the same brand. The second condition indicates that the same brand is found on the last trip. Both suggest that the brand is a repeat as per your requirements. For example, if a brand occurs for trips 1,1,2,3,3,5,6, then the count will be 0,1,2,3,4,0,1. Then, the code is just

library(dplyr)

e1 %>% 
  group_by(Household) %>% 
  mutate(trip_seq = rleid(`Trip ID`)) %>% 
  group_by(Household, Brand) %>% 
  mutate(
    Repeat = conditional_count(trip_seq, ~. - lag(., default = -1L) < 2L),
    trip_seq = NULL
  )

Output

# A tibble: 31 x 5
# Groups:   Household, Brand [7]
   Household  `Trip ID` Retailer Brand         Repeat
       <dbl>      <int>    <dbl> <fct>          <int>
 1        76 1032497498    11024 CTL BR             0
 2        76 1025776063    11024 QUAKER             0
 3        76 1029419047    11024 CTL BR             0
 4        76 1030418100    11024 Kellogg            0
 5        76 1029502602    11024 CTL BR             0
 6        76 1034153056    11024 Kellogg            0
 7        76 1027035051    11024 CTL BR             0
 8        76 1027533991    11024 CTL BR             1
 9        76 1033515804    11024 CTL BR             2
10        76 1032998207    11024 Kellogg            0
11        76 1032066227    11024 Kellogg            1
12        76 1028192785    11024 GENERAL MILLS      0
13        76 1033419039    11024 KASHI              0
14        76 1028730296    11024 GENERAL MILLS      0
15        76 1027388499    11024 GENERAL MILLS      1
16        76 1030652869    11024 Kellogg            0
17        76 1025638394     2353 QUAKER             0
18        76 1034032718    11024 GENERAL MILLS      0
19        76 1034032718    11024 QUAKER             1
20        76 1025678520     2353 CTL BR             0
21        76 1029490031    11024 GENERAL MILLS      0
22        76 1029898838    11024 CTL BR             0
23       126 1028024134    63882 Kellogg            0
24       126 1030324171      650 Kellogg            1
25       126 1031983761    89960 GENERAL MILLS      0
26       126 1031983761    89960 GENERAL MILLS      1
27       126 1033767148      650 Kellogg            0
28       126 1023953965    89960 GENERAL MILLS      0
29       126 1030954113    89960 GENERAL MILLS      1
30       126 1030954113    89960 Kellogg            0
31       126 1027392968      650 Kellogg            1
ekoam
  • 8,744
  • 1
  • 9
  • 22
  • Hi @ekoam, If you see row 17 and 19 (trips containing the purchase of Quaker), in row 19, it should be treated as a repeated purchase as it was bought on the last trip (1025638394) also apart from purchased in the next trip (1034032718). So, I have to compare the shopping instances of different trips. – Rajan Mishra Nov 28 '20 at 22:58
  • Hi @ekoam, I think it is almost there. The Only issue that remained is instances when households made 2 purchases of the same brand. More than 1 purchase of the same brand on a trip is to be considered as a different purchase for calculating repeat. For eg in row number 25 and 26, 2 instances of GM is there, so, one of them would be considered as a repeat. – Rajan Mishra Nov 29 '20 at 07:14
  • But your expected output does not say so. For instance, see Trip ID 1031983761 in that picture. Could you please explain that? @RajanMishra – ekoam Nov 29 '20 at 07:28
  • Ohh. Yes. So sorry. that was my mistake while putting the values. Actually, that should be 1 in 2nd instance, I wrote it in the description but forgot to put in there. – Rajan Mishra Nov 29 '20 at 07:48
  • HI @ekoam. I tried to run it. However, it is having the following error - Error: Column `Repeat` can't be converted from logical to integer – Rajan Mishra Nov 29 '20 at 19:28
  • @RajanMishra sorry I can't reproduce that error. Perhaps you can try removing `Repeat` in `e1` and then running the code. – ekoam Nov 30 '20 at 01:10
2

Though I have upvoted Len Greski's solution proposed, yet I feel the code can be shortened like this (there is no need to join).

EDIT On second thoughts I am proposing a solution using dense_rank instead of cur_group_id

library(dplyr)

df_result <- df %>% group_by(household_code) %>%
  arrange(purchase_date, trip_code_uc) %>% 
  mutate(shop_id = dense_rank(paste(purchase_date, trip_code_uc)),
         brand_seq = dense_rank(Overall_Brand)) %>%
  group_by(household_code, brand_seq) %>%
  mutate(cond = ifelse(shop_id == lag(shop_id)  | shop_id == lag(shop_id)+1, TRUE, FALSE),
         cond = ifelse(is.na(cond), FALSE, cond)) %>%
  ungroup() %>%
  select(-shop_id, -brand_seq)

#Check for true rows
> df_result %>% filter(cond == T)
# A tibble: 9 x 6
  household_code trip_code_uc purchase_date retailer_code Overall_Brand cond 
           <dbl>        <int> <fct>                 <dbl> <fct>         <lgl>
1            126   1030324171 2012-01-09              650 Kellogg       TRUE 
2            126   1031983761 2012-02-15            89960 GENERAL MILLS TRUE 
3            126   1030954113 2012-03-06            89960 GENERAL MILLS TRUE 
4            126   1027392968 2012-03-08              650 Kellogg       TRUE 
5             76   1027533991 2012-04-03            11024 CTL BR        TRUE 
6             76   1033515804 2012-05-27            11024 CTL BR        TRUE 
7             76   1032066227 2012-06-24            11024 Kellogg       TRUE 
8             76   1027388499 2012-08-11            11024 GENERAL MILLS TRUE 
9             76   1034032718 2012-10-12            11024 QUAKER        TRUE 

Note That I have generated shop_id keeping in mind that two different purchases can be made by one customer on one day.

Earlier proposed solution

library(tidyverse)

df %>% group_by(household_code, purchase_date, trip_code_uc) %>%
  arrange(purchase_date, trip_code_uc) %>% 
  mutate(shop_id = cur_group_id()) %>%
  group_by(household_code, Overall_Brand) %>%
  mutate(brand_seq = cur_group_id()) %>%
  group_by(household_code, brand_seq) %>%
  mutate(cond = ifelse(shop_id == lag(shop_id)  | shop_id == lag(shop_id)+1, TRUE, FALSE),
         cond = ifelse(is.na(cond), FALSE, cond)) %>%
  ungroup() %>%
  select(-shop_id, -brand_seq)

# A tibble: 31 x 6
   household_code trip_code_uc purchase_date retailer_code Overall_Brand cond 
            <dbl>        <int> <fct>                 <dbl> <fct>         <lgl>
 1             76   1032497498 2012-01-01            11024 CTL BR        FALSE
 2             76   1025776063 2012-01-02            11024 QUAKER        FALSE
 3            126   1028024134 2012-01-02            63882 Kellogg       FALSE
 4            126   1030324171 2012-01-09              650 Kellogg       TRUE 
 5             76   1029419047 2012-01-23            11024 CTL BR        FALSE
 6            126   1031983761 2012-02-15            89960 GENERAL MILLS FALSE
 7            126   1031983761 2012-02-15            89960 GENERAL MILLS TRUE 
 8             76   1030418100 2012-02-19            11024 Kellogg       FALSE
 9            126   1033767148 2012-02-19              650 Kellogg       FALSE
10             76   1029502602 2012-02-21            11024 CTL BR        FALSE
# ... with 21 more rows

If we check with TRUE rows only

# A tibble: 9 x 6
  household_code trip_code_uc purchase_date retailer_code Overall_Brand cond 
           <dbl>        <int> <fct>                 <dbl> <fct>         <lgl>
1            126   1030324171 2012-01-09              650 Kellogg       TRUE 
2            126   1031983761 2012-02-15            89960 GENERAL MILLS TRUE 
3            126   1030954113 2012-03-06            89960 GENERAL MILLS TRUE 
4            126   1027392968 2012-03-08              650 Kellogg       TRUE 
5             76   1027533991 2012-04-03            11024 CTL BR        TRUE 
6             76   1033515804 2012-05-27            11024 CTL BR        TRUE 
7             76   1032066227 2012-06-24            11024 Kellogg       TRUE 
8             76   1027388499 2012-08-11            11024 GENERAL MILLS TRUE 
9             76   1034032718 2012-10-12            11024 QUAKER        TRUE 

In this solution, group_ids aren't renumbered at every household. If you specifically want to restart numbering at every houseld, this code may help

df %>% 
  group_split(household_code) %>%
  purrr::map_df(~.x %>% arrange(purchase_date, trip_code_uc) %>%
                  group_by(purchase_date, trip_code_uc) %>% 
                  mutate(shop_id = cur_group_id()) %>%
                  group_by(Overall_Brand) %>%
                  mutate(brand_seq = cur_group_id())) %>%
  group_by(household_code, brand_seq) %>%
  mutate(cond = ifelse(shop_id == lag(shop_id)  | shop_id == lag(shop_id)+1, TRUE, FALSE),
         cond = ifelse(is.na(cond), FALSE, cond)) %>%
  ungroup()

# A tibble: 31 x 8
   household_code trip_code_uc purchase_date retailer_code Overall_Brand shop_id brand_seq cond 
            <dbl>        <int> <fct>                 <dbl> <fct>           <int>     <int> <lgl>
 1             76   1032497498 2012-01-01            11024 CTL BR              1         1 FALSE
 2             76   1025776063 2012-01-02            11024 QUAKER              2         5 FALSE
 3             76   1029419047 2012-01-23            11024 CTL BR              3         1 FALSE
 4             76   1030418100 2012-02-19            11024 Kellogg             4         4 FALSE
 5             76   1029502602 2012-02-21            11024 CTL BR              5         1 FALSE
 6             76   1034153056 2012-03-14            11024 Kellogg             6         4 FALSE
 7             76   1027035051 2012-03-17            11024 CTL BR              7         1 FALSE
 8             76   1027533991 2012-04-03            11024 CTL BR              8         1 TRUE 
 9             76   1033515804 2012-05-27            11024 CTL BR              9         1 TRUE 
10             76   1032998207 2012-06-06            11024 Kellogg            10         4 FALSE
# ... with 21 more rows
AnilGoyal
  • 25,297
  • 4
  • 27
  • 45
  • HI @AnilGoyal, While trying to run your code, I am encountering the following error - Error in cur_group_id() : could not find function "cur_group_id". Also, I mentioned in the earlier answers, "more than 1 purchase of the same brand in a trip may be considered as a different purchase for calculating repeat". However, I missed that in the output image. If you could please consider that, it would be great – Rajan Mishra Nov 29 '20 at 19:34
  • Nice answer, Anil. Upvoted. I keep forgetting about `group_split()` and `map_df()`. I originally included the second join in my answer because I was trying to count distinct occurrences of `brand` within `trip`. Accounting for the requirement to treat 2nd+ purchases of same brand in same shopping trip eliminated need for second join. As noted in your answer, `group_split()` with `map_df()` eliminates it entirely. – Len Greski Nov 29 '20 at 20:26
  • Thanks Len. @Rajan the solution caters to requirement of same shopping trip. Regarding the function `cur_group_id` let me propose an alternate solution, which just came to my mind, shortly. – AnilGoyal Nov 30 '20 at 00:47
  • Please see the edited solution. Both requirements are fulfilled - (i) `cur_group_id` not used, (ii) check for same trip id also. Moreover, I feel this one is shorter therefore I have added this solution on top of my answer. – AnilGoyal Nov 30 '20 at 02:12
  • 1
    Thank you, Anil. Your solution also seems to answer solve the issue. Since I have already used @LenGreski answer for my code, I upvoted your answer. – Rajan Mishra Nov 30 '20 at 08:38