0

I have a data frame that looks something like this

user booking_date origin destination  price  sale_channel
1 user5   2018-11-01    MAD         PMI  58.20        online
2 user7   2018-11-01    DUB         MAD 147.50        online
3 user4   2018-11-02    TFS         MAD  24.05        online
4 user7   2018-11-01    LPA         MAD  37.30   call center
5 user1   2018-11-01    AMS         MAD 149.74 travel agency
6 user1   2018-11-01    MAD         PMI  19.95        online

Now I want to:

  1. Filter by a specific date (2018-11-02) which I've done by
df <- df[df$booking_date == '2018-11-02',]
  1. Group by the user and sum u their total spending on tickets.

I've tried a couple of approaches using group_by or aggregate but all I managed to get was an additional column but not grouping or aggregating the value per user.

  1. My final output would idealy by something enabling me to extract the 10 users that spend the most on tickets like this for example:
'user1' 'user10' 'user 7' etc.

Example Data

structure(list(user = c("user4", "user5", "user3", "user10", 
"user1", "user2", "user7", "user6", "user5", "user6", "user6", 
"user7", "user1", "user7", "user4", "user4", "user1", "user7", 
"user7", "user8", "user4", "user10", "user4", "user8", "user3", 
"user9", "user5", "user2", "user5", "user3", "user3", "user9", 
"user6", "user10", "user9", "user5", "user3", "user5", "user7", 
"user9", "user2", "user2", "user7", "user10", "user7", "user3", 
"user1", "user2", "user8", "user6", "user6", "user10", "user4", 
"user7", "user4", "user1", "user4", "user2", "user1", "user7", 
"user5", "user4", "user4", "user7", "user10"), booking_date = structure(c(17837, 
17837, 17837, 17837, 17837, 17837, 17837, 17837, 17837, 17837, 
17837, 17837, 17837, 17837, 17837, 17837, 17837, 17837, 17837, 
17837, 17837, 17837, 17837, 17837, 17837, 17837, 17837, 17837, 
17837, 17837, 17837, 17837, 17837, 17837, 17837, 17837, 17837, 
17837, 17837, 17837, 17837, 17837, 17837, 17837, 17837, 17837, 
17837, 17837, 17837, 17837, 17837, 17837, 17837, 17837, 17837, 
17837, 17837, 17837, 17837, 17837, 17837, 17837, 17837, 17837, 
17837), class = "Date"), origin = c("TFS", "MAD", "TFN", "MAD", 
"LYS", "LPA", "MAD", "MAD", "AMS", "MAD", "MAD", "MAD", "MAD", 
"LPA", "MAD", "MAD", "MAD", "TXL", "MAD", "MAD", "TXL", "TXL", 
"TFS", "MAD", "NAP", "TFS", "PMI", "TLS", "TFS", "NTE", "AMS", 
"FUE", "TFN", "CPH", "TFN", "MAD", "SVQ", "SCQ", "SVQ", "MAD", 
"PMI", "MAD", "PMI", "MAD", "MAD", "MAD", "MAD", "MAD", "SVQ", 
"NCE", "CDG", "MAD", "MAD", "MAD", "MAD", "MAD", "TFN", "LGW", 
"LGW", "MAD", "TFN", "MAD", "RNS", "AGP", "CDG"), destination = c("MAD", 
"DUB", "MAD", "TFS", "MAD", "MAD", "NAP", "TLS", "MAD", "SCQ", 
"LPA", "TFN", "TXL", "MAD", "TFN", "SVQ", "ACE", "MAD", "TLS", 
"SCQ", "MAD", "MAD", "MAD", "LPA", "MAD", "MAD", "MAD", "MAD", 
"MAD", "MAD", "MAD", "MAD", "MAD", "MAD", "MAD", "CAG", "MAD", 
"MAD", "MAD", "LPA", "MAD", "CDG", "MAD", "LPA", "TFS", "TFN", 
"PMI", "NAP", "MAD", "MAD", "MAD", "LPA", "LGW", "LPA", "CDG", 
"SPC", "MAD", "MAD", "MAD", "SCQ", "MAD", "SVQ", "MAD", "MAD", 
"MAD"), price = c(24.0499992371, 41.5400009155, 251.199996948, 
15.6000003815, 44.0099983215, 73.8499984741, 115.470001221, 69.4400024414, 
81.3899993896, 15.2399997711, 41.1199989319, 274.559997559, 150, 
29.3199996948, 332.440002441, 94.9100036621, 97.9800033569, 55.1199989319, 
81.7399978638, 4.86000013351, 39.0299987793, 53.6300010681, 39.3199996948, 
114.559997559, 65.4000015259, 96.2900009155, 41.75, 28.9099998474, 
25.1900005341, 14, 50.3100013733, 47.2999992371, 53.8199996948, 
91.3199996948, 77.6800003052, 17.8099994659, 96.5400009155, 27.6900005341, 
34.1399993896, 34.3300018311, 15.5600004196, 158.449996948, 45.2999992371, 
36.8100013733, 50.6800003052, 62.9000015259, 13.7399997711, 49.4399986267, 
66.1100006104, 95.4400024414, 41.8400001526, 69.8300018311, 60.6599998474, 
34.3300018311, 97.1399993896, 84.3099975586, 25.8099994659, 185.899993896, 
34.8899993896, 304.380004883, 15.1300001144, 29.1399993896, 133.529998779, 
208.910003662, 152.960006714), sale_channel = c("online", "online", 
"travel agency", "online", "online", "online", "travel agency", 
"online", "travel agency", "travel agency", "travel agency", 
"online", "travel agency", "online", "online", "travel agency", 
"online", "travel agency", "travel agency", "call center", "online", 
"online", "online", "travel agency", "travel agency", "travel agency", 
"online", "online", "online", "online", "online", "travel agency", 
"travel agency", "online", "travel agency", "call center", "online", 
"travel agency", "online", "online", "travel agency", "travel agency", 
"online", "travel agency", "online", "online", "online", "travel agency", 
"online", "travel agency", "travel agency", "online", "online", 
"online", "online", "online", "online", "online", "online", "travel agency", 
"travel agency", "travel agency", "online", "travel agency", 
"online"), total = c(876.0300006858, 250.5000009536, 540.3500022886, 
420.1500110628, 424.9299983977, 512.1099882118, 1233.9500045785, 
316.9000034332, 250.5000009536, 316.9000034332, 316.9000034332, 
1233.9500045785, 424.9299983977, 1233.9500045785, 876.0300006858, 
876.0300006858, 424.9299983977, 1233.9500045785, 1233.9500045785, 
185.52999830291, 876.0300006858, 420.1500110628, 876.0300006858, 
185.52999830291, 540.3500022886, 255.6000022889, 250.5000009536, 
512.1099882118, 250.5000009536, 540.3500022886, 540.3500022886, 
255.6000022889, 316.9000034332, 420.1500110628, 255.6000022889, 
250.5000009536, 540.3500022886, 250.5000009536, 1233.9500045785, 
255.6000022889, 512.1099882118, 512.1099882118, 1233.9500045785, 
420.1500110628, 1233.9500045785, 540.3500022886, 424.9299983977, 
512.1099882118, 185.52999830291, 316.9000034332, 316.9000034332, 
420.1500110628, 876.0300006858, 1233.9500045785, 876.0300006858, 
424.9299983977, 876.0300006858, 512.1099882118, 424.9299983977, 
1233.9500045785, 250.5000009536, 876.0300006858, 876.0300006858, 
1233.9500045785, 420.1500110628)), row.names = c(3L, 
26L, 37L, 42L, 48L, 82L, 89L, 100L, 112L, 124L, 133L, 144L, 148L, 
150L, 166L, 167L, 173L, 182L, 217L, 243L, 259L, 285L, 300L, 304L, 
306L, 336L, 341L, 366L, 388L, 397L, 413L, 417L, 423L, 452L, 457L, 
473L, 474L, 478L, 482L, 483L, 486L, 496L, 499L, 504L, 510L, 513L, 
529L, 531L, 558L, 605L, 615L, 628L, 629L, 664L, 669L, 672L, 684L, 
722L, 730L, 752L, 766L, 767L, 779L, 804L, 819L), class = c("data.table", 
"data.frame"))

Any recommendations on how to approach this?

Edit: Final Solution

df <- df[df$booking_date == '2018-11-02',]
total <- aggregate(price ~ user, df, sum)
top_10 <- total[order(total$price, decreasing = T), ]
top_10[1:5, 1]

That gives me

"user7" "user4" "user3" "user2" "user1"
Darren Tsai
  • 32,117
  • 5
  • 21
  • 51
Jack Someone
  • 97
  • 1
  • 10
  • try `df$total <- ave(df$price, df$user, FUN = sum)`. – Darren Tsai Feb 12 '20 at 18:51
  • thanks, goes into the right direction, but how do I now actually group by the user or at least retreive the n number of biggest spenders? – Jack Someone Feb 12 '20 at 19:03
  • Please provide [a minimal reproducible data](https://stackoverflow.com/q/5963269/10068985) and your expected output. You can produce a example data or `dput()` part of your real data. – Darren Tsai Feb 12 '20 at 19:14
  • I edited my initial question, the desired output is under point 3 – Jack Someone Feb 12 '20 at 19:21
  • `aggregate(price ~ user, df, sum)` solve your question? You just need to sort the price to see who spends the most. – Darren Tsai Feb 12 '20 at 19:31
  • Thanks, but only sort of. I tried both using ```order()``` and ```sort()``` to get to the final solution. But I still don't manage to display both columns price and users sorted by price decreasing or just the users sorted decreasing by the values in thee prices column... – Jack Someone Feb 12 '20 at 19:57

1 Answers1

0

Try this

total <- aggregate(price ~ user, df, sum)
total[order(total$price, decreasing = T), ]

or

dplyr::arrange(total, desc(price))

or

sort(tapply(df$price, df$user, sum), decreasing = T)
Darren Tsai
  • 32,117
  • 5
  • 21
  • 51
  • That does the trick, I'll post my final solution in the original question. I assume there is a more elegant way but it works, thanks! – Jack Someone Feb 12 '20 at 20:20