0

Problem

I want to mutate my dataset based with a calculation based on a conditional without filtering that original dataset.

Example

Dataframe

df <- data.frame(amounts = c("2.95", "2.95", "14.95", "14.95", 
    "14.95", "-14.95", "-14.95", "-14.95", "-14.95"), 
    operation_code = c(100, 100, 101, 101, 101, 110, 110, 110, 110), 
    user_id = c(999, 111, 999, 111, 999, 111, 111, 999, 999))

This does the mutation I want but it filters by operation code == 110. I want to keep all of the data from the data.frame and only populate fees for the operation_type 110.

df <- df %>% 
    group_by(user_id) %>%
    filter(operation_code == 110) %>%
    mutate(fees = n() * 20)

This is what the result should look like:

| amounts | operation_code | user_id | fees |
|---------|----------------|---------|------|
| 2.95    | 100            | 999     | NA   |
| 2.95    | 100            | 111     | NA   |
| 14.95   | 101            | 999     | NA   |
| 14.95   | 101            | 111     | NA   |
| 14.95   | 101            | 999     | NA   |
| -14.95  | 110            | 111     | 40   |
| -14.95  | 110            | 111     | 40   |
| -14.95  | 110            | 999     | 40   |
| -14.95  | 110            | 999     | 40   |
Hack-R
  • 22,422
  • 14
  • 75
  • 131
sharly
  • 237
  • 2
  • 7

1 Answers1

2

In addition to the original, straight-forward solution by Hack-R:

df <- data.frame(amounts = c("2.95", "2.95", "14.95", "14.95", "14.95", "-14.95", "-14.95", "-14.95", "-14.95"), operation_code = c(100, 100, 101, 101, 101, 110, 110, 110, 110), user_id = c(999, 111, 999, 111, 999, 111, 111, 999, 999))
require(dplyr)

df$fees <- NA
df[df$operation_code==110,] <- df[df$operation_code==110,] %>% 
                                      group_by(user_id) %>%
                                      mutate(fees = n() * 20)
df
#  amounts operation_code user_id  fees
#   <fctr>          <dbl>   <dbl> <dbl>
#1    2.95            100     999    NA
#2    2.95            100     111    NA
#3   14.95            101     999    NA
#4   14.95            101     111    NA
#5   14.95            101     999    NA
#6  -14.95            110     111    40
#7  -14.95            110     111    40
#8  -14.95            110     999    40
#9  -14.95            110     999    40

Alternate ways of getting to this result include the following:

After grouping by 'user_id', we can use ifelse to create the 'fees' column.

df %>%
   group_by(user_id) %>%
   mutate(fees = ifelse(operation_code == 110, sum(operation_code==110)*20, NA_real_))
#  amounts operation_code user_id  fees
#   <fctr>          <dbl>   <dbl> <dbl>
#1    2.95            100     999    NA
#2    2.95            100     111    NA
#3   14.95            101     999    NA
#4   14.95            101     111    NA
#5   14.95            101     999    NA
#6  -14.95            110     111    40
#7  -14.95            110     111    40
#8  -14.95            110     999    40
#9  -14.95            110     999    40

Or without using ifelse

df %>%
   group_by(user_id) %>% 
   mutate(fees = sum(operation_code==110)*20 * NA^(operation_code!=110))

NOTE: This used the dplyr methods for getting the expected output.


A compact option with data.table would be

library(data.table)
setDT(df)[operation_code == 110, fees := .N * 20 , user_id]
df
#   amounts operation_code user_id fees
#1:    2.95            100     999   NA
#2:    2.95            100     111   NA
#3:   14.95            101     999   NA
#4:   14.95            101     111   NA
#5:   14.95            101     999   NA
#6:  -14.95            110     111   40
#7:  -14.95            110     111   40
#8:  -14.95            110     999   40
#9:  -14.95            110     999   40

Or we can use base R methods

with(df, ave(operation_code, user_id, FUN = 
       function(x) ifelse(x ==110, sum(x==110)*20, NA) ))
#[1] NA NA NA NA NA 40 40 40 40
Hack-R
  • 22,422
  • 14
  • 75
  • 131
akrun
  • 874,273
  • 37
  • 540
  • 662