0

I'd like to restructure my data frame, which features data on whether a home was viewed, purchased, or sold, and what the date for the viewing/purchase/sale was.

The data has been enterred in such a way that whilst there are multiple date variables (var_datevar_A5_3_B32,var_datevar_A5_4_B33,var_datevar_A5_5_B34), each of these variables contains a mix of sales-dates, purchase-dates, and viewing-dates.

So if a want to know if a date in eg var_datevar_A5_3_B32 is a viewing, sales, or purchase date, I need to check variables with a matching prefix (A5_3_B32), ie var_viewed_A5_3_B32, var_sold_A5_3_B32, and var_purchased_A5_3_B32, and see which one is coded as "Y".

I'd like to add three new variables to my data frame: a sales dates only variable (var_date_sold), a purchase dates only variable (var_date_purchased), and a viewing dates only variable (var_date_viewed), like in dfgoal.

I've made an attempt but apart from that I can't figure out how to paste the date into my new variable, I'd like a more general solution because in my real data set I've got loads of variables like this.

Thanks for any help!

# Starting point 
df <- data.frame(var_home=c("a","b","c","d","e"),
                 var_datevar_A5_3_B32=c("01-12-2012","05-06-2014","03-09-2015","03-03-2016",NA),var_datevar_A5_4_B33=c("07-12-2012",NA,"07-07-2017","13-03-2016",NA),var_datevar_A5_5_B34=c("05-03-2013",NA,NA,"23-03-2016",NA),var_purchased_A5_3_B32=c(NA,"Y",NA,NA,NA),var_purchased_A5_4_B33=c("N","N","Y",NA,NA),var_purchased_A5_5_B34=c("Y",NA,NA,NA,NA),
                 var_viewed_A5_3_B32=c("Y","N","Y","Y",NA),var_viewed_A5_4_B33=c(NA,NA,NA,"Y","N"),var_viewed_A5_5_B34=c(NA,NA,NA,"Y",NA),var_sold_A5_3_B32=c("NA","N",NA,"N",NA),var_sold_A5_4_B33=c("Y","N",NA,"N",NA),var_sold_A5_5_B34=c("N",NA,NA,NA,"Y"))



# Goal
dfgoal <- data.frame(var_home=c("a","b","c","d","e"),
                 var_datevar_A5_3_B32=c("01-12-2012","05-06-2014","03-09-2015","03-03-2016",NA),var_datevar_A5_4_B33=c("07-12-2012",NA,"07-07-2017","13-03-2016",NA),var_datevar_A5_5_B34=c("05-03-2013",NA,NA,"23-03-2016",NA),var_purchased_A5_3_B32=c(NA,"Y",NA,NA,NA),var_purchased_A5_4_B33=c("N","N","Y",NA,NA),var_purchased_A5_5_B34=c("Y",NA,NA,NA,NA),
                 var_viewed_A5_3_B32=c("Y","N","Y","Y",NA),var_viewed_A5_4_B33=c(NA,NA,NA,"Y","N"),var_viewed_A5_5_B34=c(NA,NA,NA,"Y",NA),var_sold_A5_3_B32=c("NA","N",NA,"N",NA),var_sold_A5_4_B33=c("Y","N",NA,"N",NA),var_sold_A5_5_B34=c("N",NA,NA,NA,"Y"),
                 var_date_viewed =c("01-12-2012",NA,"03-09-2015","03-03-2016,13-03-2016,23-03-2016",NA),var_date_sold=c("07-12-2012",NA,NA,NA,NA),var_date_purchased=c("05-03-2013","05-06-2014","07-07-2017",NA,NA))



# Attempt pseudo-code 
var_date_viewed <- ifelse((!is.na(var_datevar_A5_3_B32) & var_viewed_A5_3_B32=="Y") | (!is.na(var_datevar_A5_4_B33) & var_viewed_A5_4_B33=="Y") | (!is.na(var_datevar_A5_5_B34) & var_viewed_A5_5_B34=="Y"), PASTE_THE_DATE,NA)
LLL
  • 723
  • 1
  • 9
  • 27

1 Answers1

0

A different approach using the packages dplyr, tidyr and stringr:

library(tidyr)
library(dplyr)
library(stringr)

df %>% 
  gather(datevar, date, contains("datevar")) %>%
  gather(action, action_yn, -datevar, -date, -var_home) %>%
  mutate(date_code = str_extract(datevar, "A\\d_\\d_B\\d\\d$"),
         action_code = str_extract(action, "A\\d_\\d_B\\d\\d$"), 
         action = str_extract(action, "^var_[[a-z][^_]]+"), 
         action = str_remove(action, "^var_")) %>% 
  filter(date_code == action_code) %>% 
  filter(action_yn == "Y") %>% 
  select(-datevar) %>% 
  group_by(var_home, action) %>% 
  summarise(date = str_c(date, collapse = ",")) %>% 
  mutate(action = str_c("var_date_", action)) %>% 
  spread(action, date)

# A tibble: 5 x 4
# Groups:   var_home [5]
#   var_home var_date_purchased var_date_sold var_date_viewed                 
#   <fct>    <chr>              <chr>         <chr>                           
# 1 a        05-03-2013         07-12-2012    01-12-2012                      
# 2 b        05-06-2014         NA            NA                              
# 3 c        07-07-2017         NA            03-09-2015                      
# 4 d        NA                 NA            03-03-2016,13-03-2016,23-03-2016
# 5 e        NA                 NA            NA        

I'll split the code into several parts to explain, what is done.
First transform the data from wide to long format and first gather all datvar and then all actions together in one column each with the corresponding value (date and action_yn).

df <- df %>% 
  gather(datevar, date, contains("datevar")) %>%
  gather(action, action_yn, -datevar, -date, -var_home)

df
#     var_home              datevar       date                 action action_yn
# 1          a var_datevar_A5_3_B32 01-12-2012 var_purchased_A5_3_B32      <NA>
# 2          b var_datevar_A5_3_B32 05-06-2014 var_purchased_A5_3_B32         Y
# 3          c var_datevar_A5_3_B32 03-09-2015 var_purchased_A5_3_B32      <NA>
# 4          d var_datevar_A5_3_B32 03-03-2016 var_purchased_A5_3_B32      <NA>
# 5          e var_datevar_A5_3_B32       <NA> var_purchased_A5_3_B32      <NA>
# ...

Then extract the code (the last part of each column name) with a regular expression. Here I assume, that this is always structured like A#_#_B## where # is a number. Further the specific action is extracted again using a regular exxpression.

df <- df %>%
  mutate(date_code = str_extract(datevar, "A\\d_\\d_B\\d\\d$"),
         action_code = str_extract(action, "A\\d_\\d_B\\d\\d$"), 
         action = str_extract(action, "^var_[[a-z][^_]]+"), 
         action = str_remove(action, "^var_")) 

df
#     var_home              datevar       date    action action_yn date_code action_code
# 1          a var_datevar_A5_3_B32 01-12-2012 purchased      <NA>  A5_3_B32    A5_3_B32
# 2          b var_datevar_A5_3_B32 05-06-2014 purchased         Y  A5_3_B32    A5_3_B32
# 3          c var_datevar_A5_3_B32 03-09-2015 purchased      <NA>  A5_3_B32    A5_3_B32
# 4          d var_datevar_A5_3_B32 03-03-2016 purchased      <NA>  A5_3_B32    A5_3_B32
# 5          e var_datevar_A5_3_B32       <NA> purchased      <NA>  A5_3_B32    A5_3_B32
# ...

As we're only interested in the actions with "Y" and onlöy the dates corresponding to the right action we filter by de codes and action_yn. The datevar is not needed anymore and therefore dropped.

df <- df %>% 
  filter(date_code == action_code) %>% 
  filter(action_yn == "Y") %>% 
  select(-datevar) 

df
#     var_home       date    action action_yn date_code action_code
#  1         b 05-06-2014 purchased         Y  A5_3_B32    A5_3_B32
#  2         c 07-07-2017 purchased         Y  A5_4_B33    A5_4_B33
#  3         a 05-03-2013 purchased         Y  A5_5_B34    A5_5_B34
#  4         a 01-12-2012    viewed         Y  A5_3_B32    A5_3_B32
#  5         c 03-09-2015    viewed         Y  A5_3_B32    A5_3_B32
#  6         d 03-03-2016    viewed         Y  A5_3_B32    A5_3_B32
#  7         d 13-03-2016    viewed         Y  A5_4_B33    A5_4_B33
#  8         d 23-03-2016    viewed         Y  A5_5_B34    A5_5_B34
#  9         a 07-12-2012      sold         Y  A5_4_B33    A5_4_B33
# 10         e       <NA>      sold         Y  A5_5_B34    A5_5_B34

Finally for each var_home and each action, the dates are pasted together (if there are several), the action is renamed (to get the desired final columnnames) and the each action is spread in its own column (with spread):

df <- df %>% 
  group_by(var_home, action) %>% 
  summarise(date = str_c(date, collapse = ",")) %>% 
  mutate(action = str_c("var_date_", action)) %>% 
  spread(action, date)

If you want to have this information together with the original data, assume this is stored in df_org. Then you can use join:

left_join(df_org, df, by = "var_home")
#   var_home var_datevar_A5_3_B32 var_datevar_A5_4_B33  ...  var_sold_A5_4_B33 var_sold_A5_5_B34 var_date_purchased var_date_sold var_date_viewed
# 1        a           01-12-2012           07-12-2012  ...  Y                 N                 05-03-2013         07-12-2012    01-12-2012
# 2        b           05-06-2014                 <NA>  ...  N                 <NA>              05-06-2014         <NA>          <NA>
# 3        c           03-09-2015           07-07-2017  ...  <NA>              <NA>              07-07-2017         <NA>          03-09-2015
# 4        d           03-03-2016           13-03-2016  ...  N                 <NA>              <NA>               <NA>          03-03-2016,13-03-2016,23-03-2016
# 5        e                 <NA>                 <NA>  ...  <NA>              Y                 <NA>               <NA>          <NA>
kath
  • 7,624
  • 17
  • 32