Make "tidy" tibble dataset with matrix variable

Question

I have a dataset in a one-row-per-person format, but I want to merge in a dataset as a matrix variable -- basically, if not for the need of the tidy format, I would use a fuzzy join to merge in data from a range of dates, creating n x 3 rows. Instead, I really want to merge them in as a matrix variable.

I've tried using the fuzzy join followed by a nest function, but my merge dataset contains several variables, so needing to explicitly state my variables for the nest function would be a ton of work and ultimately look messy (though, I suppose if this is my only way forward, I could commit to this).

By way of providing a reproducible example of what I am thinking, I've created the fake dataset below:

person_data <- tibble(ID=paste0("ID", 1:50), outcome=sample(0:1, 50, T), date=sample(5:145, 50, F),min_date=date-5)
value_data <- tibble(date=seq(1:150), exp1=sample(20:100, 150, T), exp2=sample(20:100, 150, T))

My first thought was to combine fuzzy join and the nest function like so:

merged <- fuzzyjoin::fuzzy_left_join(person_data, value_data, by=c("date" = "date", "min_date" = "date"), match_fun=c(`>=`,`<=`))

analysis_data <- merged %>% 
  select(-date.y) %>% 
  group_by(ID) %>% 
  nest(exp1=exp1, exp2=exp2)

The head of the resulting analysis_data tibble is here.

> analysis_data
# A tibble: 50 × 6
# Groups:   ID [50]
   ID    outcome date.x min_date exp1             exp2            
   <chr>   <int>  <int>    <dbl> <list>           <list>          
 1 ID1         1     39       34 <tibble [6 × 1]> <tibble [6 × 1]>
 2 ID2         1     87       82 <tibble [6 × 1]> <tibble [6 × 1]>
 3 ID3         0     23       18 <tibble [6 × 1]> <tibble [6 × 1]>
 4 ID4         1     60       55 <tibble [6 × 1]> <tibble [6 × 1]>
 5 ID5         1    120      115 <tibble [6 × 1]> <tibble [6 × 1]>
 6 ID6         0     35       30 <tibble [6 × 1]> <tibble [6 × 1]>
 7 ID7         0    131      126 <tibble [6 × 1]> <tibble [6 × 1]>
 8 ID8         0     83       78 <tibble [6 × 1]> <tibble [6 × 1]>
 9 ID9         0     68       63 <tibble [6 × 1]> <tibble [6 × 1]>
10 ID10        1    133      128 <tibble [6 × 1]> <tibble [6 × 1]>

And for the most part, it kind of does get me there because the resulting data has 50 rows but is there a better/simpler way of approaching this problem, especially if I had exp1-50 instead of exp1 and exp2? I'd also like to add an index matrix variable, but I'm unsure how to add that.

Realizing it doesn't fit with the rest of my examples, I basically want a dataset that looks like this in the end:

fake_data_goal <- tibble(ID=paste0("ID", 1:50), outcome=sample(0:1, 50, T), var=matrix(data=sample(50:100, 50*3, T), nrow=50), index=matrix(rep(c(1:3), 50), nrow=50, byrow=T))

> fake_data_goal
# A tibble: 50 × 4
   ID    outcome var[,1]  [,2]  [,3] index[,1]  [,2]  [,3]
   <chr>   <int>   <int> <int> <int>     <int> <int> <int>
 1 ID1         0      66    90    71         1     2     3
 2 ID2         0      95    98    75         1     2     3
 3 ID3         0      57    84    91         1     2     3
 4 ID4         1      78    89    64         1     2     3
 5 ID5         1      97    73    95         1     2     3
 6 ID6         0      52    96    76         1     2     3
 7 ID7         0      62    93    57         1     2     3
 8 ID8         0      62    76    81         1     2     3
 9 ID9         1      55    58    67         1     2     3
10 ID10        0      81    91    91         1     2     3

Thank you so much in advance!

Andy Baxter · Accepted Answer · 2022-08-09T23:35:34.040

As perhaps half of an answer which would at least mean you don't have to repeat the typing of exp, exp2... up to exp50 etc., you could pivot_longer before joining and pivot_wider after nesting to give the right number of list columns. These can then be plucked out into matrix columns by pattern matching column names:

library(tidyverse)

person_data <-
  tibble(
    ID = paste0("ID", 1:50),
    outcome = sample(0:1, 50, T),
    date = sample(5:145, 50, F),
    min_date = date - 5
  )

value_data <-
  tibble(
    date = seq(1:150),
    exp1 = sample(20:100, 150, T),
    exp2 = sample(20:100, 150, T)
  )


merged <-
  fuzzyjoin::fuzzy_left_join(
    person_data,
    value_data |> pivot_longer(-date, names_to = "exp", values_to = "val"),
    by = c("date" = "date", "min_date" = "date"),
    match_fun = c(`>=`, `<=`)
  )

merged |> 
  select(-date.y) |> 
  group_by(ID, exp) |> 
  nest(val = val) |> 
  pivot_wider(names_from = exp,
              values_from = val) |> 
  rowwise() |>
  mutate(across(starts_with("exp"), ~ t(as.matrix(.x$val))))
#> # A tibble: 50 × 6
#> # Rowwise:  ID
#>    ID    outcome date.x min_date exp1[,1]  [,2]  [,3]  [,4]  [,5]  [,6] exp2[,1]
#>    <chr>   <int>  <int>    <dbl>    <int> <int> <int> <int> <int> <int>    <int>
#>  1 ID1         0    118      113       96    83    69    59    42    34       60
#>  2 ID2         1      9        4       38    91    83    28    87    73       96
#>  3 ID3         1     83       78       49    31    33    84    57    50       83
#>  4 ID4         1     97       92       91    92    21    53    59    39       23
#>  5 ID5         1     71       66       97    28    56    91    67    43       98
#>  6 ID6         1     27       22       81    88    41    22    24    84       36
#>  7 ID7         1     64       59       46    51    88    76    39    63       53
#>  8 ID8         0     72       67       28    56    91    67    43    55       23
#>  9 ID9         0     80       75       76    79    62    49    31    33       66
#> 10 ID10        0     87       82       57    50    31    72    95    31       66
#> # … with 40 more rows, and 1 more variable: exp2[2:6] <int>

^{Created on 2022-08-10 by the reprex package (v2.0.1)}

As noted, it's not fully answering your question, but would at least get you to the half-way-acceptable stage you'd reached when producing analysis_data.

The things remaining would be:

is this where you expect your exp1 etc. variables to go?
it's not clear where your index values would come from for each row?

Nice answer! OP said "I really want to merge them in as a matrix variable" so I think this is the best solution so far +1 — jared_mamrot, Aug 10 '22 at 00:00
Yes, this hits the nail on the head! I hadn't thought of exploiting the `pivot_wider` and `pivot_longer` functions for this, but it definitely fits my usecase. My exp variables are exactly where I want them to be, and I think I can figure out the index variable part based on some of the ideas presented here. — ktmbiome, Aug 10 '22 at 18:42
glad it worked out! I'll admit, whenever I've created matrix-cols before it's mainly been by accident and I've had to work out how to get rid of them! Very interesting to think over how to create them and how they could be used in selecting columns in later processing! — Andy Baxter, Aug 10 '22 at 20:38
yeah, I'm needing this format for GAM models with lag effects, and since you can basically have all of your lagged data contained in one nicely organized variable that can be added to the model (with the index variable I need to figure out), this is perfect. Basically it means I can do something like `glm(outcome ~ exp1)` and all that lagged data is included in that `exp1` variable call. — ktmbiome, Aug 10 '22 at 21:00
I figured out how to add the index variable I wanted: simply add `mutate(index=t(as.matrix(1:6))` as the final argument to the last data.frame block and I get what I need! Thank you for all your help! And also wanted to confirm that your code worked in my real world data, and I'm beyond thrilled! — ktmbiome, Aug 26 '22 at 00:45

score 2 · Answer 2 · answered Aug 09 '22 at 23:49

This is along the same lines as Andy Baxter's answer, but instead of a having a matrix and an index for each exp this combines the two pieces of information to make each column name i.e. instead of exp[,1] [,2] [,3] index[,1] [,2] [,3] you have exp1_1 exp1_2 exp1_3 exp2_1 exp2_2 exp2_3. Would this suit your use-case?

library(tidyverse)
set.seed(123)

person_data <- tibble(ID=paste0("ID", 1:50), outcome=sample(0:1, 50, T), date=sample(5:145, 50, F),min_date=date-5)
value_data <- tibble(date=seq(1:150), exp1=sample(20:100, 150, T), exp2=sample(20:100, 150, T))

merged <- fuzzyjoin::fuzzy_left_join(person_data, value_data, by=c("date" = "date", "min_date" = "date"), match_fun=c(`>=`,`<=`))

analysis_data <- merged %>% 
  select(-date.y) %>% 
  pivot_longer(starts_with("exp")) %>%
  group_by(ID, name) %>%
  mutate(n = row_number()) %>%
  pivot_wider(names_from = c(name, n),
              values_from = value,
              names_sort = TRUE)
analysis_data
#> # A tibble: 50 × 16
#> # Groups:   ID [50]
#>    ID    outcome date.x min_d…¹ exp1_1 exp1_2 exp1_3 exp1_4 exp1_5 exp1_6 exp2_1
#>    <chr>   <int>  <int>   <dbl>  <int>  <int>  <int>  <int>  <int>  <int>  <int>
#>  1 ID1         0     57      52     39     86     55     71     41     68     59
#>  2 ID2         0    139     134     44     27     74     64     37     61     98
#>  3 ID3         0    145     140     50     25     90     80     67     36     79
#>  4 ID4         1     38      33     74     94     55     58     73     28     55
#>  5 ID5         0     73      68     91     63     51     55     64     33     32
#>  6 ID6         1     76      71     55     64     33     35     52     59     41
#>  7 ID7         1     80      75     52     59     59     29     91     28     55
#>  8 ID8         1     67      62     27     65     85     96     65     89    100
#>  9 ID9         0    101      96     73     96     30     44     71     45     67
#> 10 ID10        0     95      90     45     52     76     48     29     72     99
#> # … with 40 more rows, 5 more variables: exp2_2 <int>, exp2_3 <int>,
#> #   exp2_4 <int>, exp2_5 <int>, exp2_6 <int>, and abbreviated variable name
#> #   ¹min_date
#> # ℹ Use `print(n = ...)` to see more rows, and `colnames()` to see all variable names

^{Created on 2022-08-10 by the reprex package (v2.0.1)}

I'll admit I do quite like this tidy method too, so +1 here as well :) — Andy Baxter, Aug 10 '22 at 20:39

Make "tidy" tibble dataset with matrix variable

2 Answers2