0

I have the following data.table:

Date Segment Variable value
31-12-2021 XXX aa_a_1 10
31-12-2021 XXX bbb_1 11
31-12-2021 XXX ccc_1 12
31-01-2022 XXX aa_a_2 13
31-01-2022 XXX bbb_2 14
31-01-2022 XXX ccc_2 15
28-02-2022 XXX aa_a_3 16
28-02-2022 XXX bbb_3 17
28-02-2022 XXX ccc_3 18
31-03-2022 XXX aa_a_4 19
31-03-2022 XXX bbb_4 20
31-03-2022 XXX ccc_4 21
30-04-2022 XXX aa_a_5 22
30-04-2022 XXX bbb_5 23
30-04-2022 XXX ccc_5 24
31-05-2022 XXX aa_a_6 25
31-05-2022 XXX bbb_6 26
31-05-2022 XXX ccc_6 27
30-06-2022 XXX aa_a_7 28
30-06-2022 XXX bbb_7 29
30-06-2022 XXX ccc_7 30
31-07-2022 XXX aa_a_8 31
31-07-2022 XXX bbb_8 32
31-07-2022 XXX ccc_8 33
31-08-2022 XXX aa_a_9 34
31-08-2022 XXX bbb_9 35
31-08-2022 XXX ccc_9 36
30-09-2022 XXX aa_a_10 37
30-09-2022 XXX bbb_10 38
30-09-2022 XXX ccc_10 39
31-10-2022 XXX aa_a_11 40
31-10-2022 XXX bbb_11 41
31-10-2022 XXX ccc_11 42
30-11-2022 XXX aa_a_12 43
30-11-2022 XXX bbb_12 44
30-11-2022 XXX ccc_12 45
31-12-2021 YYY aa_a_1 46
31-12-2021 YYY bbb_1 47
31-12-2021 YYY ccc_1 48
31-01-2022 YYY aa_a_2 49
31-01-2022 YYY bbb_2 50
31-01-2022 YYY ccc_2 51
28-02-2022 YYY aa_a_3 52
28-02-2022 YYY bbb_3 53
28-02-2022 YYY ccc_3 54
31-03-2022 YYY aa_a_4 55
31-03-2022 YYY bbb_4 56
31-03-2022 YYY ccc_4 57
30-04-2022 YYY aa_a_5 58
30-04-2022 YYY bbb_5 59
30-04-2022 YYY ccc_5 60
31-05-2022 YYY aa_a_6 61
31-05-2022 YYY bbb_6 62
31-05-2022 YYY ccc_6 63
30-06-2022 YYY aa_a_7 64
30-06-2022 YYY bbb_7 65
30-06-2022 YYY ccc_7 66
31-07-2022 YYY aa_a_8 67
31-07-2022 YYY bbb_8 68
31-07-2022 YYY ccc_8 69
31-08-2022 YYY aa_a_9 70
31-08-2022 YYY bbb_9 71
31-08-2022 YYY ccc_9 72
30-09-2022 YYY aa_a_10 73
30-09-2022 YYY bbb_10 74
30-09-2022 YYY ccc_10 75
31-10-2022 YYY aa_a_11 76
31-10-2022 YYY bbb_11 77
31-10-2022 YYY ccc_11 78
30-11-2022 YYY aa_a_12 79
30-11-2022 YYY bbb_12 80
30-11-2022 YYY ccc_12 81

And so on to a total of 24 different dates and 'aaa','bbb' and 'ccc' from 1 to 24.

And the desired output would be:

Date Segment aa_a bbb ccc
31-12-2021 XXX 10 11 12
31-01-2022 XXX 13 14 15
31-12-2021 YYY 16 17 18
31-01-2022 YYY 19 20 21

And so on...

Any idea? I'm blocked at this point.

gojkokacar
  • 19
  • 5

2 Answers2

3

This is just reshaping. Since you mentioned :

library(data.table)
dcast(Date +  Segment ~ Variable, value.var = "value",
      data = DT[, Variable := gsub("_[0-9]+$", "", Variable)])
#           Date Segment aa_a bbb ccc
#  1: 28-02-2022     XXX   16  17  18
#  2: 28-02-2022     YYY   52  53  54
#  3: 30-04-2022     XXX   22  23  24
#  4: 30-04-2022     YYY   58  59  60
#  5: 30-06-2022     XXX   28  29  30
#  6: 30-06-2022     YYY   64  65  66
#  7: 30-09-2022     XXX   37  38  39
#  8: 30-09-2022     YYY   73  74  75
#  9: 30-11-2022     XXX   43  44  45
# 10: 30-11-2022     YYY   79  80  81
# 11: 31-01-2022     XXX   13  14  15
# 12: 31-01-2022     YYY   49  50  51
# 13: 31-03-2022     XXX   19  20  21
# 14: 31-03-2022     YYY   55  56  57
# 15: 31-05-2022     XXX   25  26  27
# 16: 31-05-2022     YYY   61  62  63
# 17: 31-07-2022     XXX   31  32  33
# 18: 31-07-2022     YYY   67  68  69
# 19: 31-08-2022     XXX   34  35  36
# 20: 31-08-2022     YYY   70  71  72
# 21: 31-10-2022     XXX   40  41  42
# 22: 31-10-2022     YYY   76  77  78
# 23: 31-12-2021     XXX   10  11  12
# 24: 31-12-2021     YYY   46  47  48
#           Date Segment aa_a bbb ccc

FYI, regex (and an extensive answer about regex):

  • _ is a literal underscore character.
  • [0-9] means a (single) "character class" that contains characters between "0" and "9", not numbers between 0 and 9. For instance, 19 is two characters, each are between 0 and 9.
  • + means "one or more".
  • $ means "end of string".

So "aaa_24" and "aa_a_9999999" will be updated; "aaa_" and "aaa_z" and "aaa_24b" will not.


Data

DT <- setDT(structure(list(Date = c("31-12-2021", "31-12-2021", "31-12-2021", "31-01-2022", "31-01-2022", "31-01-2022", "28-02-2022", "28-02-2022", "28-02-2022", "31-03-2022", "31-03-2022", "31-03-2022", "30-04-2022", "30-04-2022", "30-04-2022", "31-05-2022", "31-05-2022", "31-05-2022", "30-06-2022", "30-06-2022", "30-06-2022", "31-07-2022", "31-07-2022", "31-07-2022", "31-08-2022", "31-08-2022", "31-08-2022", "30-09-2022", "30-09-2022", "30-09-2022", "31-10-2022", "31-10-2022", "31-10-2022", "30-11-2022", "30-11-2022",  "30-11-2022", "31-12-2021", "31-12-2021", "31-12-2021", "31-01-2022", "31-01-2022", "31-01-2022", "28-02-2022", "28-02-2022", "28-02-2022", "31-03-2022", "31-03-2022", "31-03-2022", "30-04-2022", "30-04-2022", "30-04-2022", "31-05-2022", "31-05-2022", "31-05-2022", "30-06-2022", "30-06-2022", "30-06-2022", "31-07-2022", "31-07-2022", "31-07-2022", "31-08-2022", "31-08-2022", "31-08-2022", "30-09-2022", "30-09-2022", "30-09-2022", "31-10-2022", "31-10-2022", "31-10-2022", "30-11-2022", "30-11-2022",  "30-11-2022"), Segment = c("XXX", "XXX", "XXX", "XXX", "XXX", "XXX", "XXX", "XXX", "XXX", "XXX", "XXX", "XXX", "XXX", "XXX", "XXX", "XXX", "XXX", "XXX", "XXX", "XXX", "XXX", "XXX", "XXX", "XXX", "XXX", "XXX", "XXX", "XXX", "XXX", "XXX", "XXX", "XXX", "XXX", "XXX", "XXX", "XXX", "YYY", "YYY", "YYY", "YYY", "YYY", "YYY", "YYY", "YYY", "YYY", "YYY", "YYY", "YYY", "YYY", "YYY", "YYY", "YYY", "YYY", "YYY", "YYY", "YYY", "YYY", "YYY", "YYY", "YYY", "YYY", "YYY", "YYY", "YYY", "YYY", "YYY", "YYY", "YYY",  "YYY", "YYY", "YYY", "YYY"), Variable = c("aa_a", "bbb", "ccc", "aa_a", "bbb", "ccc", "aa_a", "bbb", "ccc", "aa_a", "bbb", "ccc", "aa_a", "bbb", "ccc", "aa_a", "bbb", "ccc", "aa_a", "bbb", "ccc", "aa_a", "bbb", "ccc", "aa_a", "bbb", "ccc", "aa_a", "bbb", "ccc", "aa_a", "bbb", "ccc", "aa_a", "bbb", "ccc", "aa_a", "bbb", "ccc", "aa_a", "bbb", "ccc", "aa_a", "bbb", "ccc", "aa_a", "bbb", "ccc", "aa_a", "bbb", "ccc", "aa_a", "bbb", "ccc", "aa_a", "bbb", "ccc", "aa_a", "bbb", "ccc", "aa_a", "bbb", "ccc",  "aa_a", "bbb", "ccc", "aa_a", "bbb", "ccc", "aa_a", "bbb", "ccc"), value = 10:81), row.names = c(NA, -72L), class = c("data.table", "data.frame")))
r2evans
  • 141,215
  • 6
  • 77
  • 149
  • This could work. Problem is that Variable is going from 1 to 24, and is giving to me some strange output... – gojkokacar Jan 06 '22 at 17:37
  • If you update your data to better represent the issue, I can look into it. – r2evans Jan 06 '22 at 19:08
  • I updated the table but I put 12 instead of 24; also I tried `dcast(Date + Segment ~ Variable, value.var = "value",data = DT[, Variable := gsub("_[1-24]+$", "", Variable])` – gojkokacar Jan 07 '22 at 09:31
  • Also, I have edited the Variable `aaa_X` to `aa_a_X` so maybe `gsub` will not work... – gojkokacar Jan 07 '22 at 10:09
  • There was a typo in the answer causing a syntax/parsing problem, it must have been messed up when I pasted to SO, sorry about that. I've fixed that and updated to your new data. The new column name did not change `gsub`'s behavior, I added a quick explanation on the regex. – r2evans Jan 07 '22 at 12:21
  • 1
    Thank you very much, that'll work. Much appreciated! – gojkokacar Jan 07 '22 at 15:58
2

We can separate the 'Variable' column into 'two' and then use pivot_wider to reshape to 'wide' format

library(dplyr)
library(tidyr)
df1 %>%
  separate(Variable, into = c('Var1', 'Var2')) %>% 
  pivot_wider(names_from = Var1, values_from = value) %>% 
  select(-Var2)

-output

# A tibble: 4 × 5
  Date       Segment   aaa   bbb   ccc
  <chr>      <chr>   <int> <int> <int>
1 31-12-2021 XXX        10    11    12
2 31-01-2022 XXX        13    14    15
3 31-12-2021 YYY        16    17    18
4 31-01-2022 YYY        19    20    21

Or use dcast

library(data.table)
dcast(setDT(df1), Date + Segment ~ trimws(Variable, whitespace = "_.*"))
         Date Segment aaa bbb ccc
1: 31-01-2022     XXX  13  14  15
2: 31-01-2022     YYY  19  20  21
3: 31-12-2021     XXX  10  11  12
4: 31-12-2021     YYY  16  17  18

data

df1 <- structure(list(Date = c("31-12-2021", "31-12-2021", "31-12-2021", 
"31-01-2022", "31-01-2022", "31-01-2022", "31-12-2021", "31-12-2021", 
"31-12-2021", "31-01-2022", "31-01-2022", "31-01-2022"), Segment = c("XXX", 
"XXX", "XXX", "XXX", "XXX", "XXX", "YYY", "YYY", "YYY", "YYY", 
"YYY", "YYY"), Variable = c("aaa_1", "bbb_1", "ccc_1", "aaa_2", 
"bbb_2", "ccc_2", "aaa_1", "bbb_1", "ccc_1", "aaa_2", "bbb_2", 
"ccc_2"), value = 10:21), class = "data.frame", row.names = c(NA, 
-12L))
akrun
  • 874,273
  • 37
  • 540
  • 662
  • I just edited the post, so it is a total of 24 different dates and variables. Also, I am not able to add 'whitespace' in my trimws function. – gojkokacar Jan 06 '22 at 17:59
  • @gojkokacar it shouldn't matter as the solution i posted is general. `trimws` have that argument. Not clear. Please check your R version, package versions – akrun Jan 06 '22 at 18:00
  • I have this one: https://www.rdocumentation.org/packages/memisc/versions/0.97/topics/trimws when I put trimws function in help page. Sadly I am not able to update R version... – gojkokacar Jan 06 '22 at 18:04
  • @gojkokacar `trimws` is a `base R` function. You don't need any packages – akrun Jan 06 '22 at 18:09
  • @gojkokacar you can check [here](https://www.rdocumentation.org/packages/base/versions/3.6.2/topics/trimws) – akrun Jan 06 '22 at 18:09
  • I know @akrun, but it seems that 'whitespace' is not being recognized in this function in my case... a bit strange. Maybe with a higher R version could work.. – gojkokacar Jan 06 '22 at 18:16
  • you may have loaded the package memisc, thus the trimws from base R is masked from that function – akrun Jan 06 '22 at 18:17
  • @gojkokacar you could use `::` if there is a masking i.e. `base::trimws("hello_1", whitespace = "_.*")# [1] "hello"` – akrun Jan 06 '22 at 18:30
  • Still the same issue with base::trimws, it seems my base function of trimws is not recognizing 'whitespace' – gojkokacar Jan 07 '22 at 07:38
  • My #R version is 3.5.3 – gojkokacar Jan 07 '22 at 07:43
  • @gojkokacar your column names in the new update pattern is different. Please post only the one that mimics your data. – akrun Jan 07 '22 at 16:23