75

My data looks like this:

# A tibble: 6 x 4
  name          val time          x1
  <chr>       <dbl> <date>     <dbl>
1 C Farolillo     7 2016-04-20  51.5
2 C Farolillo     3 2016-04-21  56.3
3 C Farolillo     7 2016-04-22  56.3
4 C Farolillo    13 2016-04-23  57.9
5 C Farolillo     7 2016-04-24  58.7
6 C Farolillo     9 2016-04-25  59.0

I am trying to use the pivot_wider function to widen the data based on the name column. I use the following code:

yy <- d %>% 
  pivot_wider(., names_from = name, values_from = val)

Which gives me the following warning message:

Warning message:
Values in `val` are not uniquely identified; output will contain list-cols.
* Use `values_fn = list(val = list)` to suppress this warning.
* Use `values_fn = list(val = length)` to identify where the duplicates arise
* Use `values_fn = list(val = summary_fun)` to summarise duplicates

The output looks like:

        time        x1    out1    out2
1  2016-04-20  51.50000   <dbl>   <dbl>
2  2016-04-21  56.34615   <dbl>   <dbl>
3  2016-04-22  56.30000   <dbl>   <dbl>
4  2016-04-23  57.85714   <dbl>   <dbl>
5  2016-04-24  58.70968   <dbl>   <dbl>
6  2016-04-25  58.96774   <dbl>   <dbl>

I know the issue is mentioned here, and to resolve it they suggest using summary statistics. However, I have time series data and thus do not want to use summary statistics, since each day has a single value (and not multiple values).

I know the problem is because the val column has duplicates (i.e. in the above example 7 occurs 3 times).

Any suggestions on how to pivot_wider and overcome this issue?

Data:

d <- structure(list(name = c("C Farolillo", "C Farolillo", "C Farolillo", 
"C Farolillo", "C Farolillo", "C Farolillo", "C Farolillo", "C Farolillo", 
"C Farolillo", "C Farolillo", "C Farolillo", "C Farolillo", "C Farolillo", 
"C Farolillo", "C Farolillo", "C Farolillo", "C Farolillo", "C Farolillo", 
"C Farolillo", "C Farolillo", "C Farolillo", "C Farolillo", "C Farolillo", 
"C Farolillo", "C Farolillo", "C Farolillo", "C Farolillo", "C Farolillo", 
"C Farolillo", "C Farolillo", "C Farolillo", "C Farolillo", "C Farolillo", 
"C Farolillo", "C Farolillo", "C Farolillo", "C Farolillo", "C Farolillo", 
"C Farolillo", "C Farolillo", "C Farolillo", "C Farolillo", "C Farolillo", 
"C Farolillo", "C Farolillo", "C Farolillo", "C Farolillo", "C Farolillo", 
"C Farolillo", "C Farolillo", "C Farolillo", "Plaza Eliptica", 
"Plaza Eliptica", "Plaza Eliptica", "Plaza Eliptica", "Plaza Eliptica", 
"Plaza Eliptica", "Plaza Eliptica", "Plaza Eliptica", "Plaza Eliptica", 
"Plaza Eliptica", "Plaza Eliptica", "Plaza Eliptica", "Plaza Eliptica", 
"Plaza Eliptica", "Plaza Eliptica", "Plaza Eliptica", "Plaza Eliptica", 
"Plaza Eliptica", "Plaza Eliptica", "Plaza Eliptica", "Plaza Eliptica", 
"Plaza Eliptica", "Plaza Eliptica", "Plaza Eliptica", "Plaza Eliptica", 
"Plaza Eliptica", "Plaza Eliptica", "Plaza Eliptica", "Plaza Eliptica", 
"Plaza Eliptica", "Plaza Eliptica", "Plaza Eliptica", "Plaza Eliptica", 
"Plaza Eliptica", "Plaza Eliptica", "Plaza Eliptica", "Plaza Eliptica", 
"Plaza Eliptica", "Plaza Eliptica", "Plaza Eliptica", "Plaza Eliptica", 
"Plaza Eliptica", "Plaza Eliptica", "Plaza Eliptica", "Plaza Eliptica", 
"Plaza Eliptica", "Plaza Eliptica", "Plaza Eliptica", "Plaza Eliptica", 
"Plaza Eliptica", "Plaza Eliptica"), val = c(7, 3, 7, 13, 7, 
9, 20, 19, 4, 5, 5, 2, 6, 6, 16, 13, 7, 6, 3, 3, 6, 10, 5, 3, 
5, 3, 4, 4, 10, 11, 4, 13, 8, 2, 8, 10, 3, 10, 14, 4, 2, 4, 6, 
6, 8, 8, 3, 3, 13, 10, 13, 32, 25, 31, 34, 26, 33, 35, 43, 22, 
22, 21, 10, 33, 33, 48, 47, 27, 23, 11, 13, 25, 31, 20, 16, 10, 
9, 23, 11, 23, 26, 16, 34, 17, 4, 24, 21, 10, 26, 32, 10, 5, 
9, 19, 14, 27, 27, 10, 8, 28, 32, 25), time = structure(c(16911, 
16912, 16913, 16914, 16915, 16916, 16917, 16918, 16919, 16920, 
16921, 16922, 16923, 16923, 16924, 16925, 16926, 16927, 16928, 
16929, 16930, 16931, 16932, 16933, 16934, 16935, 16936, 16937, 
16938, 16939, 16940, 16941, 16942, 16943, 16944, 16945, 16946, 
16947, 16948, 16949, 16950, 16951, 16952, 16953, 16954, 16955, 
16956, 16957, 16958, 16959, 16960, 16911, 16912, 16913, 16914, 
16915, 16916, 16917, 16918, 16919, 16920, 16921, 16922, 16923, 
16923, 16924, 16925, 16926, 16927, 16928, 16929, 16930, 16931, 
16932, 16933, 16934, 16935, 16936, 16937, 16938, 16939, 16940, 
16941, 16942, 16943, 16944, 16945, 16946, 16947, 16948, 16949, 
16950, 16951, 16952, 16953, 16954, 16955, 16956, 16957, 16958, 
16959, 16960), class = "Date"), x1 = c(51.5, 56.3461538461538, 
56.3, 57.8571428571429, 58.7096774193548, 58.9677419354839, 64.4615384615385, 
61.9310344827586, 60.3214285714286, 59.4137931034483, 59.5806451612903, 
57.3448275862069, 64.0333333333333, 64.0333333333333, 70.15625, 
71.3636363636364, 62.8125, 56.4375, 56.4516129032258, 51.741935483871, 
52.84375, 53.09375, 52.969696969697, 54, 54.3870967741936, 60.3870967741936, 
64.4516129032258, 66.2903225806452, 68.2333333333333, 69.7741935483871, 
70.5806451612903, 73.8275862068966, 72.8181818181818, 64.6764705882353, 
64.4838709677419, 68.7741935483871, 62.1764705882353, 68.969696969697, 
70.1935483870968, 59.6774193548387, 59.9677419354839, 63.125, 
67.5882352941177, 71.4705882352941, 73.8529411764706, 76.1935483870968, 
72.6451612903226, 76.0645161290323, 76.4193548387097, 81.7741935483871, 
85.0645161290323, 51.5, 56.3461538461538, 56.3, 57.8571428571429, 
58.7096774193548, 58.9677419354839, 64.4615384615385, 61.9310344827586, 
60.3214285714286, 59.4137931034483, 59.5806451612903, 57.3448275862069, 
64.0333333333333, 64.0333333333333, 70.15625, 71.3636363636364, 
62.8125, 56.4375, 56.4516129032258, 51.741935483871, 52.84375, 
53.09375, 52.969696969697, 54, 54.3870967741936, 60.3870967741936, 
64.4516129032258, 66.2903225806452, 68.2333333333333, 69.7741935483871, 
70.5806451612903, 73.8275862068966, 72.8181818181818, 64.6764705882353, 
64.4838709677419, 68.7741935483871, 62.1764705882353, 68.969696969697, 
70.1935483870968, 59.6774193548387, 59.9677419354839, 63.125, 
67.5882352941177, 71.4705882352941, 73.8529411764706, 76.1935483870968, 
72.6451612903226, 76.0645161290323, 76.4193548387097, 81.7741935483871, 
85.0645161290323)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, 
-102L))
user113156

5 Answers

122

Create a unique row identifier for each name and then use pivot_wider:

library(dplyr)

d %>%
  group_by(name) %>%
  mutate(row = row_number()) %>%
  tidyr::pivot_wider(names_from = name, values_from = val) %>%
  select(-row)

# A tibble: 51 x 4
#   time          x1 `C Farolillo` `Plaza Eliptica`
#   <date>     <dbl>         <dbl>            <dbl>
# 1 2016-04-20  51.5             7               32
# 2 2016-04-21  56.3             3               25
# 3 2016-04-22  56.3             7               31
# 4 2016-04-23  57.9            13               34
# 5 2016-04-24  58.7             7               26
# 6 2016-04-25  59.0             9               33
# 7 2016-04-26  64.5            20               35
# 8 2016-04-27  61.9            19               43
# 9 2016-04-28  60.3             4               22
#10 2016-04-29  59.4             5               22
# … with 41 more rows
Ronak Shah
  • This gives me an error: Error in `select()`: ! Can't subset columns that don't exist. ✖ Column `row` doesn't exist. – amy989 Oct 26 '22 at 13:07
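As an aside, the same per-group row index can be built without dplyr; a minimal base R sketch of the same idea (the helper column name row is arbitrary, not from the original answer):

d$row <- ave(seq_along(d$name), d$name, FUN = seq_along)  # 1..n within each name
yy <- tidyr::pivot_wider(d, names_from = name, values_from = val)
yy$row <- NULL  # drop the helper afterwards, as in the dplyr version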
27

Typically, the warning

Warning message:
Values in `val` are not uniquely identified; output will contain list-cols.

is most often caused by duplicate rows in the data (after excluding the val column), and not by duplicates in the val column alone.

which(duplicated(d))
# [1] 14 65

OP's data seems to have two duplicate rows, which is what causes this issue. Removing the duplicate rows also gets rid of the warning.

yy <- d %>% distinct() %>% pivot_wider(., names_from = name, values_from = val)
yy
# A tibble: 50 x 4
   time          x1 `C Farolillo` `Plaza Eliptica`
   <date>     <dbl>         <dbl>            <dbl>
 1 2016-04-20  51.5             7               32
 2 2016-04-21  56.3             3               25
 3 2016-04-22  56.3             7               31
 4 2016-04-23  57.9            13               34
 5 2016-04-24  58.7             7               26
 6 2016-04-25  59.0             9               33
 7 2016-04-26  64.5            20               35
 8 2016-04-27  61.9            19               43
 9 2016-04-28  60.3             4               22
10 2016-04-29  59.4             5               22
# ... with 40 more rows
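Before dropping rows, it can be worth inspecting what distinct() would remove; a small base R sketch that shows every row involved in a duplication:

d[duplicated(d) | duplicated(d, fromLast = TRUE), ]
# both stations have two identical rows for 2016-05-02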
Ameer
  • I wouldn't call the other solution a quick/dirty fix, since there are many valid cases where this is the right way to do it if multiple values per timepoint are allowed, but since OP said that each timepoint should only have one value, your solution solves the problem of duplicate entries. – Gilean0709 Feb 15 '20 at 06:16
  • Agreed, I can see how it could be useful if there are rows which differ only in the value column. – Ameer Feb 15 '20 at 06:24
  • Removing the duplicate rows in the dataset will cause me to lose time series information. The data contains two different time series `C Farolillo` and `Plaza Eliptica` which just so happens to have the same value on the same day. This isn't a true duplicate, just a coincidence. – user113156 Feb 15 '20 at 12:27
  • Trying `d[c(13,14),]` gives the following two rows: `[1] 13 C Farolillo 6 2016-05-02 64.03333` `[2] 14 C Farolillo 6 2016-05-02 64.03333`. These are two identical observations on one day for `C Farolillo`, so they looked like duplicates to me. Do `d[c(64,65),]` for another pair. – Ameer Feb 16 '20 at 05:26
  • I find this to be the right answer. One can aggregate the duplicated rows in some way before pivoting. For example, we can do `group_by(name, x1) %>% summarise(x1 = sum(x1))` or with `mean` in place of `sum`. At least, this is the use case I usually encounter in practice. – passerby51 Mar 28 '21 at 20:02
12

The problem is caused by the fact that the data you want to spread / pivot wider has duplicate identifiers. While both suggestions above, i.e. creating a unique artificial id from row numbers with mutate(row = row_number()), or filtering to distinct rows, will allow you to pivot wider, they change the structure of your table, which is likely to hide a logical, organizational problem that will come out the next time you try to join anything to it.

It is a much better practice to use the id_cols argument explicitly, to state what you actually want to be unique after pivoting wider, and, if you run into problems, to re-organize the original table first. Of course, you may find a reason for filtering to distinct rows or adding a new ID, but most likely you will want to avoid the duplication earlier in your code.
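A minimal sketch of that practice on the OP's data (assuming, as the OP states, that each name should have one value per day, so the duplicated 2016-05-02 rows are first collapsed with distinct()):

library(dplyr)
library(tidyr)

d %>%
  distinct() %>%                      # resolve the duplicated day first
  pivot_wider(id_cols = c(time, x1),  # state explicitly what identifies a row
              names_from = name,
              values_from = val)

If the id_cols combination is still not unique, pivot_wider will warn again, which is exactly the point: the warning surfaces an organizational problem instead of hiding it.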

Daniel Antal
  • I'm having similar problems to the ones above but none of these solutions seems applicable to me. I have duplicate values, most likely because my data involves different ratings at different time points. I tried using id_cols but this doesn't work either. – Con Des Mar 10 '20 at 02:52
  • In this case your observations must obviously be unique, inter alia, in time. So the id_cols must take account of all possible time observations. One way to achieve this is to unite them. – Daniel Antal Mar 10 '20 at 15:00
  • I have tried this, but I'm not sure how to do it in the long form first, before using pivot_wider. For some reason, the same id number was assigned to two observations a couple of times. – Con Des Mar 12 '20 at 04:52
  • So I don't want to get rid of the duplicates; instead, I'd like to change the duplicated id number. – Con Des Mar 12 '20 at 04:54
  • @ConDes did you try something like: `df_wide %>% group_by(old_ID, time_point) %>% mutate(new_ID = paste0(old_ID, "_", 1:n()))` ? – fabern May 14 '20 at 14:42
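Following the unite suggestion in the comments, a minimal sketch of checking which composite keys are duplicated before pivoting (the id column name is illustrative, not from the original posts):

library(dplyr)
library(tidyr)

d %>%
  unite("id", name, time, remove = FALSE) %>%  # composite key: station + day
  count(id) %>%
  filter(n > 1)                                # keys that appear more than once
# both stations show a doubled key for 2016-05-02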
8

Though it is not visible in the OP's example, the accepted answer can duplicate rows when it is not necessary. This approach avoids that in some cases:

d %>%
  pivot_wider(names_from = name, values_from = val,
              values_fn = list) %>%
  unnest(cols = everything())

If list-columns are already present, keep values_fn = list to avoid warnings and errors.

Example:

d1 <- tail(d)[1:5, ]
d5 <- d1
d5$name <- "some"

# two tibbles whose l columns already contain lists
withlist  <- tibble(d1, l = list(c(1, 2), c(1, 2), c(1, 2), c(1, 2), c(1, 2)))
withlist2 <- tibble(d5, l = list(list(1, 2), list(1, 2), list(1, 2, 3), list(1, 2), list(1, 2)))
withl     <- rbind(withlist, withlist2)

res <- withl %>%
  pivot_wider(names_from = name, values_from = l,
              values_fn = list)
as.data.frame(res)

#  val       time    x1 Plaza Eliptica    some
#1  27 2016-06-03 76.19           1, 2    1, 2
#2  10 2016-06-04 72.65           1, 2    1, 2
#3   8 2016-06-05 76.06           1, 2 1, 2, 3
#4  28 2016-06-06 76.42           1, 2    1, 2
#5  32 2016-06-07 81.77           1, 2    1, 2
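Applied to the OP's d directly, the same pattern should keep both observations from the duplicated 2016-05-02 date (52 rows in total) instead of silently dropping one; a sketch, with the two list-columns unnested in parallel:

d %>%
  pivot_wider(names_from = name, values_from = val,
              values_fn = list) %>%
  unnest(cols = c(`C Farolillo`, `Plaza Eliptica`))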
Ferroao
-2

This is kind of late in the game, but here is an option that keeps the non-unique observations and still pivots:

table(d$name) # get the unique names_from and frequencies
# 
#    C Farolillo Plaza Eliptica 
#             51             51  

(d2 <- d %>%
   mutate(rno = rep(1:51, 2)) %>%  # repeat 1:51 twice: a unique id within each names_from group
   pivot_wider(names_from = name, values_from = val))
    # # A tibble: 51 × 5
    #    time          x1   rno `C Farolillo` `Plaza Eliptica`
    #    <date>     <dbl> <int>         <dbl>            <dbl>
    #  1 2016-04-20  51.5     1             7               32
    #  2 2016-04-21  56.3     2             3               25
    #  3 2016-04-22  56.3     3             7               31
    #  4 2016-04-23  57.9     4            13               34
    #  5 2016-04-24  58.7     5             7               26
    #  6 2016-04-25  59.0     6             9               33
    #  7 2016-04-26  64.5     7            20               35
    #  8 2016-04-27  61.9     8            19               43
    #  9 2016-04-28  60.3     9             4               22
    # 10 2016-04-29  59.4    10             5               22
    # # … with 41 more rows 
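The hard-coded rep(1:51, 2) assumes the two series have equal length and sit in contiguous blocks. A sketch of a more general way to build the same counter (equivalent in effect to the accepted answer's row_number() approach) for data stored in runs:

d %>%
  mutate(rno = sequence(rle(as.character(name))$lengths)) %>%  # 1..n within each run of name
  pivot_wider(names_from = name, values_from = val)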
Kat