1

FYI: I have a solution to this problem but I wanted to post the solution for anyone with a similar problem.

I have a data set in long format

dat = data.frame(id = rep(1:2, each=10),
             x1 = runif(20),
             x2 = rnorm(20),
             x3 = letters[1:20]
             )

with a column identifying the order of rows within each id

dat$id_row = ave(as.character(dat$id), as.character(dat$id), FUN = seq_along)

and some values only occur once per id

dat$x3[dat$id_row>1] = NA

So dat looks like this:

   id         x1         x2   x3 id_row
1   1 0.54788708 -0.3870533    a      1
2   1 0.41625779  0.1528117 <NA>      2
3   1 0.26278100 -0.9239091 <NA>      3
4   1 0.69968279  1.2025596 <NA>      4
5   1 0.26647728  0.3301025 <NA>      5
6   1 0.72934348  0.3218639 <NA>      6
7   1 0.43240532 -0.3136323 <NA>      7
8   1 0.98646191 -2.0612792 <NA>      8
9   1 0.69606418  1.7286719 <NA>      9
10  1 0.54025279  1.4872084 <NA>     10
11  2 0.33639210 -0.1692205    k      1
12  2 0.05372062  0.9645393 <NA>      2
13  2 0.46421037 -1.1011200 <NA>      3
14  2 0.66753436 -1.5613799 <NA>      4
15  2 0.72302407 -0.9173485 <NA>      5
16  2 0.26640179  0.7012415 <NA>      6
17  2 0.37752229 -0.1136814 <NA>      7
18  2 0.45110511 -0.2051627 <NA>      8
19  2 0.47969921  1.4154800 <NA>      9
20  2 0.23142331 -0.6178061 <NA>     10

I want to convert it to wide form using data.table::dcast like this

library(data.table)
setDT(dat, key="id")

cols = colnames(dat)
cols = cols[!cols %in% c("id", "id_row")]  #all the columns other than these

dat_wide = dcast(dat, id ~ id_row,
                   value.var = cols)

So dat_wide looks like this:

   id      x1_1     x1_10       x1_2      x1_3      x1_4      x1_5      x1_6      x1_7      x1_8      x1_9
1:  1 0.5478871 0.5402528 0.41625779 0.2627810 0.6996828 0.2664773 0.7293435 0.4324053 0.9864619 0.6960642
2:  2 0.3363921 0.2314233 0.05372062 0.4642104 0.6675344 0.7230241 0.2664018 0.3775223 0.4511051 0.4796992
         x2_1      x2_10      x2_2       x2_3     x2_4       x2_5      x2_6       x2_7       x2_8     x2_9
1: -0.3870533  1.4872084 0.1528117 -0.9239091  1.20256  0.3301025 0.3218639 -0.3136323 -2.0612792 1.728672
2: -0.1692205 -0.6178061 0.9645393 -1.1011200 -1.56138 -0.9173485 0.7012415 -0.1136814 -0.2051627 1.415480
    x3_1 x3_10 x3_2 x3_3 x3_4 x3_5 x3_6 x3_7 x3_8 x3_9
1:     a    NA   NA   NA   NA   NA   NA   NA   NA   NA
2:     k    NA   NA   NA   NA   NA   NA   NA   NA   NA

I would like the following:

  1. Leading zeros for the column names (eg x1_01 instead of x1_1)
  2. Drop all of the empty columns
  3. For the columns that have only one observation, remove the _1 that dcast added.
Frank
  • 66,179
  • 8
  • 96
  • 180
JRF1111
  • 172
  • 7
  • Re `cols[!cols %in% c("id", "id_row")]`, fyi you can use `setdiff(cols, c("id", "id_row"))` instead. Also, `dat[, id_row := rowid(id)]` should work. – Frank Apr 11 '18 at 16:20

2 Answers2

2

You can do...

dcast(
  copy(dat)[, 
    x3 := first(na.omit(x3)), by=id][, 
    id_row := sprintf("%02d", as.integer(id_row))], 
  id + x3 ~ id_row,
  value.var = c("x1", "x2")
)

   id x3     x1_01     x1_02      x1_03      x1_04     x1_05     x1_06     x1_07      x1_08      x1_09     x1_10     x2_01      x2_02      x2_03      x2_04      x2_05     x2_06      x2_07      x2_08      x2_09        x2_10
1:  1  a 0.1378046 0.5520807 0.05924109 0.06332558 0.7777398 0.9895027 0.2064026 0.03098261 0.95197243 0.7477726 0.7604617 -0.3261378  0.8879344  0.5863483 -0.7902251 1.5264813 -0.7777892 -0.6412351 -0.7436965 -0.006662512
2:  2  k 0.3795247 0.8378224 0.66084335 0.34242055 0.6004159 0.6608979 0.4316877 0.63934958 0.02383587 0.1881800 1.9792712  0.9256477 -0.7791544 -0.9860244  0.1341959 0.1352514  0.5140671 -0.1055499 -1.4497458 -1.526578088

That is, fill in the x3 values since apparently they should apply to all rows per id, and treat x3 as a grouping variable; apply formatting to id_row; then dcast.

You can use setcolorder afterwards if x3 ought to be at the end.


A cleaner but longer way to handle x3 (which apparently is an attribute of id) would be to make an id table:

# create id table
idDT = dat[!is.na(x3), .(id, x3)]
setkey(idDT, id)

# drop id attributes from main table
dat[, x3 := NULL]

# maybe drop id_row since it can be made on the fly
dat[, id_row := NULL]

# go wide
res = dcast(dat, id ~ sprintf("%02d", rowid(id)), value.var = setdiff(names(dat), "id"))

# add attributes back
res[idDT, on=key(idDT), x3 := i.x3]

   id     x1_01     x1_02      x1_03      x1_04     x1_05     x1_06     x1_07      x1_08      x1_09     x1_10     x2_01      x2_02      x2_03      x2_04      x2_05     x2_06      x2_07      x2_08      x2_09        x2_10 x3
1:  1 0.1378046 0.5520807 0.05924109 0.06332558 0.7777398 0.9895027 0.2064026 0.03098261 0.95197243 0.7477726 0.7604617 -0.3261378  0.8879344  0.5863483 -0.7902251 1.5264813 -0.7777892 -0.6412351 -0.7436965 -0.006662512  a
2:  2 0.3795247 0.8378224 0.66084335 0.34242055 0.6004159 0.6608979 0.4316877 0.63934958 0.02383587 0.1881800 1.9792712  0.9256477 -0.7791544 -0.9860244  0.1341959 0.1352514  0.5140671 -0.1055499 -1.4497458 -1.526578088  k
Frank
  • 66,179
  • 8
  • 96
  • 180
  • 1
    Nice answer. I really like the `sprintf("%02d", rowid(id))` part. I wasn't familiar with that function. That said, I was looking for something that wouldn't require looking at the data to figure out which columns only have one observation (or aren't always observed) and then specify them. That was on me. I should have stated that part explicitly in the question rather than vaguely hinting at it in the title, and I should've made a better example. I would have done those things but I didn't expect any one else to post an answer given the first line of the question. – JRF1111 Apr 12 '18 at 15:30
  • No problem. Just figured I'd share this approach since I end up using something like it pretty often anyways. I'm ambivalent about creating rowid on the fly versus having a column for it. If the row order really matters, probably better to retain it explicitly as a column (so it can be ordered on, etc), which is why I wrote "maybe drop" it. – Frank Apr 12 '18 at 15:51
1

To add leading zeros, I do this:

colnames(dat_wide) = gsub("(*_)(\\d{1})$",  "\\10\\2", colnames(dat_wide))
#'  gsub explanation:
#'  arg1, first parentheses : find any string (of any length) that is followed by an underscore
#'  arg1, second parentheses: find any single digit ("\\d" for any digit(s) and {1} to limit the number of matches to 1)
#'  arg1, $: ensures that the regex in the second parentheses is at the end of the string
#'  arg2, "\\1": the value in first parentheses of arg1
#'  arg2, 0: zero gets inserted
#'  arg2, "\\2": the value in second parentheses of arg1

From there, I can drop all the empty columns like this

class(dat_wide) = "data.frame"  #necessary for the next step
dat_wide= dat_wide[, names(which(sapply(dat_wide, function(x) mean(is.na(x)))!=1))]
    # sapply returns a vector giving the percent missing in each column, 100% missing = 1
    # names(which(sapply(...)!=1)) gives the names of the columns that are not 100% missing

and I remove the _1 that dcast added like this

cols = colnames(dat_wide)

cols_alpha = gsub("_\\d{0,}$", "", cols)
# removes the underscore and any digits occurring at the end of the string


cols_unique = cols_alpha[!(cols_alpha %in% cols_alpha[duplicated(cols_alpha)])]
#base column names that aren't repeated

for(i in 1:length(cols_unique)){
  temp = match.arg(cols_unique[i], cols)
  colnames(dat_wide) = gsub(temp, cols_unique[i], colnames(dat_wide), fixed = T )
}

and now dat_wide looks like this (the way I wanted it)

dat_wide
  id      x1_01     x1_10      x1_02     x1_03     x1_04     x1_05     x1_06      x1_07     x1_08     x1_09
1  1 0.05139797 0.6901079 0.78155587 0.5717956 0.8652542 0.1341294 0.6745674 0.97447287 0.7684123 0.9038830
2  2 0.49687041 0.9880967 0.07189928 0.1835206 0.3563691 0.2008427 0.9795765 0.06875338 0.3590017 0.1253108
       x2_01      x2_10      x2_02      x2_03    x2_04      x2_05      x2_06     x2_07     x2_08      x2_09 x3
1 0.27216865 -0.6608880 -0.9184810 -1.3895615 2.481393  1.8167519  0.4966844 0.1151766  1.119384  0.2650763  a
2 0.01208705 -0.5245594 -0.5032424 -0.5924594 1.037335 -0.1715831 -0.1698359 1.6956572 -1.275270 -0.1278430  k
JRF1111
  • 172
  • 7