In R transpose and combine multiple dataframes with missing data and blank column names / rename melted columns prior to dcast

Question

I have searched and found many solutions that came close, but never quite worked in the end. This is probably something very simple, for those with experience...

Here is a snippet of my data. This was created automatically from a JSON import by the package jsonlite. The data is very nicely structured, but I am nevertheless helpless. Update2: I have added the relevant data below

    structure(list(rightsize = c(42L, 50L, 52L, 49L, 41L, 41L, 41L, 
41L, 41L, 45L, 47L, 42L, 45L, 46L, 42L, 44L, 44L, 37L, 44L, 41L
), hitlen = c("", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", ""), linegroup = c("_", "_", "_", 
"_", "_", "_", "_", "_", "_", "_", "_", "_", "_", "_", "_", "_", 
"_", "_", "_", "_"), leftsize = c(46L, 43L, 43L, 37L, 49L, 43L, 
43L, 45L, 45L, 43L, 44L, 46L, 45L, 46L, 44L, 43L, 54L, 45L, 51L, 
47L), leftspace = c("        ", "           ", "           ", 
"                 ", "     ", "           ", "           ", "         ", 
"         ", "           ", "          ", "        ", "         ", 
"        ", "          ", "           ", "", "         ", "   ", 
"       "), Left = list(structure(list(class = c("", "coll", 
""), str = c("patients with ", "chronic", " obstructive pulmonary"
)), .Names = c("class", "str"), class = "data.frame", row.names = c(NA, 
3L)), structure(list(class = c("", "coll", ""), str = c("respect to ", 
"chronic", " obstructive pulmonary")), .Names = c("class", "str"
), class = "data.frame", row.names = c(NA, 3L)), structure(list(
    class = c("", "coll", ""), str = c("While there is no cure for this ", 
    "chronic", " ")), .Names = c("class", "str"), class = "data.frame", row.names = c(NA, 
3L)), structure(list(class = c("", "strc", "", "coll", ""), str = c(".", 
"</p><p>", "When patients with ", "chronic", " liver")), .Names = c("class", 
"str"), class = "data.frame", row.names = c(NA, 5L)), structure(list(
    class = c("", "coll", ""), str = c("bronchitis , and ", "chronic", 
    " obstructive pulmonary")), .Names = c("class", "str"), class = "data.frame", row.names = c(NA, 
3L)), structure(list(class = c("", "coll", ""), str = c("offers the possibility that ", 
"chronic", " lung")), .Names = c("class", "str"), class = "data.frame", row.names = c(NA, 
3L)), structure(list(class = c("", "coll", ""), str = c(" , such as ", 
"chronic", " obstructive pulmonary")), .Names = c("class", "str"
), class = "data.frame", row.names = c(NA, 3L)), structure(list(
    class = c("", "coll", ""), str = c("always as clear in other ", 
    "chronic", " incurable")), .Names = c("class", "str"), class = "data.frame", row.names = c(NA, 
3L)), structure(list(class = c("", "coll", ""), str = c("may have the potential to prevent ", 
"chronic", " ")), .Names = c("class", "str"), class = "data.frame", row.names = c(NA, 
3L)), structure(list(class = c("", "coll", ""), str = c(" half the estimated cost of all ", 
"chronic", " ")), .Names = c("class", "str"), class = "data.frame", row.names = c(NA, 
3L)), structure(list(class = c("", "coll", ""), str = c("is consistent with the tact that ", 
"chronic", " ")), .Names = c("class", "str"), class = "data.frame", row.names = c(NA, 
3L)), structure(list(class = c("", "coll", ""), str = c("used to treat ", 
"chronic", " obstructive pulmonary")), .Names = c("class", "str"
), class = "data.frame", row.names = c(NA, 3L)), structure(list(
    class = c("", "coll", ""), str = c("ingredient for dietary therapy of ", 
    "chronic", " ")), .Names = c("class", "str"), class = "data.frame", row.names = c(NA, 
3L)), structure(list(class = c("", "coll", ""), str = c("patients with ", 
"chronic", " obstructive pulmonary")), .Names = c("class", "str"
), class = "data.frame", row.names = c(NA, 3L)), structure(list(
    class = c("", "coll", ""), str = c("greater for ", "chronic", 
    " obstructive pulmonary")), .Names = c("class", "str"), class = "data.frame", row.names = c(NA, 
3L)), structure(list(class = c("", "coll", ""), str = c(" departments , with schemes for ", 
"chronic", " ")), .Names = c("class", "str"), class = "data.frame", row.names = c(NA, 
3L)), structure(list(class = c("", "coll", ""), str = c("postponement of death by means of managing ", 
"chronic", " ")), .Names = c("class", "str"), class = "data.frame", row.names = c(NA, 
3L)), structure(list(class = c("", "coll", ""), str = c("certainly be ", 
"chronic", " obstructive pulmonary")), .Names = c("class", "str"
), class = "data.frame", row.names = c(NA, 3L)), structure(list(
    class = c("", "coll", ""), str = c("cardiovascular disease , cancer , other ", 
    "chronic", " ")), .Names = c("class", "str"), class = "data.frame", row.names = c(NA, 
3L)), structure(list(class = c("", "coll", ""), str = c("terminal illnesses are converted to ", 
"chronic", " ")), .Names = c("class", "str"), class = "data.frame", row.names = c(NA, 
3L))), Right = list(structure(list(class = "", str = " who may be at risk of developing steroid"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "", str = " - plausibly related to exposure to environmental"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "", str = " , it can be treated , Black says . Antidepressants"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "", str = " ask what they can do to improve their condition"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "", str = " [ COPD ] ) was 15 % ( estimated within "), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "", str = " is part of the continuum of development"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "", str = " ( 70 , 71 ) and sleep apnea . Elevation"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "", str = " . Patients with heart failure highlight"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "", str = " other than heart disease , and helps us"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "", str = " in this country . Furthermore , the portion"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "", str = " are multigenic and multifactorial . Therefore"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "", str = " . Nasal corticosteroids are increasingly"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "", str = " such as diabetes mellitus or hyperlipidemia"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "", str = " ( COPD ) concluded exercise relieves dyspnea"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "", str = " than for any other disease. 5 The number"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "", str = " management in patients with COPD receiving"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "", str = " and disability is costly , and it is bound"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = c("", "strc", ""), str = c(" .", "</p><p>", "Much rarer condition , but people"
    )), .Names = c("class", "str"), class = "data.frame", row.names = c(NA, 
3L)), structure(list(class = "", str = " , and in fact those rates have been rising"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "", str = " . The panel 's report is negative about"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L)), Kwic = list(structure(list(
    class = "col0 coll", str = " disease"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "col0 coll", str = " disease"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "col0 coll", str = "disease"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "col0 coll", str = " disease"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "col0 coll", str = " disease"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "col0 coll", str = " disease"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "col0 coll", str = " disease"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "col0 coll", str = " diseases"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "col0 coll", str = "diseases"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "col0 coll", str = "diseases"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "col0 coll", str = "diseases"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "col0 coll", str = " disease"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "col0 coll", str = "diseases"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "col0 coll", str = " disease"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "col0 coll", str = " disease"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "col0 coll", str = "disease"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "col0 coll", str = "disease"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "col0 coll", str = " disease"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "col0 coll", str = "diseases"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L), structure(list(
    class = "col0 coll", str = "diseases"), .Names = c("class", 
"str"), class = "data.frame", row.names = 1L)), toknum = c(580661252L, 
585871494L, 572902309L, 596182644L, 611091300L, 604962106L, 605346237L, 
585102838L, 575701411L, 616556239L, 548908661L, 604489309L, 548601059L, 
617460845L, 585870185L, 591049175L, 581965276L, 592616458L, 592591831L, 
599295354L), rightspace = c("          ", "  ", "", "   ", "           ", 
"           ", "           ", "           ", "           ", "       ", 
"     ", "          ", "       ", "      ", "          ", "        ", 
"        ", "               ", "        ", "           "), Tbl_refs = list(
    "11.99.0023.006", "11.99.0031.001", "11.99.0012.004", "11.99.0046.013", 
    "11.99.0069.003", "11.99.0059.007", "11.99.0060.003", "11.99.0030.001", 
    "11.99.0016.007", "11.99.0077.021", "11.01.0003.015", "11.99.0059.003", 
    "11.01.0003.006", "11.99.0078.034", "11.99.0031.001", "11.99.0038.005", 
    "11.99.0025.005", "11.99.0040.006", "11.99.0040.006", "11.99.0051.011"), 
    ref = c("11.99.0023.006", "11.99.0031.001", "11.99.0012.004", 
    "11.99.0046.013", "11.99.0069.003", "11.99.0059.007", "11.99.0060.003", 
    "11.99.0030.001", "11.99.0016.007", "11.99.0077.021", "11.01.0003.015", 
    "11.99.0059.003", "11.01.0003.006", "11.99.0078.034", "11.99.0031.001", 
    "11.99.0038.005", "11.99.0025.005", "11.99.0040.006", "11.99.0040.006", 
    "11.99.0051.011")), .Names = c("rightsize", "hitlen", "linegroup", 
"leftsize", "leftspace", "Left", "Right", "Kwic", "toknum", "rightspace", 
"Tbl_refs", "ref"), class = "data.frame", row.names = c(NA, 20L
))

What I need to do is 1) transpose these 4 dataframes and assign the values in "class" to be the column headers. Note, #1, the number of columns may differ. Also note (#2) that some of the column names will be "". As such, the wonderful solution here results in dataframes in which some column headings are all filled with junk, making the next step (dataframe merging) impossible, e.g.

""
strc
structure("When patients with ", class = "AsIs")
coll
structure(" liver", class = "AsIs").

(The junk-fill headers seem to be the ones that were "", beyond the first.)

Following that step, I would then need to merge these dataframes, whilst accounting for missing values. Rbind.fill does the trick, but only when the data is sufficiently uniform. I have searched high & low for a solution, and have yet to find one that sufficiently addresses this issue.

Update: I have continued to experiment with melt/cast. The following brings me very close to an acceptable final solution:

# Load reshape2, which provides melt() and dcast().
require(reshape2)
# Melt the list of data frames in documentdata$Left into one long data frame,
# keeping "class" as the id column. The result carries an L1 column (used in
# the dcast formula below) that appears to index the originating list element.
docx <- melt(documentdata$Left, id.vars = c("class"))
# Cast back to wide form: one row per L1 + variable, one column per distinct
# "class" value, with each cell's values collected into a list.
# NOTE: all rows whose class is "" share a single column, so the unnamed
# pieces get merged and their original order is lost.
docx <- dcast(docx, L1 + variable ~ class, fun.aggregate=list)

The only problem is, as mentioned, the blank "class" causes the structure to be lost upon dcast: all of the unnamed columns wind up merged and out of order, e.g.

    L1  variable    Var.3   coll    strc
1    1  str patients with ,  obstructive pulmonary  chronic  
2    2  str respect to ,  obstructive pulmonary chronic  
3    3  str While there is no cure for this ,   chronic  
4    4  str ., When patients with ,  liver  chronic </p><p>
5    5  str bronchitis , and ,  obstructive pulmonary   chronic

The key "class" in the original data is the variable "coll", which always has at least one blank before and one blank after. One solution might be to create names "pre-coll" and "post-coll" prior to dcast?

Update #3: here's one possible, albeit ugly solution. Any "cleaner" options?

# Load reshape2, which provides melt() and dcast().
require(reshape2)
# Melt the list of data frames in documentdata$Left into one long data frame,
# keeping "class" as the id column; L1 (used below) tags the list element.
docx <- melt(documentdata$Left, id.vars = c("class"))
# Row positions immediately before and after every row whose class is "coll".
# NOTE(review): this is plain index arithmetic over the stacked rows; it
# assumes each "coll" row has a neighbour on both sides within its own list
# element (as stated in the question) -- TODO confirm for other data.
pre <- which(docx$class %in% c("coll")) - 1
post <- which(docx$class %in% c("coll")) + 1
# Give the neighbouring rows explicit class labels so they no longer share
# the blank "" class when cast to wide form.
docx$class[pre] = "l.pre"
docx$class[post] = "l.post"
# Cast to wide form: one row per L1 + variable, one column per class value,
# with each cell's values collected into a list.
docx <- dcast(docx, L1 + variable ~ class, fun.aggregate=list)
# Keep only the three columns of interest, in reading order.
docx.left <- docx[, c("l.pre", "coll", "l.post")]

Thanks in advance for the help.

if you share some of the data so it can be copy and pasted, you are much more likely to get an answer. To do this, try `dput(documentdata)` and paste it into the question — jeremycg, Aug 27 '15 at 01:24

jeremycg · Accepted Answer · 2015-08-27T12:57:52.633

Let's do it with dplyr:

# Load dplyr, which provides do() and re-exports the %>% pipe.
library(dplyr)
# Row-bind the list of data frames in documentdata$Left into one data frame,
# then build a new data frame with three columns taken from "str":
#   pre  - the row just before each row whose class is "coll"
#   coll - the "coll" row itself
#   post - the row just after each "coll" row
# Inside do(), `.` refers to the row-bound data frame piped in from the left.
# NOTE(review): the -1 / +1 offsets assume every "coll" row has a neighbour
# on both sides, as described in the question -- TODO confirm for other data.
documentdata$Left %>% do.call(rbind, .) %>%
                      do(data.frame(pre = .[["str"]][which(.[["class"]]=="coll")-1],
                                    coll = .[["str"]][which(.[["class"]]=="coll")], 
                                    post = .[["str"]][which(.[["class"]]=="coll")+1]))

                                           pre    coll                   post
1                               patients with  chronic  obstructive pulmonary
2                                  respect to  chronic  obstructive pulmonary
3             While there is no cure for this  chronic                       
4                          When patients with  chronic                  liver
5                            bronchitis , and  chronic  obstructive pulmonary
6                 offers the possibility that  chronic                   lung
....
18                               certainly be  chronic  obstructive pulmonary
19    cardiovascular disease , cancer , other  chronic                       
20        terminal illnesses are converted to  chronic

EDIT: an explanation: dplyr has a weird syntax. See the dplyr vignette or the data wrangling cheat sheet. The %>% is the pipe from the magrittr package and simply puts the output of everything on the left of the pipe as the first argument of the function to the right:

# The left-hand side of the pipe becomes the first argument of the call.
5 %>% c(1)
# same as
c(5, 1)

You can use the . to represent the stuff on the left if you want to use it somewhere else instead. You can subset the . if you like (eg the .[["str"]]) :

# With an explicit `.`, the left-hand side is placed where the dot appears.
5 %>% c(1, .)
# same as
c(1, 5)

do allows us to do any computation we want, without worrying about the standard dplyr verbs - it's a wrapper. See ?do.

So the answer takes the documentdata$Left, pipes it into do.call(rbind, .) which collapses the list (so far this is the same as do.call(rbind, documentdata$Left)). Then we pipe that to the do, which makes a new data frame with the relevant columns selected from the `.`.

That, sir, is beautiful. Thank you! If possible, would you explain this? I am particularly curious about the "%>%", "do", and "." usage. — Mach5RacerGoGo, Aug 27 '15 at 05:13

In R transpose and combine multiple dataframes with missing data and blank column names / rename melted columns prior to dcast

1 Answers1

Linked