2

I have data with measurements of the same subjects at 4 different stages. My goal is to make one graph that shows both a boxplot per stage and a point for each subject, with a line connecting each subject's points across the stages. The end result would combine both of the graphs below into one single final graph.

library(ggplot2)
# Simulated wide-format data: one row per subject, one value column per stage.
# The seed is fixed so that the random draws (and therefore every plot built
# from them) are identical on each run.
set.seed(42)
df_original <- data.frame(
  study_id = paste0("id", 1:10),
  val_stage1 = runif(10, 5.0, 6.0),
  val_stage2 = runif(10, 5.0, 6.5),
  val_stage3 = runif(10, 4.7, 5.8),
  val_stage4 = runif(10, 5.5, 7.0)
)
# Print the data for inspection.
df_original
# Reshape wide -> long: build one three-column frame (group, value, ID) per
# stage, then stack the four frames row-wise in stage order.
df_plot1 <- data.frame(
  group = "stage1",
  value = df_original[["val_stage1"]],
  ID = df_original[["study_id"]]
)
df_plot2 <- data.frame(
  group = "stage2",
  value = df_original[["val_stage2"]],
  ID = df_original[["study_id"]]
)
df_plot3 <- data.frame(
  group = "stage3",
  value = df_original[["val_stage3"]],
  ID = df_original[["study_id"]]
)
df_plot4 <- data.frame(
  group = "stage4",
  value = df_original[["val_stage4"]],
  ID = df_original[["study_id"]]
)

plot_data <- do.call(rbind, list(df_plot1, df_plot2, df_plot3, df_plot4))
# One box per stage (filled by stage) with the raw observations overlaid.
# Outliers are hidden in the box layer because geom_point() already draws
# every observation.
ggplot(data = plot_data, mapping = aes(x = group, y = value, fill = group)) +
  geom_boxplot(outlier.shape = NA) +
  geom_point()

boxplot

library(lattice)
# source https://publicifsv.sund.ku.dk/~jufo/courses/rm2017/plotRrepeated.pdf
# One connected trace per subject across the stages (type = "b" draws both
# points and lines). The lattice argument is `groups`; it is spelled out in
# full rather than relying on partial argument matching of `group`.
xyplot(value ~ group, groups = ID, data = plot_data, type = "b")

repeated measures with lines between stage

ecjb
  • 5,169
  • 12
  • 43
  • 79

2 Answers2

4

A tidyverse approach looks like this:

library(tidyverse)

# The stage column has to be a factor: its levels supply the axis labels and
# its integer codes supply the numeric x positions. as.factor() leaves an
# existing factor untouched and converts a character column (the default
# result of data.frame() since R 4.0) instead of yielding NULL labels / NA
# positions.
mylabs <- levels(as.factor(plot_data$group))

plot_data %>%
  mutate(group = factor(group, levels = mylabs)) %>%
  # A continuous x axis lets geom_line() connect points across the stages.
  ggplot(aes(x = as.numeric(group), y = value)) +
  # One box per stage; grouping must be explicit because x is now numeric.
  geom_boxplot(aes(group = group, fill = group), outlier.shape = NA) +
  geom_point(aes(color = ID)) +
  # One line per subject; the grouping is stated explicitly rather than
  # inferred from the colour mapping.
  geom_line(aes(color = ID, group = ID)) +
  theme(legend.position = "none") +
  labs(x = "Group") +
  # Put the stage names back on the numeric axis, one break per stage.
  scale_x_continuous(breaks = seq_along(mylabs), labels = mylabs)

enter image description here

Zhiqiang Wang
  • 6,206
  • 2
  • 13
  • 27
2

A solution using boxplot() and lines(). I'm not sure the combined plot will be easy to read, so you may want to fade the boxplots a little, as shown.

# Base-graphics version: faded boxplots per stage with one coloured line per
# subject drawn on top.

# Work on a copy whose stage column is a factor: lines() needs the factor's
# integer codes as x positions, and a character column would not provide them.
pd <- transform(plot_data, group = as.factor(group))

lvl <- lapply(pd[c("group", "ID")], unique)  # distinct stages and subjects

# One rainbow palette per variable. Alpha controls the fade: .1 for the
# boxplot fills, 1 (fully opaque) for the subject lines.
clr <- Map(function(x, alpha) rainbow(length(x), alpha = alpha),
           lvl, alpha = c(.1, 1))

boxplot(value ~ group, data = pd, border = "darkgrey", col = clr$group)

# Overlay each subject's trajectory. A plain loop is used because lines() is
# called purely for its side effect (sapply() would collect and print NULLs).
for (i in seq_along(lvl$ID)) {
  lines(value ~ group, data = pd[pd$ID == lvl$ID[i], ], type = "b",
        col = clr$ID[i], lwd = 2)
}

legend("topleft", legend = lvl$ID, lwd = 2, col = clr$ID, ncol = 2, cex = .8,
       bty = "n")

Yields

enter image description here


Data:

# Fixed copy of the long-format data as a literal, so the example does not
# depend on random number generation. It is a 40-row data frame with:
#   group - factor with levels "stage1" to "stage4" (10 rows per stage)
#   value - numeric measurement
#   ID    - factor of subject ids; its levels are listed as "id1", "id10",
#           "id2", ..., so "id10" is level 2
plot_data <- structure(list(group = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L), .Label = c("stage1", "stage2", "stage3", "stage4"), class = "factor"), 
    value = c(5.78956733993255, 5.88809484057128, 5.10837934492156, 
    5.4757885155268, 5.97264352883212, 5.67352280486375, 5.20609766873531, 
    5.86022568447515, 5.81150085269473, 5.94000163977034, 6.0783141606953, 
    5.94233451236505, 6.49362113315146, 6.12048651557416, 6.1347389126895, 
    5.20223867462482, 5.87281575519592, 5.73356315004639, 5.75488595489878, 
    6.36840553430375, 4.99227170993108, 5.18667783471756, 4.99904926030431, 
    5.15853247770574, 5.11713153058663, 5.49876751452684, 5.20934719201177, 
    5.02541789095849, 5.01650351425633, 5.5694368747063, 6.42072400392499, 
    5.5407249458367, 5.87118571228348, 6.68436990131158, 6.81803358788602, 
    6.84979289071634, 6.78138321859296, 6.70059150888119, 5.99049715092406, 
    6.45158472727053), ID = structure(c(1L, 3L, 4L, 5L, 6L, 7L, 
    8L, 9L, 10L, 2L, 1L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 2L, 
    1L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 2L, 1L, 3L, 4L, 5L, 
    6L, 7L, 8L, 9L, 10L, 2L), .Label = c("id1", "id10", "id2", 
    "id3", "id4", "id5", "id6", "id7", "id8", "id9"), class = "factor")), row.names = c(NA, 
-40L), class = "data.frame")
jay.sf
  • 60,139
  • 8
  • 53
  • 110
  • Great!!! many thanks @jay.sf!! One question: what is the advantage of rearranging the data as you did? – ecjb Jan 11 '20 at 11:27
  • very welcome @ecjb. I'm not sure what you exactly mean by "rearranging"? – jay.sf Jan 11 '20 at 11:30
  • I mean this `list(group = structure)` and `1L, 3L, 4L` business instead of the structure of the data frame I originally had. But that's fine-tuning and maybe not a very precise question. – ecjb Jan 11 '20 at 11:38
  • Ah, the "data" appendix is just to provide [reproducible data](https://stackoverflow.com/a/5963610/6574038) (because you're using random generation) the Stack Overflow way, you may use your own data. – jay.sf Jan 11 '20 at 11:40