Boxplot spacing and reordering Y-axis variables

Question

I am trying to make boxplots using ggplot2 to display survey data. Participants rated statements on a Likert scale (1-5). My data has statement codes as columns (e.g. know_1) and participant IDs as rows, and each cell holds the rating value (1-5 or NA). Here are my steps:

#### Convert to long form

# Reshape the wide survey table (one column per statement) into long form:
# one row per participant/statement pair, with the statement code in `domain`
# and the rating in `TDF_score`. pivot_longer() replaces the superseded
# gather(); the resulting rows are ordered by participant rather than by
# statement, which does not affect the factor conversion or the boxplot.
survey_long <- survey %>%
  pivot_longer(
    cols = know_1:change32,
    names_to = "domain",
    values_to = "TDF_score"
  )

#### Convert to factors

# Statement codes in the order they should appear as factor levels.
domain_levels <- c(
  "know_1", "know_2", "know_3",
  "beh_reg4", "beh_reg5", "beh_reg6",
  "belief_conseq7", "belief_conseq8", "belief_conseq9",
  "capacity10", "capacity11", "capacity12",
  "skills13", "skills14", "skills15",
  "environ16", "environ17", "environ18", "environ19",
  "role20", "emotion21", "memory22", "goal23",
  "reinforce24", "reinforce25", "optimism26",
  "change27", "change28", "change29", "change30", "change31", "change32"
)

# Convert `domain` to a factor with that explicit level order. factor() uses
# the levels as labels by default, so a separate identical `labels` vector is
# not needed.
survey_long <- survey_long %>%
  mutate(domain = factor(domain, levels = domain_levels))

#### Boxplot

# Horizontal boxplot of ratings per statement: one fill colour per statement,
# legend hidden. `cleanup` is added last; it is not defined in this snippet.
ggplot(
  data = survey_long,
  mapping = aes(x = domain, y = TDF_score, fill = domain)
) +
  geom_boxplot() +
  coord_flip() +
  labs(
    title = "Total sample-Domains of Behavior Change",
    y = "Likert Scale Rating \n(1 Strongly Disagree - 5 Strongly Agree)"
  ) +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_text(color = "grey20", size = 10),
    axis.text.x = element_text(size = 10),
    axis.text.y = element_text(color = "grey20", size = 10, vjust = 0.5),
    legend.position = "none"
  ) +
  cleanup

I would like to make the following changes to this graph, but I am struggling to get the result I want:

I plan to change the y-axis labels to the actual statements so I would need to wrap the text to fit a longer statement and allow for spacing between each boxplot.
Reorder the statements on the y-axis so that items whose sample distributions have lower scores (i.e. 2-3) are lower on the graph and items whose distributions have higher scores (>4) are at the top of the graph.
I also want to change the colors by selecting a color for each bar

I included a sub-set of the data to reproduce results using dput. Thank you so much for your help

# Sample data as dput() output: a tibble with 17 rows (row.names = c(NA, -17L))
# holding `record_id` plus the 32 statement columns know_1 ... change32.
# The expression is not assigned to a name here; assign it to `survey` to run
# the steps above.
# NOTE(review): a few cells contain 0 (e.g. beh_reg6, environ16), which is
# outside the 1-5 range described in the question -- confirm whether intended.
structure(list(record_id = c(1, 3, 5, 6, 7, 8, 9, 11, 12, 13, 
14, 15, 16, 17, 18, 19, 20), know_1 = c(2, 1, 1, 1, 2, 3, NA, 
1, NA, 4, 1, 2, 2, 3, 3, 4, 4), know_2 = c(1, 3, 3, 3, 2, 3, 
NA, 1, NA, 3, 3, 2, 3, 3, 4, 4, 4), know_3 = c(3, 4, 3, 3, 2, 
3, NA, 2, NA, 3, 4, 3, 3, 4, 4, 4, 4), beh_reg4 = c(1, 2, 3, 
3, 1, 3, NA, 1, NA, 3, 3, 2, 3, 3, 4, 3, 3), beh_reg5 = c(1, 
4, 3, 3, 2, 3, NA, 1, NA, 3, 3, 4, 3, 3, 3, 3, 4), beh_reg6 = c(0, 
1, 3, 3, 3, 3, NA, 1, NA, 3, 3, 4, 3, 3, 3, 4, 2), belief_conseq7 = c(4, 
4, 3, 3, 2, 2, NA, 3, NA, 4, 3, 3, 3, 3, 4, 2, 2), belief_conseq8 = c(3, 
4, 3, 3, 2, 3, NA, 3, NA, 1, 3, 4, 3, 3, 4, 2, 2), belief_conseq9 = c(4, 
4, 4, 3, 3, 3, NA, 4, NA, 4, 3, 3, 3, 3, 4, 4, 3), capacity10 = c(1, 
3, 3, 3, 1, 2, NA, 1, NA, 3, 3, 3, 3, 3, 4, 4, 3), capacity11 = c(3, 
2, 3, 3, 1, 4, NA, 2, NA, 3, 3, 4, 3, 3, 4, 4, 4), capacity12 = c(1, 
1, 3, 2, 1, 4, NA, 2, NA, 3, 3, 4, 4, 3, 3, 4, 3), skills13 = c(4, 
4, 1, 3, 4, 4, NA, 4, NA, 3, 3, 3, 3, 3, 2, 3, 3), skills14 = c(4, 
4, 3, 4, 4, 4, NA, 4, NA, 3, 3, 1, 3, 3, 2, 3, 3), skills15 = c(4, 
4, 3, 4, 4, 4, NA, 4, NA, 3, 3, 1, 3, 3, 2, 3, 3), environ16 = c(3, 
3, 1, 3, 1, 4, NA, 0, NA, 3, 1, 3, 3, 1, 1, 0, 3), environ17 = c(3, 
2, 1, 1, 2, 4, NA, 0, NA, 3, 2, 2, 3, 3, 0, 3, 2), environ18 = c(1, 
1, 1, 1, 2, 3, NA, 0, NA, 3, 2, 3, 5, 1, 1, 2, 1), environ19 = c(3, 
3, 1, 1, 1, 4, NA, 2, NA, 3, 2, 3, 3, 5, 1, 3, 3), role20 = c(4, 
4, 3, 3, 2, 3, NA, 3, NA, 3, 3, 3, 3, 3, 3, 3, 3), emotion21 = c(3, 
4, 3, 3, 5, 2, NA, 3, NA, 3, 4, 3, 3, 3, 4, 3, 3), memory22 = c(1, 
4, 3, 3, 0, 3, NA, 2, NA, 2, 3, 3, 3, 3, 3, 3, 3), goal23 = c(2, 
4, 3, 3, 2, 3, NA, 2, NA, 3, 4, 3, 3, 3, 3, 4, 3), reinforce24 = c(2, 
4, 2, 3, 5, 2, NA, 5, NA, 2, 2, 3, 3, 3, 3, 2, 2), reinforce25 = c(4, 
2, 3, 2, 5, 2, NA, 3, NA, 2, 2, 3, 3, 1, 2, 1, 1), optimism26 = c(3, 
4, 3, 3, 5, 2, NA, 3, NA, 2, 2, 3, 3, 3, 3, 3, 2), change27 = c(4, 
4, 3, 4, 3, 4, NA, 3, NA, 4, 4, 4, 4, 3, 3, 4, 4), change28 = c(4, 
3, 3, 2, 3, 4, NA, 5, NA, 4, 5, 4, 3, 3, 3, 4, 5), change29 = c(3, 
3, 3, 3, 3, 4, NA, 1, NA, 4, 5, 4, 3, 2, 1, 4, 5), change30 = c(3, 
3, 2, 2, 3, 4, NA, 2, NA, 2, 3, 3, 2, 3, 1, 2, 2), change31 = c(3, 
3, 3, 3, 3, 4, NA, 3, NA, 3, 4, 3, 3, 3, 3, 3, 3), change32 = c(3, 
3, 3, 2, 3, 4, NA, 3, NA, 3, 3, 4, 3, 3, 3, 3, 3)), class = c("spec_tbl_df", 
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -17L), spec = structure(list(
    cols = list(record_id = structure(list(), class = c("collector_double", 
    "collector")), know_1 = structure(list(), class = c("collector_double", 
    "collector")), know_2 = structure(list(), class = c("collector_double", 
    "collector")), know_3 = structure(list(), class = c("collector_double", 
    "collector")), beh_reg4 = structure(list(), class = c("collector_double", 
    "collector")), beh_reg5 = structure(list(), class = c("collector_double", 
    "collector")), beh_reg6 = structure(list(), class = c("collector_double", 
    "collector")), belief_conseq7 = structure(list(), class = c("collector_double", 
    "collector")), belief_conseq8 = structure(list(), class = c("collector_double", 
    "collector")), belief_conseq9 = structure(list(), class = c("collector_double", 
    "collector")), capacity10 = structure(list(), class = c("collector_double", 
    "collector")), capacity11 = structure(list(), class = c("collector_double", 
    "collector")), capacity12 = structure(list(), class = c("collector_double", 
    "collector")), skills13 = structure(list(), class = c("collector_double", 
    "collector")), skills14 = structure(list(), class = c("collector_double", 
    "collector")), skills15 = structure(list(), class = c("collector_double", 
    "collector")), environ16 = structure(list(), class = c("collector_double", 
    "collector")), environ17 = structure(list(), class = c("collector_double", 
    "collector")), environ18 = structure(list(), class = c("collector_double", 
    "collector")), environ19 = structure(list(), class = c("collector_double", 
    "collector")), role20 = structure(list(), class = c("collector_double", 
    "collector")), emotion21 = structure(list(), class = c("collector_double", 
    "collector")), memory22 = structure(list(), class = c("collector_double", 
    "collector")), goal23 = structure(list(), class = c("collector_double", 
    "collector")), reinforce24 = structure(list(), class = c("collector_double", 
    "collector")), reinforce25 = structure(list(), class = c("collector_double", 
    "collector")), optimism26 = structure(list(), class = c("collector_double", 
    "collector")), change27 = structure(list(), class = c("collector_double", 
    "collector")), change28 = structure(list(), class = c("collector_double", 
    "collector")), change29 = structure(list(), class = c("collector_double", 
    "collector")), change30 = structure(list(), class = c("collector_double", 
    "collector")), change31 = structure(list(), class = c("collector_double", 
    "collector")), change32 = structure(list(), class = c("collector_double", 
    "collector"))), default = structure(list(), class = c("collector_guess", 
    "collector")), skip = 1), class = "col_spec"))

score 0 · Answer 1 · answered Apr 28 '22 at 11:11

Follow instructions here to wrap the text of the question in the dataframe
See below
I am assuming you want to group the columns by type (e.g. know_1, know_2 and know_3 all have the same color). If so, an easy way to do this is to create a grouping column in the dataframe and then map the plot's fill aesthetic to that group instead of to the individual column. I don't recommend having a separate color for every question; there are far too many. If you insist on this, this page gives a lot of information on colors: you'd want to set a palette with one color per question, in order, and use scale_fill_manual().

For the sorting, you need to define what you mean by "distributions with higher scores." According to what? Mean, median, IQR, etc.? I assumed the mean for the purpose of demonstrating this: I calculated the mean for each question in the data frame, then used reorder() to apply that to the y-axis order. I made up a simpler set of data in Excel and pasted it in with the datapasta package's RStudio add-in.

# Small made-up example: 30 responses (1-5) alternating between two questions.
# `question` is converted to a factor, and `avg_score` holds each question's
# mean response (repeated on every row) so a plot can sort questions by it.
dat <- tibble::tribble(
  ~id,                                                ~question, ~response,
   1L,          "on a scale of 1 to 5 how did it make you feel",        3L,
   2L, "on a scale of 1 to 5 how did it meet your expectations",        3L,
   3L,          "on a scale of 1 to 5 how did it make you feel",        5L,
   4L, "on a scale of 1 to 5 how did it meet your expectations",        1L,
   5L,          "on a scale of 1 to 5 how did it make you feel",        1L,
   6L, "on a scale of 1 to 5 how did it meet your expectations",        3L,
   7L,          "on a scale of 1 to 5 how did it make you feel",        2L,
   8L, "on a scale of 1 to 5 how did it meet your expectations",        3L,
   9L,          "on a scale of 1 to 5 how did it make you feel",        1L,
  10L, "on a scale of 1 to 5 how did it meet your expectations",        3L,
  11L,          "on a scale of 1 to 5 how did it make you feel",        5L,
  12L, "on a scale of 1 to 5 how did it meet your expectations",        1L,
  13L,          "on a scale of 1 to 5 how did it make you feel",        3L,
  14L, "on a scale of 1 to 5 how did it meet your expectations",        4L,
  15L,          "on a scale of 1 to 5 how did it make you feel",        1L,
  16L, "on a scale of 1 to 5 how did it meet your expectations",        1L,
  17L,          "on a scale of 1 to 5 how did it make you feel",        5L,
  18L, "on a scale of 1 to 5 how did it meet your expectations",        5L,
  19L,          "on a scale of 1 to 5 how did it make you feel",        4L,
  20L, "on a scale of 1 to 5 how did it meet your expectations",        2L,
  21L,          "on a scale of 1 to 5 how did it make you feel",        4L,
  22L, "on a scale of 1 to 5 how did it meet your expectations",        3L,
  23L,          "on a scale of 1 to 5 how did it make you feel",        5L,
  24L, "on a scale of 1 to 5 how did it meet your expectations",        1L,
  25L,          "on a scale of 1 to 5 how did it make you feel",        3L,
  26L, "on a scale of 1 to 5 how did it meet your expectations",        2L,
  27L,          "on a scale of 1 to 5 how did it make you feel",        5L,
  28L, "on a scale of 1 to 5 how did it meet your expectations",        2L,
  29L,          "on a scale of 1 to 5 how did it make you feel",        1L,
  30L, "on a scale of 1 to 5 how did it meet your expectations",        4L
  ) %>%
  mutate(question = as.factor(question)) %>%
  group_by(question) %>%
  # na.rm = TRUE so a missing response does not turn a question's mean into NA
  mutate(avg_score = mean(response, na.rm = TRUE)) %>%
  # drop the grouping so later operations on `dat` are not silently per-question
  ungroup()

# default order: questions are drawn in factor-level order, which as.factor()
# derives by sorting the question text
ggplot(data = dat, mapping = aes(y = question, x = response)) +
  geom_boxplot()

# sort by mean: reorder() orders the question levels by avg_score (ascending),
# so the lowest-scoring question is drawn at the bottom of the y-axis.
# labs() sets a readable axis title; without it the title would be the raw
# expression "reorder(question, avg_score)".
ggplot(dat, aes(x = response, y = reorder(question, avg_score))) +
  geom_boxplot() +
  labs(y = "question")

Boxplot spacing and reordering Y-axis variables

1 Answers1