1

Here is my dataframe;

# Example data as dput() output: a 120-row data.frame with four columns --
# variable, variable_importance (numeric), variable_short (the label plotted
# on the x axis below) and class ("soil", "spectral" or "topo"; mapped to
# colour in the plots below).
df <- structure(list(variable = c("B.Al", "OA.P", "NDVI_10", "OA.Al", 
"tpi2000.MEAN", "solar_rad_total_20m", "B.Al", "TAS_mean.MEAN", 
"solar_rad_total_20m", "OA.pH", "tpi25.MEAN", "twi_dd.MEAN", 
"B.Al", "B.Ca", "TAS_slope.MEAN", "B.Ca", "NDWI_10", "TAS_slope.MEAN", 
"B.Ca", "OA.Ca", "TAS_slope.MEAN", "OA.Al", "B.Ca", "twi_dd.MEAN", 
"TAS_mean.MEAN", "tpi2000.MEAN", "twi_dd.STD", "OA.Ca", "OA.pH", 
"TAS_mean.MEAN", "OA.Ca", "tpi25.MEAN", "solar_rad_total_20m", 
"NDVI_10", "twi_dd.MEAN", "twi_dd.STD", "B.Ca", "B.Na", "tpi2000.MEAN", 
"OA.Na", "NDVI_10", "TAS_slope.MEAN", "B.Al", "tpi2000.MEAN", 
"NDVI_10", "TAS_mean.MEAN", "TAS_northness.MEAN", "solar_rad_total_20m", 
"OA.P", "TAS_mean.MEAN", "tpi2000.MEAN", "OA.Na", "OA.P", "NDVI_10", 
"B.Ca", "TAS_northness.MEAN", "tpi2000.MEAN", "OA.Al", "B.C_N", 
"TAS_mean.MEAN", "OA.Na", "tpi2000.MEAN", "twi_dd.MEAN", "OA.P", 
"OA.pH", "NDWI_10", "B.Ca", "OA.depth", "TAS_slope.MEAN", "OA.Al", 
"OA.Ca", "NDVI_10", "OA.Na", "OA.depth", "tpi25.MEAN", "B.Na", 
"TAS_slope.MEAN", "NDWI_10", "B.Ca", "OA.Na", "NDWI_10", "TAS_slope.MEAN", 
"OA.P", "twi_dd.MEAN", "B.P", "B.C", "twi_dd.STD", "OA.Na", "OA.P", 
"twi_dd.STD", "B.Al", "MCARI_MTVI", "TAS_mean.MEAN", "B.Al", 
"B.Ca", "B.P", "B.C_N", "TAS_slope.MEAN", "twi_dd.MEAN", "OA.Al", 
"TAS_mean.MEAN", "tpi2000.MEAN", "B.Al", "B.Ca", "NDWI_10", "B.Al", 
"B.Na", "tpi2000.MEAN", "OA.depth", "TAS_mean.MEAN", "TAS_northness.MEAN", 
"B.C_N", "TAS_mean.MEAN", "NDWI_10", "B.Na", "TAS_slope.MEAN", 
"twi_dd.STD", "B.Ca", "TAS_mean.MEAN", "NDWI_10"), variable_importance = c(0.0583456, 
0.0572622, 0.7949162, 0.145154, 0.1965898, 0.631507, 0.0319048, 
0.9834534, 0.0105422, 0.07857, 0.3157312, 0.403983, 0.095685, 
0.8925714, 0.0548878, 0.5588186, 0.0733602, 0.526027, 0.9339486, 
0.2531884, 0.048884, 0.123377, 0.6073132, 0.2345292, 0.66771, 
0.21304, 0.0367912, 0.2241128, 0.2298776, 0.5071346, 0.259179, 
0.6296734, 0.1123266, 0.3318268, 0.1044384, 0.5294008, 0.4846202, 
0.0590374, 0.4674416, 0.2007248, 0.2541912, 0.3864322, 0.1323852, 
0.3674916, 0.6370222, 0.9318416, 0.0174854, 0.0552058, 0.1484992, 
0.7697134, 0.213332, 0.2768872, 0.1104194, 0.612905, 0.8139634, 
0.0905556, 0.0680632, 0.071293, 0.1307058, 0.7604958, 0.2264404, 
0.4453206, 0.3187728, 0.4391702, 0.1647728, 0.396783, 0.8386238, 
0.1099, 0.1145692, 0.9350212, 0.0378414, 0.0344502, 0.2856692, 
0.260972, 0.4203974, 0.114788, 0.675816, 0.173443, 0.934695, 
0.0927296, 0.0231832, 0.7535372, 0.1556188, 0.0567598, 0.1814224, 
0.1409008, 0.6511174, 0.565503, 0.4724184, 0.0136072, 0.3129622, 
0.129463, 0.4748478, 0.2831364, 0.6665722, 0.0824932, 0.0504342, 
0.9397376, 0.0456134, 0.233926, 0.567116, 0.1146926, 0.4097234, 
0.2708894, 0.2494134, 0.1986246, 0.1362926, 0.7396076, 0.0500134, 
0.9081236, 0.0989256, 0.0865266, 0.7041882, 0.1453008, 0.1198452, 
0.8600394, 0.1203448, 0.4742014, 0.2307082, 0.1815164), variable_short = c("B.Al", 
"OA.P", "NDVI", "OA.Al", "tpi2000", "solar_rad", "B.Al", "elev.", 
"solar_rad", "OA.pH", "tpi25", "twi", "B.Al", "B.Ca", "slope", 
"B.Ca", "NDWI", "slope", "B.Ca", "OA.Ca", "slope", "OA.Al", "B.Ca", 
"twi", "elev.", "tpi2000", "twi_st.d", "OA.Ca", "OA.pH", "elev.", 
"OA.Ca", "tpi25", "solar_rad", "NDVI", "twi", "twi_st.d", "B.Ca", 
"B.Na", "tpi2000", "OA.Na", "NDVI", "slope", "B.Al", "tpi2000", 
"NDVI", "elev.", "northness", "solar_rad", "OA.P", "elev.", "tpi2000", 
"OA.Na", "OA.P", "NDVI", "B.Ca", "northness", "tpi2000", "OA.Al", 
"B.C_N", "elev.", "OA.Na", "tpi2000", "twi", "OA.P", "OA.pH", 
"NDWI", "B.Ca", "OA.depth", "slope", "OA.Al", "OA.Ca", "NDVI", 
"OA.Na", "OA.depth", "tpi25", "B.Na", "slope", "NDWI", "B.Ca", 
"OA.Na", "NDWI", "slope", "OA.P", "twi", "B.P", "B.C", "twi_st.d", 
"OA.Na", "OA.P", "twi_st.d", "B.Al", "MCARI_MTVI", "elev.", "B.Al", 
"B.Ca", "B.P", "B.C_N", "slope", "twi", "OA.Al", "elev.", "tpi2000", 
"B.Al", "B.Ca", "NDWI", "B.Al", "B.Na", "tpi2000", "OA.depth", 
"elev.", "northness", "B.C_N", "elev.", "NDWI", "B.Na", "slope", 
"twi_st.d", "B.Ca", "elev.", "NDWI"), class = c("soil", "soil", 
"spectral", "soil", "topo", "topo", "soil", "topo", "topo", "soil", 
"topo", "topo", "soil", "soil", "topo", "soil", "spectral", "topo", 
"soil", "soil", "topo", "soil", "soil", "topo", "topo", "topo", 
"topo", "soil", "soil", "topo", "soil", "topo", "topo", "spectral", 
"topo", "topo", "soil", "soil", "topo", "soil", "spectral", "topo", 
"soil", "topo", "spectral", "topo", "topo", "topo", "soil", "topo", 
"topo", "soil", "soil", "spectral", "soil", "topo", "topo", "soil", 
"soil", "topo", "soil", "topo", "topo", "soil", "soil", "spectral", 
"soil", "soil", "topo", "soil", "soil", "spectral", "soil", "soil", 
"topo", "soil", "topo", "spectral", "soil", "soil", "spectral", 
"topo", "soil", "topo", "soil", "soil", "topo", "soil", "soil", 
"topo", "soil", "spectral", "topo", "soil", "soil", "soil", "soil", 
"topo", "topo", "soil", "topo", "topo", "soil", "soil", "spectral", 
"soil", "soil", "topo", "soil", "topo", "topo", "soil", "topo", 
"spectral", "soil", "topo", "topo", "soil", "topo", "spectral"
)), row.names = c(NA, -120L), class = "data.frame")

This is the code for the plot I am currently making;

# Current plot: one box per variable_short, outlined in the colour of its
# class, with the raw observations overlaid as points (grouped by class and
# dodged with the same 0.75 width). Boxplot outliers are hidden because the
# points already show every observation.
ggplot(
  data = df,
  mapping = aes(x = variable_short, y = variable_importance)
) +
  geom_boxplot(mapping = aes(colour = class), outlier.colour = NA) +
  geom_point(
    mapping = aes(group = class),
    position = position_dodge(width = 0.75)
  )

Here is the plot;

enter image description here

I want the x axis to be organized so that the variable classes are grouped together (i.e., red is all together, blue is together, and green is together) rather than being ordered alphabetically.

I found great solutions for organizing the x axis based on a numerical value using reorder within the aes function (Reorder bars in geom_bar ggplot2), but was surprised to not find good documentation on SO about how to organize the x axis by category. I tried applying reorder for organizing the x axis by group but was unsuccessful.

I realize that I can use something like the following (credit: Gavin Simpson's answer to "Order Bars in ggplot2 bar graph"):

# Manual alternative: write out every level by hand in the desired axis
# order ("...." is a stand-in for that full list of labels).
df[["variable_short"]] <- factor(df[["variable_short"]], levels = c("...."))

but it seems like there should be a much more elegant solution.

Can reorder be used for groups or is there a similarly elegant solution?

nateroe
  • 487
  • 3
  • 20
  • Blue, red, and green together makes sense and is easy. Do you care about the order *within* each of the color groups? Specific order? Alphabetical order? Descending by mean, ascending by maximum.... – Gregor Thomas Oct 05 '20 at 15:35
  • I would love to see how to do descending by mean within the groups. Thanks! – nateroe Oct 05 '20 at 15:37
  • 1
    See also the [R-FAQ](https://stackoverflow.com/q/5208679/903061) on the issue, which has more answers and more methods for various situations. – Gregor Thomas Oct 05 '20 at 15:37
  • @GregorThomas, thank you for the link. It is one of the links I shared also. I did not see a solution specific to this scenario in that link, but it has several good answers for similar situations as you stated. – nateroe Oct 05 '20 at 15:49
  • With `factor(..., levels = my_order)` you can set the order of the levels (and thus the order of the axis) in any order you care to write/calculate - this is the general solution used in different ways in each of the top 3 answers at the FAQ (and in the second option in my answer here). But yes, there's no specific example there with groups like you have here. – Gregor Thomas Oct 05 '20 at 15:51

2 Answers

2

With this solution you can apply a double reordering without a dplyr pipeline.

# Order the x axis by class first, then by descending mean importance within
# each class, without a separate data-preparation step.
#  - The inner reorder() sorts the levels by the mean of -variable_importance,
#    i.e. from the highest mean importance to the lowest.
#  - The outer reorder() re-sorts those levels by the numeric class code
#    (reorder()'s default summary is the mean, which is constant within a
#    variable here); levels that tie on class keep the inner ordering.
ggplot(
  df,
  aes(
    x = reorder(
      reorder(variable_short, -variable_importance, mean),
      as.numeric(factor(class))
    ),
    y = variable_importance
  )
) +
  geom_boxplot(aes(colour = class), outlier.colour = NA) +
  geom_point(aes(group = class), position = position_dodge(width = 0.75)) +
  # Without an explicit label the axis title would be the deparsed nested
  # reorder() expression.
  labs(x = "variable_short")

enter image description here


The median may be more visually appealing, because the middle lines of the boxes are in order.

# Same double reordering as with the mean, but the inner reorder() ranks the
# levels by the median of -variable_importance, so the middle lines of the
# boxes descend within each class. The outer reorder() then groups the levels
# by numeric class code; levels that tie on class keep the inner ordering.
ggplot(
  df,
  aes(
    x = reorder(
      reorder(variable_short, -variable_importance, median),
      as.numeric(factor(class))
    ),
    y = variable_importance
  )
) +
  geom_boxplot(aes(colour = class), outlier.colour = NA) +
  geom_point(aes(group = class), position = position_dodge(width = 0.75)) +
  # Without an explicit label the axis title would be the deparsed nested
  # reorder() expression.
  labs(x = "variable_short")

enter image description here

Edo
  • 7,567
  • 2
  • 9
  • 19
2

We can use reorder to order the x-axis variable_short factor by the negative mean of the y-axis variable_importance, and use faceting to get the colors together. This also adds nice facet labels so you could potentially remove the legend:

# Sort the levels of variable_short by descending mean importance: reorder()
# summarises with the mean by default, applied here to the negated values.
df$variable_short <- with(df, reorder(variable_short, -variable_importance))

# One panel per class. Free x scales drop the variables that do not belong to
# a panel, and free x spacing sizes each panel by the number of variables it
# holds.
ggplot(
  data = df,
  mapping = aes(x = variable_short, y = variable_importance)
) +
  geom_boxplot(mapping = aes(colour = class), outlier.colour = NA) +
  geom_point(
    mapping = aes(group = class),
    position = position_dodge(width = 0.75)
  ) +
  facet_grid(. ~ class, scales = "free_x", space = "free_x")

enter image description here


If you don't want the facets, we can calculate the correct order and use the levels argument of factor() to set it:

library(dplyr)

# Build the axis order explicitly: one row per (class, variable_short) pair
# with its mean importance, sorted by class and then by descending mean.
# .groups = "drop" returns an ungrouped result directly, so no separate
# ungroup() is needed and summarize() does not emit its regrouping message.
var_order <- df %>%
  group_by(class, variable_short) %>%
  summarize(var_mean = mean(variable_importance), .groups = "drop") %>%
  arrange(class, desc(var_mean)) %>%
  pull(variable_short)

# Factor levels determine the order of a discrete axis in ggplot2.
df$variable_short <- factor(df$variable_short, levels = var_order)
ggplot(df, aes(x = variable_short, y = variable_importance)) +
  geom_boxplot(aes(colour = class), outlier.colour = NA) +
  geom_point(aes(group = class), position = position_dodge(width = 0.75))

enter image description here

Gregor Thomas
  • 136,190
  • 20
  • 167
  • 294