I am working on a dumbbell plot in R inspired by this post, and have two problems:
- Ordering the dumbbell plot (I've tried a strategy provided in this post)
- Present value labels in the plot in an aesthetically pleasing way.
My data set is formatted as a wide data set with 18 units with the following structure:
> head(ADHD_med_2010_2018_wide, 18)
# A tibble: 18 x 9
age gender county adhd_pr_1000_2010 adhd_pr_1000_2018 county_label adhd_2010 adhd_2018 diff
<dbl+lbl> <dbl+lbl> <dbl+lbl> <dbl> <dbl> <fct> <dbl> <dbl> <dbl>
1 2 [10-14] 1 [Both genders] 1 [Østfold] 32.1 24.3 Østfold 32.1 24.3 -7.80
2 2 [10-14] 1 [Both genders] 2 [Akershus] 20.6 23.0 Akershus 20.6 23 2.40
3 2 [10-14] 1 [Both genders] 3 [Oslo] 17.2 33.9 Oslo 17.2 33.9 16.7
4 2 [10-14] 1 [Both genders] 4 [Hedmark] 41.7 30.9 Hedmark 41.7 30.9 -10.8
5 2 [10-14] 1 [Both genders] 5 [Oppland] 24.9 39.0 Oppland 24.9 39 14.1
6 2 [10-14] 1 [Both genders] 6 [Buskerud] 26.7 36.8 Buskerud 26.7 36.8 10.1
7 2 [10-14] 1 [Both genders] 7 [Vestfold] 28.1 27.1 Vestfold 28.1 27 -1.10
8 2 [10-14] 1 [Both genders] 8 [Telemark] 29.2 24.7 Telemark 29.2 24.7 -4.5
9 2 [10-14] 1 [Both genders] 9 [Aust-Agder] 34.9 39.2 Aust-Agder 34.9 39.2 4.30
10 2 [10-14] 1 [Both genders] 10 [Vest-Agder] 17.4 23.8 Vest-Agder 17.4 23.8 6.40
11 2 [10-14] 1 [Both genders] 11 [Rogaland] 29.5 13.8 Rogaland 29.5 13.8 -15.7
12 2 [10-14] 1 [Both genders] 12 [Hordaland] 21.3 14.4 Hordaland 21.3 14.4 -6.90
13 2 [10-14] 1 [Both genders] 14 [Sogn og Fjordane] 21.3 39.7 Sogn og Fjordane 21.3 39.7 18.4
14 2 [10-14] 1 [Both genders] 15 [Møre og Romsdal] 27.0 18.6 Møre og Romsdal 27 18.6 -8.40
15 2 [10-14] 1 [Both genders] 18 [Nordland] 40.1 30.0 Nordland 40.1 30 -10.1
16 2 [10-14] 1 [Both genders] 19 [Troms] 25.8 33.2 Troms 25.8 33.2 7.40
17 2 [10-14] 1 [Both genders] 20 [Finnmark] 19.1 21.3 Finnmark 19.1 21.3 2.20
18 2 [10-14] 1 [Both genders] 50 [Trøndelag] 25.0 36.9 Trøndelag 25 37 12
I've tried two strategies for problem 1:
library("tidyverse")
library("ggalt")
fig2 <- ggplot(ADHD_med_2010_2018_wide, aes(x=adhd_2010, xend=adhd_2018, y=county_label, group=county_label)) +
#create a thick line between x and xend instead of using defaut
#provided by geom_dubbell
geom_segment(aes(x=adhd_2010,
xend=adhd_2018,
y=county_label,
yend=county_label),
color="#b2b2b2", size=1.5)+
geom_dumbbell(color="light blue",
size_x=3.5,
size_xend = 3.5,
#Note: there is no US:'color' for UK:'colour'
# in geom_dumbbel unlike standard geoms in ggplot()
colour_x="forestgreen", # green = 2010
colour_xend = "red")+ # red = 2018
labs(x=NULL, y=NULL,
title="Dumbbell Chart",
subtitle="Change in prescription rate: 2010 vs 2018")+
geom_text(color="black", size=2, hjust=-0.5,
aes(x=adhd_2010, label=adhd_2010))+
geom_text(aes(x=adhd_2018, label=adhd_2018),
color="black", size=2, hjust=1.5)
fig2
Which gives a plot without ordering or values presented in a good way:
To correct ordering, I tried following the strategy provided in the post linked above:
library(dplyr)
ADHD_med_2010_2018_wide%>%
mutate(difference = abs(adhd_2018-adhd_2010)) %>% #creates the variable of differences
top_n(18, wt = difference) %>% # Choose the rows with top 20 difference
ggplot() +
aes(x=adhd_2010, xend=adhd_2018, y=reorder(county_label, difference),
group=county_label) + #reorder the labels by descending difference value
geom_dumbbell(color="light blue",
size_x=3.5,
size_xend = 3.5,
#Note: there is no US:'color' for UK:'colour'
# in geom_dumbbel unlike standard geoms in ggplot()
colour_x="forestgreen", # green = 2010
colour_xend = "red")+ # red = 2018
labs(x=NULL, y=NULL,
title="Dumbbell Chart",
subtitle="Change in prescription rate: 2010 vs 2018")+
geom_text(color="black", size=2, hjust=-0.5,
aes(x=adhd_2010, label=adhd_2010))+
geom_text(aes(x=adhd_2018, label=adhd_2018),
color="black", size=2, hjust=1.5)
This still does not give a plot with a nice ordering, although it seem to order the difference (and there's still the issue with value labels):
Hopefully some of you may have input on these issues.
Data to copy:
> dput(head(ADHD_med_2010_2018_wide, 18))
structure(list(age = structure(c(2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2), label = "Age groups", labels = c(`5-9` = 1,
`10-14` = 2, `15-19` = 3, `20-24` = 4, `25-29` = 5, `30-34` = 6,
`All ages` = 7), class = "haven_labelled"), gender = structure(c(1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), label = "Gender", labels = c(`Both genders` = 1,
Female = 2, Male = 3), class = "haven_labelled"), county = structure(c(1,
2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 18, 19, 20, 50), labels = c(Østfold = 1,
Akershus = 2, Oslo = 3, Hedmark = 4, Oppland = 5, Buskerud = 6,
Vestfold = 7, Telemark = 8, `Aust-Agder` = 9, `Vest-Agder` = 10,
Rogaland = 11, Hordaland = 12, `Sogn og Fjordane` = 14, `Møre og Romsdal` = 15,
Nordland = 18, Troms = 19, Finnmark = 20, Trøndelag = 50, `Hele landet` = 99
), class = "haven_labelled"), adhd_pr_1000_2010 = c(32.1488990783691,
20.5894756317139, 17.2119483947754, 41.6982574462891, 24.8543014526367,
26.7194156646729, 28.1328239440918, 29.2480430603027, 34.8775291442871,
17.3759765625, 29.4698066711426, 21.340311050415, 21.3308296203613,
27.0334072113037, 40.1140670776367, 25.7862873077393, 19.1311283111572,
25.0325565338135), adhd_pr_1000_2018 = c(24.2834396362305, 23.0037822723389,
33.9068183898926, 30.8641967773438, 39.0195579528809, 36.7909698486328,
27.0642204284668, 24.6901988983154, 39.1978950500488, 23.8095245361328,
13.8218154907227, 14.4400091171265, 39.7175636291504, 18.5994052886963,
29.9642810821533, 33.1638412475586, 21.2596340179443, 36.9249382019043
), county_label = structure(18:1, .Label = c("Trøndelag", "Finnmark",
"Troms", "Nordland", "Møre og Romsdal", "Sogn og Fjordane", "Hordaland",
"Rogaland", "Vest-Agder", "Aust-Agder", "Telemark", "Vestfold",
"Buskerud", "Oppland", "Hedmark", "Oslo", "Akershus", "Østfold"
), class = "factor"), adhd_2010 = c(32.0999984741211, 20.6000003814697,
17.2000007629395, 41.7000007629395, 24.8999996185303, 26.7000007629395,
28.1000003814697, 29.2000007629395, 34.9000015258789, 17.3999996185303,
29.5, 21.2999992370605, 21.2999992370605, 27, 40.0999984741211,
25.7999992370605, 19.1000003814697, 25), adhd_2018 = c(24.2999992370605,
23, 33.9000015258789, 30.8999996185303, 39, 36.7999992370605,
27, 24.7000007629395, 39.2000007629395, 23.7999992370605, 13.8000001907349,
14.3999996185303, 39.7000007629395, 18.6000003814697, 30, 33.2000007629395,
21.2999992370605, 37), diff = c(-7.79999923706055, 2.39999961853027,
16.7000007629395, -10.8000011444092, 14.1000003814697, 10.0999984741211,
-1.10000038146973, -4.5, 4.29999923706055, 6.39999961853027,
-15.6999998092651, -6.89999961853027, 18.4000015258789, -8.39999961853027,
-10.0999984741211, 7.40000152587891, 2.19999885559082, 12)), row.names = c(NA,
-18L), class = c("tbl_df", "tbl", "data.frame"))