0

I don't fully understand how ggplot2 legends work: I have these few lines of code, and I can't get the plot to show a legend.

Here is the dataset:

# Example dataset in dput()-style structure() form: 19 rows, one per
# five-year period ("quinquennio").
# Columns:
#   quinquennio        period as "start-end" text
#   primo_anno         integer count for the first year of the period
#   quinto_anno        integer count for the fifth (last) year of the period
#   quinquennio_ok     same period with "\n" instead of "-" (two-line axis label)
#   primo_anno_label   primo_anno abbreviated as text, e.g. "588k"
#   quinto_anno_label  quinto_anno abbreviated as text, e.g. "372k"
# The object carries the classes tbl_df/tbl/data.frame and a `spec` attribute
# of class col_spec (presumably left over from a readr import -- not used by
# the plotting code).
dati <-
structure(list(quinquennio = c("1995-2000", "1996-2001", "1997-2002", 
"1998-2003", "1999-2004", "2000-2005", "2001-2006", "2002-2007", 
"2003-2008", "2004-2009", "2005-2010", "2006-2011", "2007-2012", 
"2008-2013", "2009-2014", "2010-2015", "2011-2016", "2012-2017", 
"2013-2018"), primo_anno = c(588402L, 586231L, 576434L, 562444L, 
585496L, 585351L, 593010L, 617309L, 620897L, 613388L, 616645L, 
627166L, 618343L, 604995L, 597915L, 598747L, 614302L, 610468L, 
612675L), quinto_anno = c(372728L, 380211L, 387806L, 393974L, 
401984L, 394144L, 396725L, 413596L, 417736L, 424143L, 426651L, 
431424L, 427015L, 425553L, 430832L, 435158L, 452568L, 456038L, 
461120L), quinquennio_ok = c("1995\n2000", "1996\n2001", "1997\n2002", 
"1998\n2003", "1999\n2004", "2000\n2005", "2001\n2006", "2002\n2007", 
"2003\n2008", "2004\n2009", "2005\n2010", "2006\n2011", "2007\n2012", 
"2008\n2013", "2009\n2014", "2010\n2015", "2011\n2016", "2012\n2017", 
"2013\n2018"), primo_anno_label = c("588k", "586k", "576k", "562k", 
"585k", "585k", "593k", "617k", "620k", "613k", "616k", "627k", 
"618k", "604k", "597k", "598k", "614k", "610k", "612k"), quinto_anno_label = c("372k", 
"380k", "387k", "393k", "401k", "394k", "396k", "413k", "417k", 
"424k", "426k", "431k", "427k", "425k", "430k", "435k", "452k", 
"456k", "461k")), .Names = c("quinquennio", "primo_anno", "quinto_anno", 
"quinquennio_ok", "primo_anno_label", "quinto_anno_label"), row.names = c(NA, 
-19L), spec = structure(list(cols = structure(list(quinquennio = structure(list(), class = c("collector_character", 
"collector")), primo_anno = structure(list(), class = c("collector_integer", 
"collector")), quinto_anno = structure(list(), class = c("collector_integer", 
"collector"))), .Names = c("quinquennio", "primo_anno", "quinto_anno"
)), default = structure(list(), class = c("collector_guess", 
"collector"))), .Names = c("cols", "default"), class = "col_spec"), class = c("tbl_df", 
"tbl", "data.frame"))

And here is the code:

# Plot of first-year vs fifth-year counts per five-year period.
# NOTE: every colour here is *set* as a constant outside aes(), so no colour
# scale exists and no legend is drawn, even with show.legend = TRUE.
ggplot(dati, aes(x = quinquennio_ok)) +
  # Text labels: above the first-year point, below the fifth-year point.
  geom_text(aes(y = primo_anno, label = primo_anno_label),
            vjust = -1.1, colour = "dark blue") +
  geom_text(aes(y = quinto_anno, label = quinto_anno_label),
            vjust = 2, colour = "dark red") +
  # Vertical connector, shortened at both ends so it does not touch the dots.
  geom_segment(aes(xend = quinquennio_ok,
                   y = primo_anno - 4000,
                   yend = quinto_anno + 10000),
               colour = "dark blue", size = 1, alpha = 0.4) +
  # First-year counts.
  geom_point(aes(y = primo_anno),
             colour = "dark blue", size = 4, alpha = 0.5,
             show.legend = TRUE) +
  # Arrowhead: shape 25 is a filled, downward-pointing triangle.
  geom_point(aes(y = quinto_anno + 8000),
             shape = 25, colour = "dark blue", fill = "dark blue",
             size = 3, alpha = 0.5) +
  # Fifth-year counts.
  geom_point(aes(y = quinto_anno),
             colour = "dark red", size = 3, alpha = 0.5,
             show.legend = TRUE) +
  scale_y_continuous(
    breaks = seq(300000, 700000, 50000),
    limits = c(350000, 650000),
    labels = scales::unit_format(unit = "k", scale = .001, sep = "")
  ) +
  labs(
    x = "Quinquenni",
    y = "Studenti iscritti",
    title = "Dispersione scolastica in Italia",
    subtitle = "Dal 1995 al 2018",
    caption = "Fonte: Report TuttoScuola 2018"
  ) +
  theme_minimal() +
  theme(
    legend.position = c(.5, .5),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line.y = element_line(color = "light grey"),
    panel.grid.minor.y = element_line(color = "light grey")
  )

This is what I get:

enter image description here

I'd just like to have a legend explaining that the blue dots are the number of students enrolled in the first year of the five-year period, and the red ones are the students still enrolled in the last year.

If I'm way too far from the solution, I'd appreciate some reference links to study more about ggplot2 and legends.

Michael Harper
  • 14,721
  • 2
  • 60
  • 84
Federico
  • 3
  • 3
  • Thanks for sharing your code, could you add a small example dataset as well? – kath Sep 20 '18 at 08:20
  • Possible duplicate of [Add legend to manually added lines using ggplot](https://stackoverflow.com/questions/26204758/add-legend-to-manually-added-lines-using-ggplot) – kath Sep 20 '18 at 08:26
  • Added head of the dataset, thanks @kath – Federico Sep 20 '18 at 08:29
  • Please share sample of your data using `dput()` (not `str` or `head` or picture/screenshot) so others can help. See more here https://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example?rq=1 – Tung Sep 20 '18 at 08:37
  • @kath I'm trying to apply the other post's solution, but it doesn't seem to work (or I'm doing something wrong, which is very likely) – Federico Sep 20 '18 at 09:03
  • @Federico I hope my answer helps to clarify how to adapt this to your case (see the second part) – kath Sep 20 '18 at 10:51

1 Answer

0

There are several ways to solve this, and I'll give you two. One option is to transform your data from wide format to long format, so that you have one row for each observation:

library(tidyverse)

# Reshape from wide to long: the two count columns are stacked into `value`,
# and `year` records which column each value came from.
# (gather() is kept for compatibility with older tidyr; from tidyr 1.0.0 on,
# pivot_longer() is the recommended replacement, but it returns the rows in a
# different order.)
dati_long <- gather(
  select(dati, quinquennio, quinquennio_ok, primo_anno, quinto_anno),
  key = year, value = value, primo_anno, quinto_anno
)

# Text label for each value: whole thousands followed by "k".
dati_long <- mutate(dati_long, label = paste0(floor(value / 1000), "k"))

dati_long
# A tibble: 38 x 5
#    quinquennio quinquennio_ok year        value label
#    <chr>       <chr>          <chr>       <int> <chr>
#  1 1995-2000   "1995\n2000"   primo_anno 588402 588k 
#  2 1996-2001   "1996\n2001"   primo_anno 586231 586k 
#  3 1997-2002   "1997\n2002"   primo_anno 576434 576k 
#  4 1998-2003   "1998\n2003"   primo_anno 562444 562k 
#  5 1999-2004   "1999\n2004"   primo_anno 585496 585k 
#  6 2000-2005   "2000\n2005"   primo_anno 585351 585k 
#  7 2001-2006   "2001\n2006"   primo_anno 593010 593k 
#  8 2002-2007   "2002\n2007"   primo_anno 617309 617k 
#  9 2003-2008   "2003\n2008"   primo_anno 620897 620k 
# 10 2004-2009   "2004\n2009"   primo_anno 613388 613k 
# ... with 28 more rows

Then you can assign the color depending on the year in your ggplot:

# Option 1: plot the long-format data with colour *mapped* to `year` inside
# aes(); a mapped aesthetic gets a scale, and the scale is what produces the
# legend.
ggplot(dati_long,
       aes(x = quinquennio_ok, y = value, label = label, color = year)) +
  # Dots coloured by `year`; this layer supplies the legend keys.
  geom_point(size = 3) +
  # Value labels: above the first-year dot, below the fifth-year dot.
  # Kept out of the legend so the keys show only the points.
  geom_text(aes(vjust = ifelse(year == "primo_anno", -1.1, 2)),
            show.legend = FALSE) +
  # One connector per period, shortened at both ends. The constant colour
  # overrides the inherited `color = year` mapping for this layer.
  geom_line(aes(group = quinquennio_ok,
                y = ifelse(year == "primo_anno", value - 4000, value + 10000)),
            color = "dark blue", size = 1, alpha = .4) +
  # Arrowhead just above each fifth-year dot (shape 25 is a filled,
  # downward-pointing triangle). `data` is a function applied to the plot data.
  geom_point(data = . %>% filter(year == "quinto_anno"),
             mapping = aes(x = quinquennio_ok, y = value + 8000),
             size = 3, alpha = .5, fill = "dark blue", colour = "dark blue",
             shape = 25) +
  # Named values tie each colour to its level explicitly instead of relying
  # on the alphabetical order of the levels.
  scale_color_manual(values = c(primo_anno = "dark blue",
                                quinto_anno = "dark red")) +
  theme_minimal() +
  theme(legend.position = c(.5, .5),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        axis.line.y = element_line(color = "light grey"),
        panel.grid.minor.y = element_line(color = "light grey")) +
  scale_y_continuous(breaks = seq(300000, 700000, 50000),
                     limits = c(350000, 650000),
                     labels = scales::unit_format(unit = "k", scale = .001,
                                                  sep = "")) +
  labs(x = "Quinquenni", y = "Studenti iscritti",
       title = "Dispersione scolastica in Italia",
       subtitle = "Dal 1995 al 2018",
       caption = "Fonte: Report TuttoScuola 2018")

enter image description here

The second option works without transforming your data: you map a dummy aesthetic in each layer that should appear in the legend:

# Option 2: keep the wide data and create the legend with "dummy" colour
# mappings -- a constant string inside aes() becomes a level of the colour
# scale and is shown as a legend entry.
ggplot(dati) +
  # Value labels; vjust is a constant, so it is set outside aes().
  geom_text(aes(x = quinquennio_ok, y = primo_anno,
                label = primo_anno_label),
            vjust = -1.1, color = "dark blue") +
  geom_text(aes(x = quinquennio_ok, y = quinto_anno,
                label = quinto_anno_label),
            vjust = 2, color = "dark red") +
  # Vertical connector, shortened at both ends.
  geom_segment(aes(x = quinquennio_ok, y = primo_anno - 4000,
                   xend = quinquennio_ok, yend = quinto_anno + 10000),
               colour = "dark blue", size = 1, alpha = .4) +
  ## The color is added inside of aes and given a name as a string
  ## which will be displayed in the legend
  geom_point(aes(x = quinquennio_ok, y = primo_anno, color = "primo"),
             size = 4, alpha = .5, show.legend = TRUE) +
  # Arrowhead (shape 25 is a filled, downward-pointing triangle); its colour
  # is a constant, so it adds nothing to the legend.
  geom_point(aes(x = quinquennio_ok, y = quinto_anno + 8000),
             size = 3, alpha = .5, fill = "dark blue",
             colour = "dark blue", shape = 25) +
  ## The color is added inside of aes and given a name as a string
  ## which will be displayed in the legend
  geom_point(aes(x = quinquennio_ok, y = quinto_anno, color = "quinto"),
             size = 3, alpha = .5,
             show.legend = TRUE) +
  ## Here the colors are defined manually and a name for the legend is given.
  ## The values are named after the dummy strings used above, so each colour
  ## stays with its label even if the labels are renamed or reordered.
  scale_color_manual(name = "year",
                     values = c(primo = "dark blue", quinto = "dark red")) +
  theme_minimal() +
  theme(legend.position = c(.5, .5),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        axis.line.y = element_line(color = "light grey"),
        panel.grid.minor.y = element_line(color = "light grey")) +
  scale_y_continuous(breaks = seq(300000, 700000, 50000),
                     limits = c(350000, 650000),
                     labels = scales::unit_format(unit = "k", scale = .001,
                                                  sep = "")) +
  labs(x = "Quinquenni",
       y = "Studenti iscritti",
       title = "Dispersione scolastica in Italia",
       subtitle = "Dal 1995 al 2018",
       caption = "Fonte: Report TuttoScuola 2018")
kath
  • 7,624
  • 17
  • 32