0

With this code

# Stacked histogram of run distances (1 km bins), filled by pace category,
# with every stacked segment labelled by its count.
ggplot(dfrunning, aes(x = distance / 1000)) +
  geom_histogram(aes(fill = catpace), binwidth = 1) +
  # Re-bin with the same binwidth so the labels line up with the bars;
  # `group = catpace` makes the text stack per pace category, like the fill.
  # (ggplot2 >= 3.3.0 prefers `after_stat(count)` over `..count..`.)
  stat_bin(
    aes(label = ..count.., group = catpace),
    binwidth = 1, geom = "text", colour = "white", size = 3.5,
    position = position_stack(vjust = 0.5)
  ) +
  # The x aesthetic is in km, so the breaks have to be built in km as well;
  # using the maximum in metres would create thousands of unused breaks.
  scale_x_continuous(
    breaks = seq(0, max(dfrunning$distance, na.rm = TRUE) / 1000, 1)
  ) +
  labs(
    title = "Running distribution", x = "Distance in km",
    y = "Count", fill = "Pace in sec/km"
  )

I produce this image: enter image description here

I would prefer not to see the raw count for each bin; instead, every bin should be scaled to 100%, with the fill and the text labels rescaled accordingly. The data frame looks like this:

> dfrunning
# A tibble: 2,201 x 11
   date       time   type    distance duration paceInMin paceInSec latitude longitude catpace     catdistance      
   <date>     <time> <chr>      <dbl>    <dbl> <time>        <int>    <dbl>     <dbl> <fct>       <fct>            
 1 2012-04-16 10:24  running    13680     4192 05:06           306     50.8      6.10 (300,330]   (1.3e+04,1.4e+04]
 2 2012-04-18 10:47  running     7239     2115 04:52           292     50.8      6.10 (270,300]   (7e+03,8e+03]    
 3 2012-04-22 14:09  running    28536    10571 06:10           370     50.8      6.10 (360,1e+04] (2.8e+04,2.9e+04]
 4 2012-05-05 13:11  running    16168     7308 07:31           451     50.7      6.08 (360,1e+04] (1.6e+04,1.7e+04]
 5 2012-05-06 13:39  running    25033     9180 06:06           366     50.8      6.10 (360,1e+04] (2.5e+04,2.6e+04]

IMO, this question is different from "Create stacked barplot where each stack is scaled to sum to 100%" since I do not have the counts of every group explicitly.

EDIT: As pointed out by jaySf, here is the output of dput(head(dfrunning,50))

structure(list(date = structure(c(15446, 15448, 15452, 15465, 
15466, 15483, 15506, 15506, 15561, 15566, 15566, 15591, 15598, 
15599, 15602, 15605, 15606, 15608, 15611, 15612, 15613, 15614, 
15615, 15616, 15617, 15618, 15618, 15619, 15747, 15621, 15621, 
15622, 15623, 15627, 15752, 15769, 15770, 15772, 15774, 15775, 
15776, 15778, 15780, 15781, 15782, 15782, 15783, 15783, 15785, 
15785), class = "Date"), time = structure(c(37475, 38822, 50949, 
47508, 49193, 55739, 56611, 59442, 56185, 69657, 72709, 36513, 
64961, 51622, 49059, 59999, 50660, 72043, 65558, 41359, 38752, 
68144, 70312, 68611, 64509, 61189, 68135, 34764, 63827, 59209, 
69285, 69202, 69029, 41600, 31455, 61002, 61247, 58883, 47413, 
63764, 64603, 60993, 34642, 37138, 62160, 65013, 61298, 63556, 
61877, 65543), class = c("hms", "difftime"), units = "secs"), 
    type = c("running", "running", "running", "running", "running", 
    "running", "running", "running", "running", "running", "running", 
    "running", "running", "running", "running", "running", "running", 
    "running", "running", "running", "running", "running", "running", 
    "running", "running", "running", "running", "running", "running", 
    "running", "running", "running", "running", "running", "running", 
    "running", "running", "running", "running", "running", "running", 
    "running", "running", "running", "running", "running", "running", 
    "running", "running", "running"), distance = c(13680, 7238.54607310699, 
    28535.7961841139, 16168.3259995435, 25033, 10714.1336730768, 
    2363.75157921817, 8432.38615603382, 13261.9315631379, 14914.0942764589, 
    4064.49780742219, 8958.01394358889, 8416.04241820714, 27086.4898199381, 
    8454.4109033314, 13078.5350969731, 9976.05213811295, 7522.5914405498, 
    20333.7962161682, 13657.8724960625, 19632.1287324509, 19161.0107979676, 
    10175, 18363.9547260094, 45.9799995422363, 17249.8509124987, 
    19079.0453104679, 9720.46644444582, 13963.9852885433, 19564.2037933423, 
    9233.03482250782, 14047.3764062267, 17000.9929225885, 19796.3029324504, 
    7986.10636548276, 16579.832919954, 9793.81660451401, 11942.1530615798, 
    19049.4113915166, 8852.04351847768, 7891.32210952351, 5995.21855763869, 
    1301.91869595747, 13996.0654474524, 2100.09008789062, 9869.70660238926, 
    2028.71832491649, 7433.78777880617, 15151.3936450139, 11741.0316277532
    ), duration = c(4192, 2115, 10571, 7308, 9180, 3292, 760, 
    2640, 4436, 6646, 1371, 3405, 2438, 8477, 2588, 3968, 3271, 
    2826, 5652, 4330, 6410, 6255, 3682, 6193, 13, 83077, 6689, 
    3954, 4963, 106805, 2846, 6340, 5348, 6123, 3605, 88125, 
    3166, 3518, 14506, 3118, 2780, 86832, 633, 4388, 662, 2796, 
    581, 2191, 86195, 3629), paceInMin = structure(c(18360, 17520, 
    22200, 27060, 21960, 18420, 19260, 18780, 20040, 26700, 20220, 
    22800, 17340, 18720, 18360, 18180, 19620, 22500, 16620, 19020, 
    19560, 19560, 21660, 20220, 16920, 72960, 21000, 24360, 21300, 
    NA, 18480, 27060, 18840, 18540, 27060, NA, 19380, 17640, 
    45660, 21120, 21120, 4980, 29160, 18780, 18900, 16980, 17160, 
    17640, NA, 18540), class = c("hms", "difftime"), units = "secs"), 
    paceInSec = c(306L, 292L, 370L, 451L, 366L, 307L, 321L, 313L, 
    334L, 445L, 337L, 380L, 289L, 312L, 306L, 303L, 327L, 375L, 
    277L, 317L, 326L, 326L, 361L, 337L, 282L, 1216L, 350L, 406L, 
    355L, 1859L, 308L, 451L, 314L, 309L, 451L, 1715L, 323L, 294L, 
    761L, 352L, 352L, 83L, 486L, 313L, 315L, 283L, 286L, 294L, 
    2088L, 309L), latitude = c(50.78088236, 50.78210075, 50.77468025, 
    50.74850298, 50.77482007, 50.78003285, 50.78238624, 50.78864819, 
    51.33017446, 50.77988517, 50.74921084, 51.32995008, 51.32999836, 
    51.33013314, 51.32992619, 48.71562467, 48.71430603, 48.7143813, 
    48.714214, 48.71429463, 48.717048, 48.71553859, 48.7142808, 
    48.71094162, 48.71536257, 48.71526475, 48.71454718, 48.71594159, 
    48.7144186, 48.71531186, 48.71480333, 48.70992154, 48.70286641, 
    48.71461113, 48.71444383, 48.71446428, 48.7146807, 48.71469336, 
    48.72847723, 48.71530138, 48.70127678, 48.70118726, 48.7013119, 
    48.73173444, 48.71487223, 48.72272649, 48.71518764, 48.72266807, 
    48.71527171, 48.71515763), longitude = c(6.09665447, 6.09782727, 
    6.09629815, 6.08372496, 6.09631483, 6.10046044, 6.09614795, 
    6.07149736, 7.86466297, 6.10240906, 6.08444153, 7.86465433, 
    7.86348933, 7.86398814, 7.86355178, 11.48824135, 11.48822618, 
    11.48825015, 11.48808285, 11.48821939, 11.4908933, 11.49234362, 
    11.48826499, 11.49619108, 11.49016634, 11.49094141, 11.48822375, 
    11.49247371, 11.48828653, 11.48861426, 11.48820028, 11.49726229, 
    11.5088289, 11.48817371, 11.48823674, 11.48811696, 11.48788319, 
    11.48822375, 11.56643034, 11.48987599, 11.50984076, 11.50986775, 
    11.50983674, 11.56601745, 11.48822492, 11.51100417, 11.48894744, 
    11.51097458, 11.48799551, 11.48825208), catpace = c("(300,330]", 
    "(270,300]", "(360,1e+04]", "(360,1e+04]", "(360,1e+04]", 
    "(300,330]", "(300,330]", "(300,330]", "(330,360]", "(360,1e+04]", 
    "(330,360]", "(360,1e+04]", "(270,300]", "(300,330]", "(300,330]", 
    "(300,330]", "(300,330]", "(360,1e+04]", "(270,300]", "(300,330]", 
    "(300,330]", "(300,330]", "(360,1e+04]", "(330,360]", "(270,300]", 
    "(360,1e+04]", "(330,360]", "(360,1e+04]", "(330,360]", "(360,1e+04]", 
    "(300,330]", "(360,1e+04]", "(300,330]", "(300,330]", "(360,1e+04]", 
    "(360,1e+04]", "(300,330]", "(270,300]", "(360,1e+04]", "(330,360]", 
    "(330,360]", "(0,180]", "(360,1e+04]", "(300,330]", "(300,330]", 
    "(270,300]", "(270,300]", "(270,300]", "(360,1e+04]", "(300,330]"
    ), catdistance = c("(1.3e+04,1.4e+04]", "(7e+03,8e+03]", 
    "(2.8e+04,2.9e+04]", "(1.6e+04,1.7e+04]", "(2.5e+04,2.6e+04]", 
    "(1e+04,1.1e+04]", "(2e+03,3e+03]", "(8e+03,9e+03]", "(1.3e+04,1.4e+04]", 
    "(1.4e+04,1.5e+04]", "(4e+03,5e+03]", "(8e+03,9e+03]", "(8e+03,9e+03]", 
    "(2.7e+04,2.8e+04]", "(8e+03,9e+03]", "(1.3e+04,1.4e+04]", 
    "(9e+03,1e+04]", "(7e+03,8e+03]", "(2e+04,2.1e+04]", "(1.3e+04,1.4e+04]", 
    "(1.9e+04,2e+04]", "(1.9e+04,2e+04]", "(1e+04,1.1e+04]", 
    "(1.8e+04,1.9e+04]", "(0,1e+03]", "(1.7e+04,1.8e+04]", "(1.9e+04,2e+04]", 
    "(9e+03,1e+04]", "(1.3e+04,1.4e+04]", "(1.9e+04,2e+04]", 
    "(9e+03,1e+04]", "(1.4e+04,1.5e+04]", "(1.7e+04,1.8e+04]", 
    "(1.9e+04,2e+04]", "(7e+03,8e+03]", "(1.6e+04,1.7e+04]", 
    "(9e+03,1e+04]", "(1.1e+04,1.2e+04]", "(1.9e+04,2e+04]", 
    "(8e+03,9e+03]", "(7e+03,8e+03]", "(5e+03,6e+03]", "(1e+03,2e+03]", 
    "(1.3e+04,1.4e+04]", "(2e+03,3e+03]", "(9e+03,1e+04]", "(2e+03,3e+03]", 
    "(7e+03,8e+03]", "(1.5e+04,1.6e+04]", "(1.1e+04,1.2e+04]"
    )), row.names = c(NA, -50L), class = c("tbl_df", "tbl", "data.frame"
))

Which yields this plot: enter image description here

Thomas
  • 1,199
  • 1
  • 14
  • 29
  • 3
    Would you mind posting your example data with `dput(dfrunning)` or `dput(head(dfrunning))` instead? It is a pain for users who want to help you to cobble together data from such pasted output. For help on this, read: https://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example/5963610#5963610 Cheers! – jay.sf Jul 24 '18 at 08:00
  • @jaySf, thanks! I updated the working example via `dput(head(dfrunning,50))` – Thomas Jul 24 '18 at 11:53

1 Answers1

2

This can be accomplished by using the position = "fill" argument. To achieve the correct spacing in the text, you also need to change the position argument in the stat_bin layer. This should work:

library(scales) # provides percent() for the y-axis labels

# Same histogram as in the question, but with every 1 km bin rescaled to a
# total height of 1 (position = "fill"); the text layer uses position_fill()
# so the count labels stay centred inside the rescaled segments.
ggplot(dfrunning, aes(x = distance / 1000)) +
  geom_histogram(aes(fill = catpace), binwidth = 1, position = "fill") +
  stat_bin(
    aes(label = ..count.., group = catpace),
    binwidth = 1, geom = "text", colour = "white", size = 3.5,
    position = position_fill(vjust = 0.5)
  ) +
  # The x aesthetic is in km, so the breaks have to be built in km as well;
  # using the maximum in metres would create thousands of unused breaks.
  scale_x_continuous(
    breaks = seq(0, max(dfrunning$distance, na.rm = TRUE) / 1000, 1)
  ) +
  # Pass the formatter itself rather than a fixed label vector: a fixed
  # vector only works while the computed breaks happen to be exactly
  # 0, 0.25, 0.5, 0.75 and 1.
  scale_y_continuous(labels = percent) +
  labs(
    title = "Running distribution", x = "Distance in km",
    y = "Percentage", fill = "Pace in sec/km"
  )

Which creates this plot:

Plot1


EDIT: Percentage-labels instead of counts

I don't know how to get the percentages into the graph using the approach mentioned above. It may not be possible, or I may simply not know how to do it. Either way, if you want percentage labels in the graph, I recommend preprocessing your data, e.g. like this:

# Pre-compute, for every 1 km distance bin, each pace category's share of the
# runs in that bin, then draw the shares as stacked bars with "%" labels.
dfrunning %>%
  select(distance, catpace) %>%
  mutate(dist = round(distance / 1000)) %>% # nearest whole km
  count(dist, catpace) %>%                  # runs per bin and category -> n
  group_by(dist) %>%
  mutate(pct = n / sum(n) * 100) %>%        # share within the bin
  ungroup() %>%
  ggplot(aes(x = dist, y = pct)) +
    geom_col(aes(fill = catpace)) +
    # `group = catpace` is required here: the text layer has no fill
    # aesthetic, so without an explicit group the labels are stacked in row
    # order instead of in the same order as the bar segments.
    geom_text(aes(label = paste0(round(pct, 0), "%"), group = catpace),
              colour = "white", size = 3.5, angle = 90,
              position = position_stack(vjust = 0.5)) +
  labs(title = "Running distribution",
       x = "Distance in km", y = "Percentage",
       fill = "Pace in sec/km")

Which gives you the following graph (I rotated the labels to make them fit using the angle argument):

Plot2

Bart VdW
  • 438
  • 8
  • 17
  • Awesome, thanks! Can you also fix the numbers inside the bars? I.e. not the numbers but the percentages? – Thomas Jul 24 '18 at 12:44
  • well I realised that you probably wanted percentages after posting the answer, but I haven't figured that out yet. if you use `label = ..count../sum(..count..)`, you get the percentage of the total, and not the percentage per bin. I'm looking into it now, but it might be that the only way to achieve this is by preprocessing the data – Bart VdW Jul 24 '18 at 12:55
  • Voila, I couldn't do it directly in the plot, so I processed the data first. Hopefully this helps – Bart VdW Jul 24 '18 at 15:29
  • Perfect! Thanks – Thomas Jul 24 '18 at 15:42