How to creat a bar graph of microbiota data with one color for higher taxonomic rank and gradient color

Question

I have a Phyloseq object with my OTU table and TAX table. I would like to create a bar plot, at for instance family level, but families belonging to the same Phylum will be displayed with the same colour and be distinguished by a gradient of this color.

The final result should be similar to this:

Good Bar plot

I converted my phyloseq object into a dataframe using psmelt() and tried to adapt the code from this post : Stacked barplot with colour gradients for each bar

But I'm currently unable to create a correct graph.

library(phyloseq)
library(ggplot2)

df <- psmelt(GlobalPatterns)
df$group <- paste0(df$Phylum, "-", df$Family, sep = "")
colours <-ColourPalleteMulti(df, "Phylum", "Family")
ggplot(df, aes(Sample)) + 
  geom_bar(aes(fill = group), colour = "grey") +
  scale_fill_manual("Subject", values=colours, guide = "none")

Erreur : Insufficient values in manual scale. 395 needed but only 334 provided.

Thank you in advance for any help !

Edit: here the dput of the data

    dput(head(df, 10))
structure(list(OTU = c("549656", "279599", "549656", "549656", 
"360229", "331820", "94166", "331820", "329744", "189047"), Sample = c("AQC4cm", 
"LMEpi24M", "AQC7cm", "AQC1cm", "M31Tong", "M11Fcsw", "M31Tong", 
"M31Fcsw", "SLEpi20M", "TS29"), Abundance = c(1177685, 914209, 
711043, 554198, 540850, 452219, 396201, 354695, 323914, 251215
), X.SampleID = structure(c(2L, 10L, 3L, 1L, 16L, 11L, 16L, 14L, 
20L, 26L), .Label = c("AQC1cm", "AQC4cm", "AQC7cm", "CC1", "CL3", 
"Even1", "Even2", "Even3", "F21Plmr", "LMEpi24M", "M11Fcsw", 
"M11Plmr", "M11Tong", "M31Fcsw", "M31Plmr", "M31Tong", "NP2", 
"NP3", "NP5", "SLEpi20M", "SV1", "TRRsed1", "TRRsed2", "TRRsed3", 
"TS28", "TS29"), class = "factor"), Primer = structure(c(14L, 
11L, 15L, 13L, 9L, 5L, 9L, 4L, 12L, 23L), .Label = c("ILBC_01", 
"ILBC_02", "ILBC_03", "ILBC_04", "ILBC_05", "ILBC_07", "ILBC_08", 
"ILBC_09", "ILBC_10", "ILBC_11", "ILBC_13", "ILBC_15", "ILBC_16", 
"ILBC_17", "ILBC_18", "ILBC_19", "ILBC_20", "ILBC_21", "ILBC_22", 
"ILBC_23", "ILBC_24", "ILBC_25", "ILBC_26", "ILBC_27", "ILBC_28", 
"ILBC_29"), class = "factor"), Final_Barcode = structure(c(14L, 
11L, 15L, 13L, 9L, 5L, 9L, 4L, 12L, 23L), .Label = c("AACGCA", 
"AACTCG", "AACTGT", "AAGAGA", "AAGCTG", "AATCGT", "ACACAC", "ACACAT", 
"ACACGA", "ACACGG", "ACACTG", "ACAGAG", "ACAGCA", "ACAGCT", "ACAGTG", 
"ACAGTT", "ACATCA", "ACATGA", "ACATGT", "ACATTC", "ACCACA", "ACCAGA", 
"ACCAGC", "ACCGCA", "ACCTCG", "ACCTGT"), class = "factor"), Barcode_truncated_plus_T = structure(c(6L, 
10L, 8L, 25L, 19L, 9L, 19L, 20L, 14L, 16L), .Label = c("AACTGT", 
"ACAGGT", "ACAGTT", "ACATGT", "ACGATT", "AGCTGT", "ATGTGT", "CACTGT", 
"CAGCTT", "CAGTGT", "CCGTGT", "CGAGGT", "CGAGTT", "CTCTGT", "GAATGT", 
"GCTGGT", "GTGTGT", "TCATGT", "TCGTGT", "TCTCTT", "TCTGGT", "TGATGT", 
"TGCGGT", "TGCGTT", "TGCTGT", "TGTGGT"), class = "factor"), Barcode_full_length = structure(c(4L, 
7L, 3L, 13L, 26L, 8L, 26L, 21L, 2L, 11L), .Label = c("AGAGAGACAGG", 
"AGCCGACTCTG", "ATGAAGCACTG", "CAAGCTAGCTG", "CACGTGACATG", "CATCGACGAGT", 
"CATGAACAGTG", "CGACTGCAGCT", "CGAGTCACGAT", "CTAGCGTGCGT", "CTAGTCGCTGG", 
"GAACGATCATG", "GACCACTGCTG", "GATGTATGTGG", "GCATCGTCTGG", "GCCATAGTGTG", 
"GCTAAGTGATG", "GTACGCACAGT", "GTAGACATGTG", "TAGACACCGTG", "TCGACATCTCT", 
"TCGCGCAACTG", "TCTGATCGAGG", "TGACTCTGCGG", "TGCGCTGAATG", "TGTGGCTCGTG"
), class = "factor"), SampleType = structure(c(3L, 2L, 3L, 3L, 
9L, 1L, 9L, 1L, 2L, 1L), .Label = c("Feces", "Freshwater", "Freshwater (creek)", 
"Mock", "Ocean", "Sediment (estuary)", "Skin", "Soil", "Tongue"
), class = "factor"), Description = structure(c(2L, 10L, 3L, 
1L, 16L, 11L, 16L, 14L, 21L, 25L), .Label = c("Allequash Creek, 0-1cm depth", 
"Allequash Creek, 3-4 cm depth", "Allequash Creek, 6-7 cm depth", 
"Calhoun South Carolina Pine soil, pH 4.9", "Cedar Creek Minnesota, grassland, pH 6.1", 
"Even1", "Even2", "Even3", "F1, Day 1,  right palm, whole body study ", 
"Lake Mendota Minnesota, 24 meter epilimnion ", "M1, Day 1, fecal swab, whole body study ", 
"M1, Day 1, right palm, whole body study ", "M1, Day 1, tongue, whole body study ", 
"M3, Day 1, fecal swab, whole body study", "M3, Day 1, right palm, whole body study", 
"M3, Day 1, tongue, whole body study ", "Newport Pier, CA surface water, Time 1", 
"Newport Pier, CA surface water, Time 2", "Newport Pier, CA surface water, Time 3", 
"Sevilleta new Mexico, desert scrub, pH 8.3", "Sparkling Lake Wisconsin, 20 meter eplimnion", 
"Tijuana River Reserve, depth 1", "Tijuana River Reserve, depth 2", 
"Twin #1", "Twin #2"), class = "factor"), Kingdom = c("Bacteria", 
"Bacteria", "Bacteria", "Bacteria", "Bacteria", "Bacteria", "Bacteria", 
"Bacteria", "Bacteria", "Bacteria"), Phylum = c("Cyanobacteria", 
"Cyanobacteria", "Cyanobacteria", "Cyanobacteria", "Proteobacteria", 
"Bacteroidetes", "Proteobacteria", "Bacteroidetes", "Actinobacteria", 
"Firmicutes"), Class = c("Chloroplast", "Nostocophycideae", "Chloroplast", 
"Chloroplast", "Betaproteobacteria", "Bacteroidia", "Gammaproteobacteria", 
"Bacteroidia", "Actinobacteria", "Clostridia"), Order = c("Stramenopiles", 
"Nostocales", "Stramenopiles", "Stramenopiles", "Neisseriales", 
"Bacteroidales", "Pasteurellales", "Bacteroidales", "Actinomycetales", 
"Clostridiales"), Family = c(NA, "Nostocaceae", NA, NA, "Neisseriaceae", 
"Bacteroidaceae", "Pasteurellaceae", "Bacteroidaceae", "ACK-M1", 
"Ruminococcaceae"), Genus = c(NA, "Dolichospermum", NA, NA, "Neisseria", 
"Bacteroides", "Haemophilus", "Bacteroides", NA, NA), Species = c(NA, 
NA, NA, NA, NA, NA, "Haemophilusparainfluenzae", NA, NA, NA), 
    group = c("Cyanobacteria-NA", "Cyanobacteria-Nostocaceae", 
    "Cyanobacteria-NA", "Cyanobacteria-NA", "Proteobacteria-Neisseriaceae", 
    "Bacteroidetes-Bacteroidaceae", "Proteobacteria-Pasteurellaceae", 
    "Bacteroidetes-Bacteroidaceae", "Actinobacteria-ACK-M1", 
    "Firmicutes-Ruminococcaceae"), group = c("Cyanobacteria-NA", 
    "Cyanobacteria-Nostocaceae", "Cyanobacteria-NA", "Cyanobacteria-NA", 
    "Proteobacteria-Neisseriaceae", "Bacteroidetes-Bacteroidaceae", 
    "Proteobacteria-Pasteurellaceae", "Bacteroidetes-Bacteroidaceae", 
    "Actinobacteria-ACK-M1", "Firmicutes-Ruminococcaceae")), row.names = c(406582L, 
241435L, 406580L, 406574L, 329873L, 300794L, 494797L, 300772L, 
298689L, 114279L), class = "data.frame")

Edit 2: We are on the good way

So, your code seems to work perfectly in term of color but I have some doubts about the values of the bar plot (the percentage for each family).

I plotted a proportional bar plot of the data with this code:

GlobalPatterns_prop = transform_sample_counts(GlobalPatterns, function(x)       100 * x/sum(x)) 
plot_bar(GlobalPatterns_prop , fill = "Phylum")

and obtained this :

If I understand well, using your method a majority of phylum and bar height should be "Others". I did the same with my data and I clearly see a difference in Phylum proportional abundance.

I have for the moment no clue on what is happening...

Welcome to SO! Please provide your `df` using `dput(df)` or `dput(head(df, 10))` if your `df` has many rows. — kmacierzanka, Jun 28 '20 at 20:00

Edward · Answer 1 · 2020-06-29T06:46:46.553

There's a few steps involved.

First, define the "Others".

phylums <- c('Proteobacteria','Bacteroidetes','Firmicutes')

df$Phylum[!df$Phylum %in% phylums] <- "Others"
df$Family[!df$Phylum %in% phylums] <- "Others"

df$Family[df$Phylum=="Proteobacteria" & 
 !df$Family %in% c('Alcaligenaceae','Enterobacteriaceae')] <- "Other Protobacteria"

df$Family[df$Phylum=="Bacteroidetes" &
 !df$Family %in% c('Bacteroidaceae','Rikenellaceae','Porphyromonadaceae')] <- "Other Bacteroidetes"

df$Family[df$Phylum=="Firmicutes" & 
 !df$Family %in% c('Lactobacillaceae','Clostridiaceae','Ruminococcaceae','Lachnospiraceae')] <- "Other Firmicutes"

Then, convert Phylum to a factor so that (1) the "Others" are placed last in the legend and (2) we can reorder the Family variable based on the underlying factor levels of Phylum and whether Family contains "Others". This ensures the colour gradients are correctly assigned.

library(forcats)
library(dplyr)
df2 <- select(df, Sample, Phylum, Family) %>%
  mutate(Phylum=factor(Phylum, levels=c(phylums, "Others")),
         Family=fct_reorder(Family, 10*as.integer(Phylum) + grepl("Others", Family))) %>%

  group_by(Family) %>%  # For this dataset only
  sample_n(100)         # Otherwise, unnecessary

The last two lines are extra that's not needed for real data, but here I've selected a sample of 100 within each Family so that the graph looks prettier. Otherwise, there are too many "Others" and in the graph, they swamp the others.

The custom function to create the colour gradients can be found in the accepted answer to this question (as you mentioned).

colours <- ColourPalleteMulti(df2, "Phylum", "Family")

Finally, instead of your group variable, we can use the Family variable so that the labelling is concise.

library(ggplot2)
ggplot(df2, aes(x=Sample, fill = Family)) + 
  geom_bar(position="fill", colour = "grey") +  # Stacked 100% barplot
  scale_fill_manual("", values=colours) +
  theme(axis.text.x=element_text(angle=90, vjust=0.5)) +  # Vertical x-axis tick labels
  scale_y_continuous(labels = scales::percent_format()) +
  labs(y="Relative abundance")

I couldn't manage to add the Phylum labels on the right of the legend. Perhaps you can add them manually.

Thanks a lot ! I succeed to reproduce your code and obtained exactly the same thing. But when I try to reproduce with my data, which are very similar, all bars are the same. The df2 object is similar : 3 characters columns - Sample, Phylum, Family. When I use `plot_bar()` on my phyloseq object, bar are correctly draw for each Sample. I cannot figured how ggplot can assign an height to each Family using the df2 object. Sorry for my lack of knowledge... — Thibault, Jun 30 '20 at 15:02
Ok, I found the solution. I let these line for my dataset but replace 100 by 36, which is the actual number of my samples: `%>% group_by(Family) %>% sample_n(100)` Thank you again for your help ! — Thibault, Jun 30 '20 at 21:09
Did you omit the `group_by(Family) %>% sample_n(100)` code? I did mention that was not needed for real data. — Edward, Jun 30 '20 at 21:09

score 1 · Answer 2 · answered Aug 12 '22 at 15:33

I have created a package called fantaxtic that creates such plots. It creates relative abundance plots with colours for a higher taxonomic level, and a gradient of each colour for a lower taxonomic level. Although it uses a slightly different method for labeling the Phyla, I think the results are very close to what you want. See an example below using GlobalPatterns from phyloseq.

devtools::install_github("gmteunisse/fantaxtic")
require("fantaxtic")
require("phyloseq")

# Load the data
data(GlobalPatterns)

# Get the most abundant phyla and the most abundant families within those phyla
top_nested <- nested_top_taxa(GlobalPatterns,
                              top_tax_level = "Phylum",
                              nested_tax_level = "Family",
                              n_top_taxa = 3, 
                              n_nested_taxa = 3)

# Plot the relative abundances at two levels.
plot_nested_bar(ps_obj = top_nested$ps_obj,
                top_level = "Phylum",
                nested_level = "Family")

relative abundances plotted with fantaxtic

score 0 · Answer 3 · answered Sep 24 '21 at 08:38

Great question and I'm really happy that there is solution to the two level coloring, great work Edward!

To add to the annotation part of your question. As a work around; you can make a seperate ggplot figure that shows the legend color and right annotations. Looking at the example figure showed I got quite close. I took this from this link.

https://coderedirect.com/questions/217402/add-annotation-and-segments-to-groups-of-legend-elements

First you want to make a dataframe listening alll your Taxonomic levels below each other. We are going to create concise x and y coordinates for both taxonomic levels and the 'Phyla brackets'. First arrange the right order and coordinates for the Family level.

coord_fam = df %>% select(Phylum, Family) %>% unique(
)  %>% ungroup()%>%mutate(x= c(rep(1,nrow(.))), y=1:nrow(.))

Now we want to calculate the top, middle and bottom of each group, so we can add the Phylum names and the Phylan brackets.

coord_phylum = coord_fam %>% group_by(Phylum) %>% summarise(x=mean(x),ymid= mean(y),
                                                           ymin=min(y), ymax=max(y))

Last you want to plot the coordinates correctly.

v=0.3
p2 = coord_fam %>% ggplot()+
  geom_point(aes(0.05,y, col= Family), size=8 )+
  scale_x_continuous(limits = c(0, 2)) +
  geom_segment(data = coord_phylum,
               aes(x = x + 0.1, xend = x + v, y= ymax, yend=ymax), col="black")+
  
  geom_segment(data = coord_phylum,
               aes(x = x + 0.1, xend = x + v, y= ymin, yend=ymin))+
  
  geom_segment(data = coord_phylum,
               aes(x = x + v, xend = x + v, y= ymin, yend=ymax))+
  
  geom_text(data = coord_phylum, aes(x = x + v+0.5, y = ymid, label = Phylum)) +
  geom_text(data = coord_fam, aes( x=0.6, y=y, label=Family, col=Family))+
  geom_text(data = coord_fam, aes( x=0.6, y=y, label=Family), alpha=0.9,col="grey50")+
  scale_colour_manual(values = colours)+
  theme_void()+theme(legend.position = "none")+ 
  scale_y_reverse()

p2

V is used to determine the length of the brackets.

When you put patch this together with the barplot, it can be a bit of a puzzle to find the right size for all of the geom_sizes, so start off small.

library(patchwork)
(p1+p1)

I hope this helps! You've probably already published your data by now, but maybe for the next manuscript.

Happy science, y'all!

How to creat a bar graph of microbiota data with one color for higher taxonomic rank and gradient color

3 Answers3