0

I am a research student coming to grips with R for the first time. I am trying to make a PCA plot from a series of body measurements; the specimen names and a subspecies tag (BIN) are in separate columns. The BIN column contains the BIN ID for each sample. The difficulty I am facing is filtering out individuals with certain BINs. My desired output is a PCA plot identical to the one below, but displaying only the named BINs ("ACZ5516", "ADF3772") and not the remaining BINs.

Revised image

# Load the packages this script relies on: FactoMineR provides PCA(),
# factoextra provides fviz_pca_ind().
library(FactoMineR)
library(factoextra)

# Import the data set; row.names = 1 uses the first CSV column (specimen ID)
# as the row names, so each individual is named after its specimen.
Anotylus <- read.csv("DataSO.csv", header = TRUE, sep = ",", row.names = 1)

# Set BIN as a factor so it can be used as a grouping variable
Anotylus$BIN <- as.factor(Anotylus$BIN)

# Number of BINs and number of individuals in each
table(Anotylus["BIN"])

# Create the PCA of the data set from the measurement columns only;
# the BIN column (column 12) is excluded.
Ano.pca <- PCA(Anotylus[, 1:11], graph = FALSE)

# Visualise the PCA with all individuals in the data frame, coloured by BIN
fviz_pca_ind(
  Ano.pca,
  geom.ind = "point",
  col.ind = Anotylus$BIN,
  repel = TRUE,
  legend.title = "BIN",
  addEllipses = TRUE
)


# With individuals from selected BINs only
#
# The `name` element of select.ind is matched against the names of the
# individuals (the row names of the data, i.e. the specimen IDs), not against
# the BIN labels. Passing the BIN labels themselves matches no individual,
# which is why no samples were visible. Translate the BINs of interest into
# the specimen IDs that belong to them instead. The PCA is not refitted, so
# the principal components and coordinates stay exactly as in the full plot.
selected_bins <- c("ACZ5516", "ADF3772")
top <- list(name = rownames(Anotylus)[Anotylus$BIN %in% selected_bins])

fviz_pca_ind(
  Ano.pca,
  geom.ind = "point",
  col.ind = Anotylus$BIN,
  select.ind = top,
  repel = TRUE,
  legend.title = "BIN",
  addEllipses = TRUE
)

I have tried using a subset of the data, but then the variation captured by the principal components changes and produces a different result.

How do I filter the individuals displayed to a curated list?

Any advice or guidance is deeply appreciated!

Best,

Dante

Sample data set below

> dput(Anotylus)
structure(list(Total.Anten.Length..mm. = c(0.66, 0.635, 0.676, 
0.559, 1.249, 0.675, 0.704, 0.649, 0.661, 0.795, 0.836, 0.888, 
0.941, 0.781, 0.899, 0.918, 0.854, 0.834, 0.888, 0.884, 0.879, 
0.776, 0.954, 0.853, 0.96, 0.527, 0.515, 0.653, 0.491, 0.474, 
0.538, 0.694, 1.01, 0.53, 0.641, 0.509, 0.918, 0.849, 0.452, 
0.536), Body.Length...mm. = c(1.842, 1.664, 1.901, 1.917, 3.061, 
1.961, 1.862, 1.99, 1.85, 1.449, 2.455, 2.077, 2.578, 2.478, 
2.798, 2.589, 2.291, 2.882, 2.472, 2.55, 2.53, 2.757, 2.689, 
2.166, 2.894, 1.944, 1.48, 2.385, 1.715, 1.674, 1.532, 2.27, 
2.598, 1.677, 1.67, 1.68, 2.374, 2.877, 1.699, 1.656), 
Eye.Area..mm2. = c(0.01, 
0.009, 0.01, 0.006, 0.026, 0.007, 0.01, 0.01, 0.009, 0.006, 0.016, 
0.014, 0.015, 0.018, 0.02, 0.016, 0.019, 0.015, 0.013, 0.011, 
0.015, 0.014, 0.017, 0.014, 0.012, 0.007, 0.006, 0.02, 0.007, 
0.006, 0.005, 0.013, 0.013, 0.006, 0.007, 0.005, 0.013, 0.006, 
0.008, 0.005), Eye.Width..mm. = c(0.046, 0.036, 0.054, 0.033, 
0.071, 0.04, 0.046, 0.047, 0.044, 0.05, 0.059, 0.053, 0.073, 
0.063, 0.068, 0.051, 0.044, 0.07, 0.064, 0.061, 0.054, 0.042, 
0.038, 0.059, 0.059, 0.043, 0.046, 0.079, 0.037, 0.035, 0.037, 
0.054, 0.047, 0.045, 0.045, 0.028, 0.05, 0.037, 0.043, 0.045), 
Head.Width..mm. = c(0.359, 0.362, 0.377, 0.317, 0.731, 0.456, 
0.38, 0.414, 0.359, 0.453, 0.568, 0.449, 0.519, 0.517, 0.516, 
0.515, 0.512, 0.513, 0.511, 0.456, 0.503, 0.474, 0.598, 0.453, 
0.574, 0.309, 0.306, 0.574, 0.314, 0.298, 0.295, 0.386, 0.557, 
0.289, 0.318, 0.306, 0.505, 0.291, 0.298, 0.263), 
Pronotum.Width..mm. = c(0.413, 
0.455, 0.439, 0.352, 0.741, 0.462, 0.467, 0.461, 0.442, 0.493, 
0.573, 0.549, 0.584, 0.617, 0.632, 0.61, 0.614, 0.624, 0.631, 
0.533, 0.587, 0.562, 0.609, 0.522, 0.621, 0.342, 0.341, 0.598, 
0.336, 0.314, 0.331, 0.467, 0.547, 0.343, 0.342, 0.317, 0.545, 
0.328, 0.329, 0.284), Pronotum.Length..mm. = c(0.304, 0.326, 
0.334, 0.24, 0.48, 0.317, 0.303, 0.329, 0.302, 0.36, 0.418, 
0.383, 0.424, 0.428, 0.399, 0.442, 0.404, 0.461, 0.435, 0.376, 
0.393, 0.403, 0.373, 0.41, 0.435, 0.259, 0.247, 0.403, 0.257, 
0.252, 0.23, 0.387, 0.388, 0.248, 0.26, 0.215, 0.336, 0.223, 
0.231, 0.247), Elytra.Width..mm. = c(0.558, 0.552, 0.586, 
0.43, 0.854, 0.506, 0.528, 0.586, 0.548, 0.54, 0.75, 0.716, 
0.794, 0.816, 0.746, 0.82, 0.786, 0.8, 0.722, 0.69, 0.758, 
0.766, 0.736, 0.668, 0.852, 0.468, 0.462, 0.741, 0.461, 0.323, 
0.406, 0.637, 0.617, 0.41, 0.366, 0.422, 0.718, 0.42, 0.408, 
0.278), Elytra.Length..mm. = c(0.469, 0.437, 0.386, 0.346, 
0.631, 0.428, 0.464, 0.451, 0.445, 0.532, 0.583, 0.543, 0.558, 
0.62, 0.625, 0.623, 0.613, 0.605, 0.623, 0.588, 0.606, 0.48, 
0.568, 0.568, 0.598, 0.373, 0.352, 0.516, 0.365, 0.326, 0.327, 
0.502, 0.464, 0.346, 0.344, 0.319, 0.519, 0.346, 0.329, 0.346
), Pronotum.Value = c(0.288, 0.319, 0.306, 0.331, 0.179, 
0.278, 0.224, 0.211, 0.204, 0.273, 0.26, 0.33, 0.241, 0.218, 
0.203, 0.209, 0.241, 0.227, 0.31, 0.236, 0.341, 0.288, 0.283, 
0.263, 0.279, 0.173, 0.162, 0.22, 0.183, 0.209, 0.193, 0.185, 
0.236, 0.181, 0.172, 0.227, 0.275, 0.164, 0.21, 0.217), 
Elytra.Value = c(0.314, 
0.319, 0.393, 0.243, 0.205, 0.297, 0.21, 0.205, 0.244, 0.359, 
0.288, 0.335, 0.375, 0.291, 0.243, 0.238, 0.288, 0.283, 0.351, 
0.271, 0.48, 0.415, 0.325, 0.294, 0.193, 0.182, 0.271, 0.237, 
0.216, 0.246, 0.214, 0.193, 0.233, 0.205, 0.18, 0.262, 0.225, 
0.176, 0.303, 0.251), BIN = structure(c(1L, 1L, 1L, 3L, 8L, 
1L, 1L, 1L, 1L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 7L, 3L, 3L, 6L, 3L, 3L, 3L, 2L, 5L, 3L, 3L, 
3L, 5L, 3L, 3L, 3L), .Label = c("ACZ5516", "ACZ5742", "ADF3772", 
"ADF4138", "ADG1201", "ADH9095", "ADI3175", "ADR2790"), class = 
"factor")), row.names = c("CCDB-22214-D03", 
"CCDB-22214-D06", "CCDB-22214-D08", "CCDB-22214-G09", "CCDB-22214-H02", 
"CCDB-22214-H09", "CCDB-22215-A11", "CCDB-22215-A12", "CCDB-22215-F04", 
"CCDB-23850-B07", "CCDB-23851-C04", "CCDB-23851-C05", "CCDB-23851-C11", 
"CCDB-23851-C12", "CCDB-23851-D02", "CCDB-23851-D03", "CCDB-23851-D04", 
"CCDB-23851-D06", "CCDB-23851-E08", "CCDB-23851-E09", "CCDB-23851-E11", 
"CCDB-23851-F03", "CCDB-23851-G05", "CCDB-23851-G09", "CCDB-23858-B08", 
"CCDB-23858-G12", "CCDB-23858-H01", "CCDB-23859-B10", "CCDB-23859-E07", 
"CCDB-23859-E10", "CCDB-23859-E11", "CCDB-25504-E04", "CCDB-25505-E02", 
"CCDB-25510-B12", "CCDB-25510-D02", "CCDB-25510-E09", "CCDB-25511-B06", 
"CCDB-25511-B12", "CCDB-25511-E11", "CCDB-25512-E12"), class = 
"data.frame")
  • 1
    Welcome to SO! Please provide a [minimal reproducible example with appropriate dataset](https://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example) to help us reproduce what you have done so far. Also, it would be great if you can clarify what is your desired output, as your text asked for 4 bins, while the illustration provided 17 bins. Can also assume that each subspecies is one column for your PCA (BIN)? – Adam Quek Jul 18 '22 at 12:14
  • Hi Adam, I have edited the question to include my exact code and a sample data set. Hopefully it is clearer both on the code and on what I would like to achieve. I have reduced the data set to make it easier on the forum. The objective remains the same, however: to filter out individuals from the figure whilst maintaining the shape and variation contained within the principal components. – D.Underwood Jul 19 '22 at 13:53

1 Answers1

0

Apparently factoextra "produces ggplot2-based elegant data visualization with less typing". From what I can tell, fviz_pca_ind essentially plots the PCA coordinates of each individual point and computes a multivariate normal distribution per group, drawn as an ellipse.

Here's the replication of the plot you have attached in stripped down ggplot code:

# Plotting data frame: the BIN identifier of each individual next to that
# individual's PCA coordinates (columns Dim.1, Dim.2, ...).
df <- data.frame(
  BIN = Anotylus$BIN,
  Ano.pca$ind$coord,
  check.names = FALSE
)

# Scatter of the first two dimensions, one colour and one normal ellipse per BIN
ggplot(data = df, mapping = aes(x = Dim.1, y = Dim.2, colour = BIN)) +
  geom_point() +
  stat_ellipse(type = "norm")

enter image description here

Note that, as there are only 1 or 2 points for every BIN other than ACZ5516 and ADF3772, ggplot2 reports "Too few points to calculate an ellipse" for those BINs and no ellipse is plotted for them.

In order to "hide" the other BIN in your figure, you can either just plot the BIN you wanted or you can create a new grouping (ACZ5516, ADF3772 and others) in the plotting data and set the points you do not want to focus on in less visible colour.

library(dplyr)

# Plot only BIN ACZ5516 and ADF3772: the rows of every other BIN are dropped
# from the plotting data, while the PCA coordinates themselves are reused as-is.
ggplot(
  data = filter(df, BIN %in% c("ACZ5516", "ADF3772")),
  mapping = aes(x = Dim.1, y = Dim.2, colour = BIN)
) +
  geom_point() +
  stat_ellipse(type = "norm")

enter image description here

# Relabel every BIN other than ACZ5516 and ADF3772 as "Others", so all points
# stay on the plot but only the two BINs of interest keep their own group.
df2 <- mutate(
  df,
  BIN = ifelse(BIN %in% c("ACZ5516", "ADF3772"), as.character(BIN), "Others")
)

# Points come from the regrouped data; ellipses are drawn only for the two
# BINs of interest, using the original (un-regrouped) data frame.
ggplot(data = df2, mapping = aes(x = Dim.1, y = Dim.2, colour = BIN)) +
  geom_point() +
  stat_ellipse(
    data = filter(df, BIN %in% c("ACZ5516", "ADF3772")),
    type = "norm"
  ) +
  scale_colour_manual(values = c("darkgreen", "orange", "gray"))

enter image description here

Adam Quek
  • 6,973
  • 1
  • 17
  • 23
  • Adam, I cannot thank you enough for this. Thank you for a clear and helpful walkthrough. This has made one very happy and grateful postgraduate student! – D.Underwood Jul 20 '22 at 14:58