1

I am currently making an euler diagram. I managed to make an euler diagram containing 6 variables using these codes, although I believe these codes are not efficient:

dataset <- data.frame(
        A = rep(c(1, 2, NA), length.out = 100),
        B = rep(c(2, NA, 1), length.out = 100),
        C = rep(c(NA, 1, 2), length.out = 100),
        D = rep(c(NA, 2, 1), length.out = 100),
        E = rep(c(1, NA, 2), length.out = 100),
        F = rep(c(1, 2, NA), length.out = 100)) 

euler_primary <- c("A" = sum(dataset$A == 1, na.rm = TRUE),
                    "B" = sum(dataset$B == 1, na.rm = TRUE),
                    "C" = sum(dataset$C == 1, na.rm = TRUE),
                    "D" = sum(dataset$D == 1, na.rm = TRUE),
                    "E" = sum(dataset$E == 1, na.rm = TRUE),
                    "F" = sum(dataset$F == 1, na.rm = TRUE),
                    "A&B" = sum(dataset$B == 1 & dataset$A == 1, na.rm=TRUE),
                    "A&C" = sum(dataset$C == 1 & dataset$A == 1, na.rm=TRUE),
                    "A&D" = sum(dataset$C == 1 & dataset$D == 1, na.rm = TRUE),
                    "A&E" = sum(dataset$C == 1 & dataset$E == 1, na.rm = TRUE),
                    "A&F" = sum(dataset$C == 1 & dataset$F == 1, na.rm = TRUE),
                    "B&C" = sum(dataset$B == 1 & dataset$C == 1, na.rm=TRUE),
                    "B&D" = sum(dataset$B == 1 & dataset$D == 1, na.rm=TRUE),
                    "B&E" = sum(dataset$B == 1 & dataset$E == 1, na.rm=TRUE),
                    "B&F" = sum(dataset$B == 1 & dataset$F == 1, na.rm=TRUE),
                    "C&D" = sum(dataset$C == 1 & dataset$D == 1, na.rm=TRUE),
                    "C&E" = sum(dataset$C == 1 & dataset$E == 1, na.rm=TRUE),
                    "C&F" = sum(dataset$C == 1 & dataset$F == 1, na.rm=TRUE),
                    "D&E" = sum(dataset$D == 1 & dataset$E == 1, na.rm=TRUE),
                    "D&F" = sum(dataset$D == 1 & dataset$F == 1, na.rm=TRUE),
                    "E&F" = sum(dataset$F == 1 & dataset$E == 1, na.rm=TRUE),
                    "A&B&C" = sum(dataset$B == 1 & dataset$A == 1 & dataset$C == 1, na.rm=TRUE),
                    "A&B&D" = sum(dataset$B == 1 & dataset$A == 1 & dataset$D == 1, na.rm=TRUE),
                    "A&B&E" = sum(dataset$B == 1 & dataset$A == 1 & dataset$E == 1, na.rm=TRUE),
                    "A&B&F" = sum(dataset$B == 1 & dataset$A == 1 & dataset$F == 1, na.rm=TRUE),
                    "A&C&D" = sum(dataset$A == 1 & dataset$C == 1 & dataset$D == 1, na.rm=TRUE),
                    "A&C&E" = sum(dataset$B == 1 & dataset$C == 1 & dataset$E == 1, na.rm=TRUE),
                    "A&C&F" = sum(dataset$A == 1 & dataset$C == 1 & dataset$F == 1, na.rm=TRUE),
                    "A&D&E" = sum(dataset$A == 1 & dataset$D == 1 & dataset$E == 1, na.rm=TRUE),
                    "A&D&F" = sum(dataset$A == 1 & dataset$D == 1 & dataset$F == 1, na.rm=TRUE),
                    "A&E&F" = sum(dataset$A == 1 & dataset$E == 1 & dataset$F == 1, na.rm=TRUE),
                    "B&C&D" = sum(dataset$B == 1 & dataset$C == 1 & dataset$D == 1, na.rm=TRUE),
                    "B&C&E" = sum(dataset$B == 1 & dataset$C == 1 & dataset$E == 1, na.rm=TRUE),
                    "B&C&F" = sum(dataset$B == 1 & dataset$C == 1 & dataset$F == 1, na.rm=TRUE),
                    "B&D&E" = sum(dataset$B == 1 & dataset$D == 1 & dataset$E == 1, na.rm=TRUE),
                    "B&D&F" = sum(dataset$B == 1 & dataset$D == 1 & dataset$F == 1, na.rm=TRUE),
                    "B&E&F" = sum(dataset$B == 1 & dataset$E == 1 & dataset$F == 1, na.rm=TRUE),
                    "C&D&E" = sum(dataset$C == 1 & dataset$D == 1 & dataset$E == 1, na.rm=TRUE),
                    "C&D&F" = sum(dataset$C == 1 & dataset$D == 1 & dataset$F == 1, na.rm=TRUE),
                    "C&E&F" = sum(dataset$C == 1 & dataset$E == 1 & dataset$F == 1, na.rm=TRUE),
                    "D&E&F" = sum(dataset$D == 1 & dataset$E == 1 & dataset$F == 1, na.rm=TRUE),
                    "A&B&C&D" = sum(dataset$B == 1 & dataset$A == 1 & dataset$C == 1 & dataset$D == 1, na.rm=TRUE),
                    "A&B&C&E" = sum(dataset$B == 1 & dataset$A == 1 & dataset$C == 1 & dataset$E == 1, na.rm=TRUE),
                    "A&B&C&F" = sum(dataset$B == 1 & dataset$A == 1 & dataset$C == 1 & dataset$F == 1, na.rm=TRUE),
                    "A&B&D&E" = sum(dataset$B == 1 & dataset$A == 1 & dataset$D == 1 & dataset$E == 1, na.rm=TRUE),
                    "A&B&D&F" = sum(dataset$B == 1 & dataset$A == 1 & dataset$D == 1 & dataset$F == 1, na.rm=TRUE),
                    "A&B&E&F" = sum(dataset$B == 1 & dataset$A == 1 & dataset$E == 1 & dataset$F == 1, na.rm=TRUE),
                    "A&C&D&E" = sum(dataset$A == 1 & dataset$C == 1 & dataset$D == 1 & dataset$E == 1, na.rm=TRUE),
                    "A&C&D&F" = sum(dataset$A == 1 & dataset$C == 1 & dataset$D == 1 & dataset$F == 1, na.rm=TRUE),
                    "A&C&E&F" = sum(dataset$A == 1 & dataset$C == 1 & dataset$E == 1 & dataset$F == 1, na.rm=TRUE),
                    "A&D&E&F" = sum(dataset$A == 1 & dataset$D == 1 & dataset$E == 1 & dataset$F == 1, na.rm=TRUE),
                    "B&C&D&E" = sum(dataset$B == 1 & dataset$C == 1 & dataset$D == 1 & dataset$E, na.rm=TRUE),
                    "B&C&D&F" = sum(dataset$B == 1 & dataset$C == 1 & dataset$D == 1 & dataset$F, na.rm=TRUE),
                    "B&C&E&F" = sum(dataset$B == 1 & dataset$C == 1 & dataset$E == 1 & dataset$F, na.rm=TRUE),
                    "B&D&E&F" = sum(dataset$B == 1 & dataset$D == 1 & dataset$E == 1 & dataset$F, na.rm=TRUE),
                    "C&D&E&F" = sum(dataset$C == 1 & dataset$D == 1 & dataset$E == 1 & dataset$F, na.rm=TRUE),
                    "A&B&C&D&E" = sum(dataset$B == 1 & dataset$A == 1 & dataset$C == 1 & dataset$D == 1 & dataset$E == 1, na.rm=TRUE),
                    "A&B&C&D&F" = sum(dataset$B == 1 & dataset$A == 1 & dataset$C == 1 & dataset$D == 1 & dataset$F == 1, na.rm=TRUE),
                    "A&B&C&E&F" = sum(dataset$B == 1 & dataset$A == 1 & dataset$C == 1 & dataset$E == 1 & dataset$F == 1, na.rm=TRUE),
                    "A&B&D&E&F" = sum(dataset$B == 1 & dataset$A == 1 & dataset$D == 1 & dataset$E == 1 & dataset$F == 1, na.rm=TRUE),
                    "A&C&D&E&F" = sum(dataset$C == 1 & dataset$A == 1 & dataset$D == 1 & dataset$E == 1 & dataset$F == 1, na.rm=TRUE),
                    "B&C&D&E&F" = sum(dataset$C == 1 & dataset$B == 1 & dataset$D == 1 & dataset$E == 1 & dataset$F == 1, na.rm=TRUE),
                    "A&B&C&D&E&F" = sum(dataset$B == 1 & dataset$A == 1 & dataset$C == 1 & dataset$D == 1 & dataset$E == 1 & dataset$F == 1, na.rm=TRUE)
                )

    venn_primary <- euler(euler_primary)
    plot(venn_primary6,
            quantities = list(cex = .75),
            fill = list(c("red", "blue", "green", "violet", "orange", "brown")),
            lty = 1,
            cex = 0.5,
            labels = NULL,
            legend = list(labels = letters[1:6]))

The above codes result in the following plot:

An euler diagram with 6 variables

However, now I need to make an euler diagram with 11 variables. It seems impossible to make such a diagram with 11 variables as the combinations of variables will reach hundreds, if not thousands. I think creating a function to assign the letters and create the list may be the solution. However, as I am a novice in data cleaning and conditionals in R, I am not able to come up with such a function. Can anyone help me to create a function where I can just enter the dataset name and the columns which will be included in the diagram, and the function will do the rest of the cleaning?

#p.s.: I noticed that the eulerr package requires us to add the & between variables to denote overlapping cases. For example, if we want to see the intersection between variable A and B, we will need to create an exact variable of A&B.

Thank you very much in advance

  • It's easier to help you if you include a simple [reproducible example](https://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example) with sample input that can be used to test and verify possible solutions. I really don't see how it would be practical to have an euler diagram with 11 different variables if all possible overlaps exist. Is there some strcuture you are counting out that many of the overlaps are empty? Like where would values for A&F go in the sample plot? – MrFlick Sep 04 '21 at 08:07
  • Thank you for the suggestion. I have added an example and updated the analysis accordingly. Yes I know it is not practical and not reader-friendly, but since my supervisor asks me to create such a diagram, I think I won't be able to say such a thing before I prove him wrong with the figure. – amedicalenthusiast Sep 04 '21 at 08:21

1 Answers1

1

I tried to use tidyverse to create your plot with out any manual input, but the final plot is somehow flawed. Perhaps you are able to create your plot with 11 variables based on this code and you somehow fix the issue of missing labels inside the plot.

dataset <- data.frame(
  A = rep(c(1, 2, NA), length.out = 100),
  B = rep(c(2, NA, 1), length.out = 100),
  C = rep(c(NA, 1, 2), length.out = 100),
  D = rep(c(NA, 2, 1), length.out = 100),
  E = rep(c(1, NA, 2), length.out = 100),
  F = rep(c(1, 2, NA), length.out = 100)) 

library(tidyverse)
library(eulerr)

map(1:6, ~combn(names(dataset), .x)) %>% 
  map_df(~.x %>% 
           split(rep(1:ncol(.x), each = nrow(.x))) %>% 
           `names<-`(., paste0(split(.x, rep(1:ncol(.x), each = nrow(.x))))) %>% 
           map_df(~dataset %>% 
                    mutate(rn = row_number()) %>% 
                    pivot_longer(-rn) %>% 
                    filter(name %in% .x, value == 1) %>% 
                    group_by(rn) %>% 
                    filter(n() == length(.x)) %>% 
                    ungroup() %>% 
                    summarise(n = n_distinct(rn)),
                  .id = "cols")) %>% 
  mutate(cols = str_remove(str_replace_all(str_remove_all(cols, "[^[A-Z]]"), "(\\w)", "\\1&"),
                           "&$")) %>% 
  deframe() %>% 
  euler() %>% 
  plot(fill = list(c("red", "blue", "green", "violet", "orange", "brown")),
       lty = 1,
       cex = 0.5,
       labels = NULL,
       legend = list(labels = letters[1:6]))

enter image description here

Martin Gal
  • 16,640
  • 5
  • 21
  • 39