Comparing multiple categorical variables in R

Question

I would like to combine the bars from these two graphs into one big graph. That is, I would like the Black State Claim bar (from plot a) to sit right next to the Black Civil Rights Claim bar (from plot b), and likewise for every other race, all in a single graph.

Since some of the counts (for example, Asian) are so low, is there a better way to compare State Claim / Civil Rights Claim status with Race?

# a) State claim?
# Cross-tabulate race against the state-claim flag. as.data.frame() turns the
# contingency table into one row per (Race, Claim) pair with its Count.
race_claim <- as.data.frame(
  table(Race = jail$Race, Claim = jail$State_Claim_Made),
  responseName = "Count"
)

# One stacked bar per race, segments filled by claim value. geom_col() takes
# the supplied Count as the bar height (same as geom_bar(stat = "identity")).
ggplot(race_claim, aes(x = Race, y = Count, fill = Claim)) +
  geom_col()

# b) Civil rights claim?
# NOTE(review): the posted data sample has no `Non_Statutory` column (the
# closest name there is `Non-Statutory_Case_Filed`) - confirm which column is
# meant before running this.
# Cross-tabulate race against the civil-rights-claim flag; one row per
# (Race, Claim) pair with its Count.
race_claim_civ <- as.data.frame(
  table(Race = jail$Race, Claim = jail$Non_Statutory),
  responseName = "Count"
)

# One stacked bar per race, segments filled by claim value. geom_col() takes
# the supplied Count as the bar height (same as geom_bar(stat = "identity")).
ggplot(race_claim_civ, aes(x = Race, y = Count, fill = Claim)) +
  geom_col()

DATA SAMPLE:

# Sample of the `jail` data in dput() form (20 rows), bound to `jail` so the
# snippets that read `jail` can be run against it.
# NOTE: as originally posted, State_Claim_Made was missing the comma between
# its last two values ("1 0"), so the expression did not parse; the comma is
# restored below.
# NOTE(review): the civil-claim column is named `Non-Statutory_Case_Filed`
# here (hyphenated, hence the backticks) - code must use that exact name.
jail <- structure(list(Last_Name = c("Banks", "Beamon", "Dandridge", 
"Deakle, Jr.", "Doyle", "Drinkard", "Ellis", "Embry", "Gaines", 
"Gurley", "Hinton", "Holemon", "Holsomback", "Hunt", "Jones", 
"Mahan", "Mahan", "McMillian", "Moore", "Padgett"), First_Name = c("Medell", 
"Melvin Todd", "Beniah Alton", "Evan Lee", "Robert E.", "Gary", 
"Andre", "Anthony", "Freddie Lee", "Timothy", "Anthony", "Jeffrey", 
"John", "H. Guy", "Lydia Diane", "Dale", "Ronnie", "Walter", 
"Daniel Wade", "Larry Randal"), Age = c("27", "24", "29", "59", 
"44", "37", "35", "23", "22", "22", "29", "23", "33", "54", "40", 
"22", "26", "45", "24", "40"), Race = c("Black", "Asian", "Caucasian", 
"Caucasian", "Other", "Asian", "Black", "Black", "Black", 
"Caucasian", "Black", "Caucasian", "Caucasian", "Other", 
"Black", "Caucasian", "Asian", "Black", "Native American", "Caucasian"
), Sex = c("Male", "Male", "Male", "Male", "Male", "Male", "Male", 
"Male", "Male", "Male", "Male", "Male", "Male", "Male", "Female", 
"Male", "Male", "Male", "Male", "Male"), State = c("Alabama", 
"Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", 
"Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", 
"Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", 
"Alabama"), CIU = c(0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 
0, 0, 0, 0, 1, 0), Guilty_Plea = c(1, 0, 0, 0, 0, 0, 0, 1, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), IO = c(0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Worst_Crime = c(6, 1, 
1, 4, 4, 1, 2, 1, 1, 6, 1, 2, 4, 6, 3, 2, 2, 1, 1, 1), Occurred = c(1999, 
1988, 1994, 2014, 1991, 1993, 2012, 1992, 1972, 1999, 1985, 1987, 
1987, 1987, 1997, 1983, 1983, 1986, 1999, 1990), Convicted = c(2001, 
1989, 1996, 2015, 1992, 1995, 2013, 1993, 1974, 2000, 1986, 1988, 
1988, 1993, 2000, 1986, 1986, 1988, 2002, 1992), Exonerated = c(2003, 
1990, 2015, 2015, 2001, 2001, 2014, 1997, 1991, 2002, 2015, 1999, 
2000, 1998, 2006, 1998, 1998, 1993, 2009, 1997), Sentence = c("15", 
"25", "Life", "Not sentenced", "20", "Death", "85", "20", "30", 
"35", "Death", "Life", "25", "Probation", "Life without parole", 
"35", "Life without parole", "Death", "Death", "Death"), Death_Penalty = c(0, 
0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1), DNA_Only = c(0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0), FC = c(1, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), MWID = c(0, 
0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0), F_MFE = c(0, 
0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1), P_FA = c(1, 
1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0), OM = c(1, 
1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1), ILD = c(0, 
0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0), State_Statute = c("Y", 
"Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", 
"Y", "Y", "Y", "Y", "Y", "Y"), State_Claim_Made = c(0, 0, 1, 
0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0), Zero_time = c(0, 
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0), Prem = c(0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Pending = c(0, 
0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0), Denied = c(0, 
0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), State_Award = c("0", 
"0", "2", "0", "1", "0", "0", "0", "1", "0", "2", "0", "0", "0", 
"0", "0", "0", "0", "0", "0"), Amount = c("0", "0", NA, "0", 
"129041.88", "0", "0", "0", "1000000", "0", NA, "0", "0", "0", 
"0", "0", "0", "0", "0", "0"), `Non-Statutory_Case_Filed` = c(0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0), No_Time = c(0, 
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0), Unfiled = c(1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1), Dismissed = c(0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0), Pending__1 = c(0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Award = c(0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0), Premature = c(0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Amount__1 = c("0", 
"0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", 
"0", "0", "0", "$ undisclosed", "0", "0"), Years_Lost = c(1.7, 
0.1, 19.5, 0, 2.6, 5.7, 1.8, 4, 10.7, 1.5, 28.5, 10.6, 10.1, 
0, 5.8, 11.4, 11.4, 4.5, 5.4, 5.5), State_Award2 = c("0", "0", 
"0", "0", "1", "0", "0", "0", "1", "0", "0", "0", "0", "0", "0", 
"0", "0", "0", "0", "0")), row.names = c(NA, -20L), class = c("tbl_df", 
"tbl", "data.frame"))

Something seems to be off with the `structure` you posted. Could you check again? Also, it would be helpful if you could copy your output plot into the question. — Roman, Nov 24 '18 at 10:58

utubun · Accepted Answer · 2018-11-24T16:15:10.323

I think there is a clash between two requirements: to make the barplot stack-ed and at the same time - dodge-d. Probably my solution isn't the best, and someone would do better. But that's what I've got right now:

Preprocessing

library(tidyverse)

# Reshape `jail` into one row per (race, action, claim) combination with its
# count `n`, plus a numeric x position shared by the rows of each race/action
# bar.
dat <- jail %>%
  rename_with(tolower) %>%
  select(race, state_claim_made, non_statutory_case_filed) %>%
  # Stack the two claim indicators into name/value columns.
  pivot_longer(
    cols = c(state_claim_made, non_statutory_case_filed),
    names_to = "action",
    values_to = "claim"
  ) %>%
  count(race, action, claim) %>%
  mutate(
    action = if_else(action == "state_claim_made", "state", "civil"),
    # Rows arrive from count() sorted by race, then action, so ordering the
    # race/action levels by row index numbers the bars 1, 2, 3, ... in order.
    x = as.numeric(reorder(interaction(race, action), seq_len(n())))
  )

Output:

# # A tibble: 15 x 5
#    race            action claim     n     x
# <chr>           <chr>  <dbl> <int> <dbl>
#  1 Asian           civil      0     3     1
#  2 Asian           state      0     2     2
#  3 Asian           state      1     1     2
#  4 Black           civil      0     6     3
#  5 Black           civil      1     1     3
#  6 Black           state      0     3     4
#  7 Black           state      1     4     4
#  8 Caucasian       civil      0     7     5
#  9 Caucasian       state      0     6     6
# 10 Caucasian       state      1     1     6
# 11 Native American civil      1     1     7
# 12 Native American state      1     1     8
# 13 Other           civil      0     2     9
# 14 Other           state      0     1    10
# 15 Other           state      1     1    10

Some necessary tweaks for x-axis labels:

Adapted from this answer:

# Axis ticks: every bar position, plus one tick half a unit past the first bar
# of each race group, which is where the race name is printed.
breaks <- local({
  bar_positions <- unique(dat$x)
  group_width <- length(unique(dat$action))
  name_positions <- seq(
    from = min(dat$x) + 0.5,
    to = max(dat$x) + 0.5,
    by = group_width
  )
  sort(c(bar_positions, name_positions))
})

# Tick labels, three per race in tick order: "civil", the race name pushed
# onto a second line by the leading newline, then "state".
labels <- local({
  label_triplet <- function(race) c("civil", paste0("\n", race), "state")
  unlist(lapply(unique(dat$race), label_triplet))
})

Plot data

# Stacked bars drawn at the numeric x positions and filled by claim value; the
# custom breaks/labels turn the continuous axis into "civil / <race> / state"
# groups.
ggplot(dat, aes(x = x, y = n, fill = factor(claim))) +
  # TRUE rather than T: `T` is an ordinary variable that can be reassigned.
  geom_col(show.legend = TRUE) +
  ggthemes::theme_few() +
  scale_fill_manual(
    name = NULL,
    values = c("gray75", "gray25"),
    breaks = c("0", "1"),
    labels = c("false", "true")
  ) +
  scale_x_continuous(breaks = breaks, labels = labels) +
  theme(axis.title.x = element_blank(), axis.ticks.x = element_blank()) +
  labs(title = "Jail Plot", y = "Count")

Data

The data you attached are corrupted - there is a missing comma or a stray $ somewhere in the table (I don't remember which it was). Here are the same data, but without the variables we don't need to solve the problem.

# Three-column, 20-row data frame with tibble classes: Race plus the two 0/1
# claim indicators. The value is not assigned here; bind it to `jail` before
# running code that reads `jail`.
structure(
  list(Race = c("Black", "Asian", "Caucasian", "Caucasian", "Other", "Asian", 
                "Black", "Black", "Black", "Caucasian", "Black", "Caucasian", 
                "Caucasian", "Other", "Black", "Caucasian", "Asian", "Black", 
                "Native American", "Caucasian"), 
       State_Claim_Made = c(0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 
                            0, 1, 0), 
       Non_Statutory_Case_Filed = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
                                    0, 0, 0, 1, 1, 0)
       ), 
  row.names = c(NA, -20L), 
  class = c("tbl_df", "tbl", "data.frame")
  )

Wow good stuff, wish I could code like this haha. Now I would like to further compare these groups. To see if the proportion of True/False (Filed a claim vs. Not filed a claim) is the same for each race, how would I do an ANOVA test with this setup? — Juanito Tomas, Nov 26 '18 at 23:14

Comparing multiple categorical variables in R

1 Answer

Preprocessing

Some necessary tweaks for x-axis labels:

Plot data

Data