0

Thanks in advance for any help or input.

I am getting this error: "Error in abs(x) : non-numeric argument to mathematical function" when I try to plot the edited data frame.

Here's what I'm trying to do, I have no idea where the error is coming from:

  1. Separate age increments into separate columns
  2. Get rid of all the final entries with a + (or ideally change them all to ... to 100)
  3. Calculate the range of each increment
  4. Plot the various increments in a horizontal stacked bar plot

This is the code:

require(tidyr)
require(dplyr)
require(ggplot2)

# Step 1: split each state's comma-separated age_cat string into 17 columns
# named "Increment 1" ... "Increment 17".
# NOTE(review): states with fewer than 17 brackets presumably end up with NA in
# the trailing columns -- confirm against tidyr's default `fill` behaviour.
CoRo_age <- CoRo_age %>%
separate(age_cat, c("Increment 1", "Increment 2", "Increment 3", "Increment 4", "Increment 5", 
"Increment 6", "Increment 7", "Increment 8", "Increment 9", "Increment 10", "Increment 11", 
"Increment 12","Increment 13", "Increment 14", "Increment 15", "Increment 16","Increment 17"), ",")

# Step 2: in "Increment 2" .. "Increment 17", overwrite with "" every cell that
# does not contain a "-" (open-ended entries such as "80+").
# "Increment 1" is not touched by this step.
# NOTE(review): NA cells also fail the grepl() test, so they presumably become
# "" as well -- confirm.
CoRo_age$'Increment 2'[!grepl("-",CoRo_age$'Increment 2')] <- ""
CoRo_age$'Increment 3'[!grepl("-",CoRo_age$'Increment 3')] <- ""
CoRo_age$'Increment 4'[!grepl("-",CoRo_age$'Increment 4')] <- ""
CoRo_age$'Increment 5'[!grepl("-",CoRo_age$'Increment 5')] <- ""
CoRo_age$'Increment 6'[!grepl("-",CoRo_age$'Increment 6')] <- ""
CoRo_age$'Increment 7'[!grepl("-",CoRo_age$'Increment 7')] <- ""
CoRo_age$'Increment 8'[!grepl("-",CoRo_age$'Increment 8')] <- ""
CoRo_age$'Increment 9'[!grepl("-",CoRo_age$'Increment 9')] <- ""
CoRo_age$'Increment 10'[!grepl("-",CoRo_age$'Increment 10')] <- ""
CoRo_age$'Increment 11'[!grepl("-",CoRo_age$'Increment 11')] <- ""
CoRo_age$'Increment 12'[!grepl("-",CoRo_age$'Increment 12')] <- ""
CoRo_age$'Increment 13'[!grepl("-",CoRo_age$'Increment 13')] <- ""
CoRo_age$'Increment 14'[!grepl("-",CoRo_age$'Increment 14')] <- ""
CoRo_age$'Increment 15'[!grepl("-",CoRo_age$'Increment 15')] <- ""
CoRo_age$'Increment 16'[!grepl("-",CoRo_age$'Increment 16')] <- ""
CoRo_age$'Increment 17'[!grepl("-",CoRo_age$'Increment 17')] <- ""

# Step 3: for rows 1:50 and columns 2:17, evaluate each cell's text as an R
# expression (so "0-5" is computed as 0 minus 5) and take the absolute value.
# NOTE(review): if `state` is the only other column, 2:17 covers "Increment 1"
# to "Increment 16" and leaves out "Increment 17" -- confirm the intended span.
# NOTE(review): a "" cell parses to an empty expression, so its result is
# presumably zero-length rather than a number -- check what apply() returns.
CoRo_diff <- apply(CoRo_age[1:50,2:17], c(1,2), function(x) abs(as.numeric(unlist(eval(parse(text=x))))))

# Put the first column of CoRo_age back in front of the computed ranges.
# NOTE(review): cbind() builds a matrix, so mixing the state names with the
# ranges presumably turns every value into character -- verify the column
# types before plotting.
CoRo_age <- as.data.frame(cbind(CoRo_age[,1], CoRo_diff))

# Name the first column "States".
colnames(CoRo_age)[1] <- c("States")

# Step 4: reshape to long form (columns States, Increments, Range) and draw
# horizontal bars of Range per state.
# NOTE(review): `fill=Increments` is given outside aes(), so it is looked up as
# an ordinary variable rather than as a column of the data -- confirm intent.
CoRo_age %>% 
   gather("Increments", "Range", -States) %>% 
   ggplot(aes(x=States, y=Range)) + 
   geom_bar(stat="identity", fill=Increments) +
   coord_flip()

This is the dput output for the data frame:

CoRo_age <- structure(list(state = c("Minnesota", "New York", "North Carolina", 
    "South Carolina", "Delaware", "Illinois", "Louisiana", "Georgia", 
    "Arkansas", "Connecticut", "Idaho", "Maryland", "Massachusetts", 
    "Tennessee", "Michigan", "California", "Virginia", "District of Columbia", 
    "Kentucky", "Arizona", "Maine", "Colorado", "Hawaii", "Indiana", 
    "Iowa", "Kansas", "Mississippi", "Missouri", "Montana", "Nebraska", 
    "Nevada", "New Hampshire", "New Jersey", "New Mexico", "Texas", 
    "Rhode Island", "South Dakota", "Alabama", "Alaska", "Oklahoma", 
    "Oregon", "Pennsylvania", "Utah", "Wisconsin", "West Virginia", 
    "Vermont", "Wyoming", "Washington", "North Dakota", "Florida"
    ), age_cat = c("0-5,6-19,20-29,30-39,40-49,50-59,60-69,70-79,80-89,90-99,100+", 
    "0-9,10-19,20-29,30-39,40-49,50-59,60-69,70-79,80-89,90+", "0-17,18-24,25-49,50-64,65-74,75+", 
    "11-20,21-30,31-40,41-50,51-60,61-70,71-80,81+", "0-4,5-17,18-34,35-49,50-64,65+", 
    "0-20,20-29,30-39,40-49,50-59,60-69,70-79,80+", "0-5,6-17,18-29,30-39,40-49,50-59,60-69,70+", 
    "0-1,01-04,05-09,10-17,18-29,30-39,40-49,50-59,60-69,70-79,80+", 
    "0-17,18-24,25-44,45-64,65+", "0-9,10-19,20-29,30-39,40-49,50-59,60-69,70-79,80+", 
    "0-18,18-29,30-39,40-49,50-59,60-69,70-79,80-89,90-99,100+", 
    "0-9,10-19,20-29,30-39,40-49,50-59,60-69,70-79,80+", "0-19,20-29,30-39,40-49,50-59,60-69,70-79,80+", 
    "0-10,11-20,21-30,41-50,51-60,61-70,71-80,80+", "0-19,20-29,30-39,40-49,50-59,60-69,70-79,80+", 
    "0-17,18-49,50-64,65+", "0-9,10-19,20-29,30-39,40-49,50-59,60-69,70-79,80+", 
    "0-4,5-14,15-19,20-24,25-34,35-44,45-54,55-64,65-74,75+", "0-9,10-19,20-29,30-39,40-49,50-59,60-69,70-79,80+", 
    "0-20,20-44,45-54,55-64,65,+", "0-19,20-29,30-39,40-49,50-59,60-69,70-79,80+", 
    "0-9,10-19,20-29,30-39,40-49,50-59,60-69,70-79,80+", "0-19,20-39,40-59,60+", 
    "0-19,20-29,30-39,40-49,50-59,60-69,70-79,80+", "0-17,18-40,41-60,61-80,80+", 
    "0-9,10-17,18-24,25-34,35-44,45-54,55-64,65-74,75-84,85+", "0-18,18-29,30-39,40-49,50-59,60-69,70-79,80-89,90+", 
    "0-9,10-19,20-24,25-29,30-34,35-39,40-44,45-49,50-54,55-59,60-64,65-69,70-74,75-79,80-84,85-90,90+", 
    "0-9,10-19,20-29,30-39,40-49,50-59,60-69,70-79,80+", "0-19,20-34,35-44,45-54,55-64,65-74,75-84,85+", 
    "0-10,10-19,20-29,30-39,40-49,50-59,60-69,70+", "0-9,10-19,20-29,30-39,40-49,50-59,60-69,70-79,80+", 
    "0-4,5-17,18-29,30-49,50-64,65-79,80+", "0-9,10-19,20-29,30-39,40-49,50-59,60-69,70-79,80-89,90+", 
    "0-1,1-9,10-19,20-29,30-39,40-49,50-59,60-64,65-69,70-74,75-79,80+", 
    "0-9,10-19,20-29,30-39,40-49,50-59,60-69,70-79,80-89,90-99,100+", 
    "0-19,20-29,30-39,40-49,50-59,60-69,70-79,80+", "0-4,5-24,25-49,50-64,65+,", 
    "0-10,10-19,20-29,30-39,40-49,50-59,60-69,70-79,80+", "0-4,5-17,18-35,36-49,50-64,65+", 
    "0-9,10-19,20-29,30-39,40-49,50-59,60-69,70-79,80+", "0-9,10-19,20-29,30-39,40-49,50-59,60-69,70-79,80-89,90-99,>100", 
    "1-14,15-24,25-44,45-64,65-84,85+", "0-9,10-19,20-29,30-39,40-49,50-59,60-69,70-79,80-89,90+", 
    "0-9,10-19,20-29,30-39,40-49,50-59,60-69,70+", "0-9,10-19,20-29,30-39,40-49,50-59,60-69,70-79,80+", 
    "0-18,19-29,30-39,40-49,50-59,60-69,70-79,80+", "00-19,20-39,40-59,60-79,80+", 
    "0-9,10-19,20-29,30-39,40-49,50-59,60-69,70-79,80+", "0-4,5-14,15-24,25-34,35-44,45-54,55-64,65-74,75-84,85+"
    )), row.names = c(1L, 3L, 6L, 7L, 8L, 9L, 10L, 11L, 13L, 14L, 
    21L, 24L, 25L, 28L, 36L, 39L, 40L, 41L, 55L, 56L, 57L, 58L, 59L, 
    60L, 61L, 62L, 63L, 64L, 65L, 66L, 67L, 68L, 69L, 70L, 71L, 72L, 
    73L, 75L, 76L, 81L, 82L, 83L, 86L, 87L, 88L, 97L, 98L, 101L, 
    104L, 126L), class = "data.frame")
  • Hi there! Welcome to SO! It would be helpful if your question includes a minimal and reproducible example (for example, we don't know what `CoRo_age` is so we can't help you without guessing), Also, try to keep the code in your question to a minimum: only include code directly related to your question. Here's a great explanation of the concepts I mentioned: https://stackoverflow.com/a/5963610/4240010 – yeedle Aug 31 '20 at 14:38
  • A link to the CoRo_age data is posted initially, would that suffice? – Hope Muller Aug 31 '20 at 14:41
  • 1
    That would suffice, but here's another tip for the future: try not to link to external data sources. Instead, export a small subset of your dataframe using `dput` and paste the results into a codeblock in your question – yeedle Aug 31 '20 at 14:41

2 Answers2

1

I'm not sure how edifying the final plot is, but you can obtain it with this fully reproducible example:

library(ggplot2)

# Fetch the state age-bracket table; `[-1]` drops its first column.
url <- "https://raw.githubusercontent.com/HopeMuller/CoRo/master/State_Ages.csv"
CoRo_age <- read.csv(file = url)[-1]

# For every state, turn the age_cat string into a vector of bracket widths.
ranges <- lapply(CoRo_age$age_cat, function(age_string) {
  # Close the open-ended final bracket so every entry reads "start-end":
  # "100+" / ">100" become "100-101"; a trailing ",+", "+," or "+" becomes
  # "-100".
  cleaned <- gsub("(100\\+)|(>100)", "100-101", age_string)
  cleaned <- gsub("^(.+),+\\+$", "\\1-100", cleaned)
  cleaned <- gsub("\\+,$", "-100", cleaned)
  cleaned <- gsub("^(.*)\\+$", "\\1-100", cleaned)

  brackets <- strsplit(cleaned, ",")[[1]]

  # Width of each bracket; the + 1 counts both end points.
  sapply(brackets, function(bracket) {
    sapply(strsplit(bracket, "-"), function(bounds) {
      diff(as.numeric(bounds)) + 1
    })
  })
})

# Stack one small data frame per state: the state name, the position of each
# bracket within that state (as a factor), and the bracket width in years.
df <- do.call(
  rbind,
  Map(
    function(state_name, widths) {
      data.frame(
        State = state_name,
        range = factor(seq_along(widths)),
        Years = widths
      )
    },
    CoRo_age$state,
    ranges
  )
)

# Horizontal stacked bars: one bar per state, one black-outlined segment per
# age bracket, coloured by the bracket's position; the legend is hidden.
ggplot(data = df, mapping = aes(x = State, y = Years, fill = range)) +
  geom_col(colour = "black", width = 1) +
  theme_bw() +
  theme(legend.position = "none") +
  coord_flip()

enter image description here

Allan Cameron
  • 147,086
  • 7
  • 49
  • 87
1

To be honest, this question is really multiple questions disguised as one, and these questions are not so much about ggplot as they are about process: how do we go from a raw and untidy dataframe to the desired plot?

However, given that this is your first question on stackoverflow, I've attempted to answer your question and I hope that you find both value in the answer as well as direction in the future as to how to break up your question (and your code!) into their individual parts.

So first things first.

  1. Separate age increments into separate columns
  2. Get rid of all the final entries with a + (or ideally change them all to ... to 100)

You've correctly used separate to accomplish the first step (although there's a way to programmatically produce the names of the columns without typing them out, as you'll see soon, but your approach is correct). However, the issue with doing so in the first step becomes apparent immediately because you then have to modify by hand each of the newly created columns. (Additionally, you are setting NA data to empty strings (""), which is probably where your problem comes from in the first place).

If you take a step back and think about this, you'll find that there's an easier way! We can modify the increments string before doing the separation, removing any entries with a following +. That way, after the separation we have only the values we want:

# Remove the open-ended entry (a comma, optionally two digits, then "+") from
# each age_cat string, then split the remainder on commas into the columns
# increment_1 ... increment_17.
# NOTE(review): the pattern allows at most two digits before the "+", so an
# entry such as ",100+" would presumably be left in place -- confirm.
# NOTE(review): str_remove() is a stringr function and no snippet shown here
# loads stringr -- TODO confirm it is attached.
coro_age <- coro_age %>%
  mutate(
    age_cat = str_remove(string = age_cat, pattern = ",([0-9]{2})?\\+")
  ) %>%
  separate(
    col = age_cat,
    into = paste0('increment_', seq_along(1:17)),
    sep = ","
  )

I've used the regex ",([0-9]{2})?\\+" to find any entry that starts with a comma, is optionally followed by two digits, and ends with a plus sign.

After we've removed the offending entries, separation is a breeze. Notice that I've used seq_along in combination with paste0 to produce the necessary column names rather than typing them all out. Also, I've used underscores in the column names since they are generally more programming-friendly.

Calculate the range of each increment

One big red flag in your code is using parse and eval to find the range values. Very, very rarely is it necessary to reach for these in the course of doing regular data tidying. They can also be dangerous when the data comes from unknown sources. What's more, if you can do it "tidy-ly", rather than using apply, do it that way!

I've also noticed that right before plotting, you use gather to turn the dataframe into a tidy dataframe. But, it always pays to get the dataframe into a tidy shape as soon as possible, the earlier the better! So let's tidy the dataframe first, and then calculate the ranges:

# Reshape to long form: every column except `state` is stacked into the pair
# increment_name (old column name) / increment (the "start-end" label).
# Each label is then split on "-" into `start` and `end`; remove = F keeps the
# original `increment` column alongside them. Finally both bounds are
# converted to integers and `range` is computed as end minus start.
coro_age <- coro_age %>%
  gather(key = "increment_name", value = "increment", -state) %>%
  separate(
    col = increment,
    into = c("start", "end"),
    sep = "-",
    remove = F
  ) %>%
  mutate(start = as.integer(start)) %>%
  mutate(end = as.integer(end)) %>%
  mutate(range = end - start)

Here, I gathered all the ranges into one column as a first step. Then, I separated the newly created increment column using - as the separator (I made sure not to remove the existing column with the remove=F argument because we'll need it at the plotting stage). Finally I convert the two new columns to integers, and add a new column (called range) that is the range of each increment.

Now with all that done, plotting is as simple as:

# Horizontal stacked bars: one bar per state, built from the `range` of each
# increment and filled by the increment label.
ggplot(
  data = coro_age,
  mapping = aes(x = state, y = range, fill = increment)
) +
  geom_col() +
  coord_flip()

Notice that I've used geom_col, which is an alias for geom_bar(stat="identity"), and that I've added the fill argument to the aes function in the ggplot call.

The above produces the following plot:

plot

I hope that this is what you were going for and that I helped you along on your #rstats journey!

yeedle
  • 4,918
  • 1
  • 22
  • 22