Add new column to state data frame based on other column data

Question

I am attempting to add a new column to the state sample data frame in R. I am hoping for this column to cluster the ID of states into broader categories (1-4). My code is close to what I am looking for but I am not getting it quite right.. I know I could enter each state ID line by line but is there a a quicker way? Thank you!

library(tidyverse)

#Add column to denote each state

States=state.x77 
States=data.frame(States)
States <- tibble::rowid_to_column(States, "ID")
States

#Create new variable for state buckets

States <- States %>% 
  mutate(WAGE_BUCKET=case_when(ID <= c(1,12) ~ '1',
                               ID <= c(13,24) ~ '2',
                               ID <= c(25,37) ~ '3',
                               ID <= c(38,50) ~ '4',
                              TRUE ~ 'NA'))
View(States) #It is not grouping the states in the way I want/I am still getting some NA values but unsure why!

score 1 · Answer 1 · answered Mar 05 '21 at 19:02

You can use cut or findInterval if all of your groups will be using contiguous ID values:

findInterval(States$ID, c(0, 12, 24, 37, 51))
#  [1] 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4

If you want to make it a bit more verbose, you can use dplyr::between in your case_when:

States %>%
  mutate(
    WAGE_BUCKET = case_when(
      between(ID, 1, 12) ~ "1",
      between(ID, 13, 24) ~ "2",
      between(ID, 25, 37) ~ "3",
      between(ID, 38, 50) ~ "4",
      TRUE ~ NA_character_)
  )
#    ID Population Income Illiteracy Life Exp Murder HS Grad Frost   Area WAGE_BUCKET
# 1   1       3615   3624        2.1    69.05   15.1    41.3    20  50708           1
# 2   2        365   6315        1.5    69.31   11.3    66.7   152 566432           1
# 3   3       2212   4530        1.8    70.55    7.8    58.1    15 113417           1
# 4   4       2110   3378        1.9    70.66   10.1    39.9    65  51945           1
# 5   5      21198   5114        1.1    71.71   10.3    62.6    20 156361           1
# 6   6       2541   4884        0.7    72.06    6.8    63.9   166 103766           1
# 7   7       3100   5348        1.1    72.48    3.1    56.0   139   4862           1
# 8   8        579   4809        0.9    70.06    6.2    54.6   103   1982           1
# 9   9       8277   4815        1.3    70.66   10.7    52.6    11  54090           1
# 10 10       4931   4091        2.0    68.54   13.9    40.6    60  58073           1
# 11 11        868   4963        1.9    73.60    6.2    61.9     0   6425           1
# 12 12        813   4119        0.6    71.87    5.3    59.5   126  82677           1
# 13 13      11197   5107        0.9    70.14   10.3    52.6   127  55748           2
# 14 14       5313   4458        0.7    70.88    7.1    52.9   122  36097           2
# 15 15       2861   4628        0.5    72.56    2.3    59.0   140  55941           2
# 16 16       2280   4669        0.6    72.58    4.5    59.9   114  81787           2
# 17 17       3387   3712        1.6    70.10   10.6    38.5    95  39650           2
# 18 18       3806   3545        2.8    68.76   13.2    42.2    12  44930           2
# 19 19       1058   3694        0.7    70.39    2.7    54.7   161  30920           2
# 20 20       4122   5299        0.9    70.22    8.5    52.3   101   9891           2
# 21 21       5814   4755        1.1    71.83    3.3    58.5   103   7826           2
# 22 22       9111   4751        0.9    70.63   11.1    52.8   125  56817           2
# 23 23       3921   4675        0.6    72.96    2.3    57.6   160  79289           2
# 24 24       2341   3098        2.4    68.09   12.5    41.0    50  47296           2
# 25 25       4767   4254        0.8    70.69    9.3    48.8   108  68995           3
# 26 26        746   4347        0.6    70.56    5.0    59.2   155 145587           3
# 27 27       1544   4508        0.6    72.60    2.9    59.3   139  76483           3
# 28 28        590   5149        0.5    69.03   11.5    65.2   188 109889           3
# 29 29        812   4281        0.7    71.23    3.3    57.6   174   9027           3
# 30 30       7333   5237        1.1    70.93    5.2    52.5   115   7521           3
# 31 31       1144   3601        2.2    70.32    9.7    55.2   120 121412           3
# 32 32      18076   4903        1.4    70.55   10.9    52.7    82  47831           3
# 33 33       5441   3875        1.8    69.21   11.1    38.5    80  48798           3
# 34 34        637   5087        0.8    72.78    1.4    50.3   186  69273           3
# 35 35      10735   4561        0.8    70.82    7.4    53.2   124  40975           3
# 36 36       2715   3983        1.1    71.42    6.4    51.6    82  68782           3
# 37 37       2284   4660        0.6    72.13    4.2    60.0    44  96184           3
# 38 38      11860   4449        1.0    70.43    6.1    50.2   126  44966           4
# 39 39        931   4558        1.3    71.90    2.4    46.4   127   1049           4
# 40 40       2816   3635        2.3    67.96   11.6    37.8    65  30225           4
# 41 41        681   4167        0.5    72.08    1.7    53.3   172  75955           4
# 42 42       4173   3821        1.7    70.11   11.0    41.8    70  41328           4
# 43 43      12237   4188        2.2    70.90   12.2    47.4    35 262134           4
# 44 44       1203   4022        0.6    72.90    4.5    67.3   137  82096           4
# 45 45        472   3907        0.6    71.64    5.5    57.1   168   9267           4
# 46 46       4981   4701        1.4    70.08    9.5    47.8    85  39780           4
# 47 47       3559   4864        0.6    71.72    4.3    63.5    32  66570           4
# 48 48       1799   3617        1.4    69.48    6.7    41.6   100  24070           4
# 49 49       4589   4468        0.7    72.48    3.0    54.5   149  54464           4
# 50 50        376   4566        0.6    70.29    6.9    62.9   173  97203           4

akrun · Answer 2 · 2021-03-05T19:14:58.797

It is a vector of length > 1. The comparison operators works on a single vector. We could use between

library(dplyr)
States <- States %>% 
   mutate(WAGE_BUCKET=case_when(between(ID, 1, 12) ~ '1',
                           between(ID, 13,24) ~ '2',
                           between(ID, 25,37) ~ '3',
                           between(ID, 38,50) ~ '4',
                          TRUE ~ NA_character_))

Or another option is to use & with > and <=

States %>% 
  mutate(WAGE_BUCKET=case_when(ID >= 1 & ID <=12  ~ '1',
                           ID >= 13 & ID <= 24) ~ '2',
                           ID >= 25 & ID <= 37 ~ '3',
                           ID >= 38 & ID <= 50 ~ '4',
                          TRUE ~ NA_character))

Or may be the OP meant to use %in%

States %>% 
  mutate(WAGE_BUCKET=case_when(ID %in% c(1,12) ~ '1',
                               ID  %in% c(13,24) ~ '2',
                               ID %in% c(25,37) ~ '3',
                               ID %in% c(38,50) ~ '4',
                              TRUE ~ NA_character_))

Add new column to state data frame based on other column data

2 Answers2