0

I have a dataset of which I want to create a new column with names small, medium and large, these will be dependent on the values of the year 2014. where 1:5 = small and 6:9 = medium whilst 10:14 = large. The values will be yes or no, dependning on the value of the year.

This is how my dataset looks like:

 A tibble: 330 x 4
   LOC_ID     season `2014` `2015`
   <chr>      <chr>   <int>  <int>
 1 LOC1002793 Summer     12     NA
 2 LOC1002793 Winter      6     NA
 3 LOC1004001 Winter     NA      1
 4 LOC1004488 Winter      8     NA
 5 LOC1012349 Summer     12     12
 6 LOC1012349 Winter     11     12
 7 LOC1019836 Summer     14     10
 8 LOC1019836 Winter     12     12
 9 LOC1022032 Winter     NA      1
10 LOC1034172 Summer     13     11
# ... with 320 more rows

The counts are split into years 2014 and 2015, these counts represent the amount of times in a year, within the weeks of that season, that the locality id made an observation (in this case the observations are birds).

I want something like this (I will be separating the 2014/2015 dataframes into two dataframes, so doing it for one year I can then replicate the code for the other):

 A tibble: 330 x 4
   LOC_ID     season `2014` `2015`  small    medium   large
   <chr>      <chr>   <int>  <int>
 1 LOC1002793 Summer     12     NA   no        no      yes
 2 LOC1002793 Winter      6     NA   no        yes      no
 3 LOC1004001 Winter     NA      1   
 4 LOC1004488 Winter      8     NA   no        yes      no
 5 LOC1012349 Summer     12     12   no        no      yes
 6 LOC1012349 Winter     11     12   .         .        .
 7 LOC1019836 Summer     14     10   .         .        .
 8 LOC1019836 Winter     12     12
 9 LOC1022032 Winter     NA      1
10 LOC1034172 Summer     13     11
# ... with 320 more rows

This is what I have tried:

#replicate the years and name those replicates test1 = 2014 and test2 = 2015
A tibble: 330 x 6
   LOC_ID     season test1 `2014` test2 `2015`
   <chr>      <chr>  <int>  <int> <int>  <int>
 1 LOC1002793 Summer    12     12    NA     NA
 2 LOC1002793 Winter     6      6    NA     NA
 3 LOC1004001 Winter    NA     NA     1      1
 4 LOC1004488 Winter     8      8    NA     NA
 5 LOC1012349 Summer    12     12    12     12
 6 LOC1012349 Winter    11     11    12     12
 7 LOC1019836 Summer    14     14    10     10
 8 LOC1019836 Winter    12     12    12     12
 9 LOC1022032 Winter    NA     NA     1      1
10 LOC1034172 Summer    13     13    11     11
# ... with 320 more rows

ld <- d %>% mutate(test1 = recode(test1, `1:5` = 'low', `6:9` = 'medium', `10:14` = 'high')) %>% pivot_wider(names_from = test1, values_from = '2014')

Reproducible code:

structure(list(LOC_ID = c("LOC1002793", "LOC1002793", "LOC1004001", 
"LOC1004488", "LOC1012349", "LOC1012349", "LOC1019836", "LOC1019836", 
"LOC1022032", "LOC1034172", "LOC1034172", "LOC1039789", "LOC1040038", 
"LOC1040038", "LOC1047222314194", "LOC1047222314194", "LOC1048553080056", 
"LOC1049318", "LOC1049318", "LOC1049970899816", "LOC1049970899816", 
"LOC1066628", "LOC1066628", "LOC1071566", "LOC1071566", "LOC1071569", 
"LOC1071569", "LOC1073191", "LOC1073191", "LOC1073423", "LOC1073423", 
"LOC1079978", "LOC1079978", "LOC1083442", "LOC1083442", "LOC1086293", 
"LOC1086293", "LOC1087213", "LOC1087213", "LOC1088795", "LOC1088795", 
"LOC1122438", "LOC1122438", "LOC1139319877260", "LOC1139319877260", 
"LOC1153084541859", "LOC1153084541859", "LOC1155749", "LOC1163128", 
"LOC1163128", "LOC1234081", "LOC1234081", "LOC1289919", "LOC1289919", 
"LOC1294966340210", "LOC1300602115", "LOC1300602115", "LOC1300602122", 
"LOC1300602122", "LOC1300602135", "LOC1300602135", "LOC1300602161", 
"LOC1300602161", "LOC1300602184", "LOC1300602184", "LOC1300602196", 
"LOC1300602196", "LOC1300602243", "LOC1300602243", "LOC1300602306", 
"LOC1300602306", "LOC1300604079", "LOC1300604079", "LOC1300604135", 
"LOC1300604135", "LOC1300604635", "LOC1300604635", "LOC1300604699", 
"LOC1300604699", "LOC1300604713"), season = c("Summer", "Winter", 
"Winter", "Winter", "Summer", "Winter", "Summer", "Winter", "Winter", 
"Summer", "Winter", "Winter", "Summer", "Winter", "Summer", "Winter", 
"Summer", "Summer", "Winter", "Summer", "Winter", "Summer", "Winter", 
"Summer", "Winter", "Summer", "Winter", "Summer", "Winter", "Summer", 
"Winter", "Summer", "Winter", "Summer", "Winter", "Summer", "Winter", 
"Summer", "Winter", "Summer", "Winter", "Summer", "Winter", "Summer", 
"Winter", "Summer", "Winter", "Winter", "Summer", "Winter", "Summer", 
"Winter", "Summer", "Winter", "Winter", "Summer", "Winter", "Summer", 
"Winter", "Summer", "Winter", "Summer", "Winter", "Summer", "Winter", 
"Summer", "Winter", "Summer", "Winter", "Summer", "Winter", "Summer", 
"Winter", "Summer", "Winter", "Summer", "Winter", "Summer", "Winter", 
"Summer"), `2015` = c(NA, NA, 1L, NA, 12L, 12L, 10L, 12L, 1L, 
11L, 12L, NA, 2L, 5L, 11L, 9L, NA, 5L, 7L, 13L, 12L, 9L, 11L, 
9L, 11L, 11L, 7L, 7L, 12L, 8L, 12L, 12L, 7L, 13L, 12L, 12L, 12L, 
4L, 8L, 7L, 10L, 7L, 4L, 12L, 12L, 2L, 5L, NA, NA, 9L, 12L, 7L, 
11L, 4L, 8L, 11L, 12L, 13L, 12L, 10L, 12L, 12L, 12L, 11L, 4L, 
13L, 12L, 12L, 12L, 10L, 10L, 11L, 10L, 12L, 11L, 9L, 11L, 9L, 
10L, 13L)), row.names = c(NA, -80L), class = c("tbl_df", "tbl", 
"data.frame"))

Lime
  • 738
  • 5
  • 17
  • Can you run `dput(head(data, 10))` and paste the outputs into the question? This makes it easier to copy your data into R for us to work with to provide you the best possible answer. – Ben Norris Oct 20 '20 at 10:35
  • oops I was meant to do that, have a look now – Lime Oct 20 '20 at 10:36

1 Answers1

1

Here is a tidyverse way.

library(dplyr)
library(tidyr)

data %>% 
  mutate(names = cut(`2014`,    # cut converts a numeric vector into a factor
                     breaks = c(1, 5, 9, 14), 
                     labels = c("small", "medium", "large")),
         values = if_else(is.na(names), "NA" , "yes")) %>%   # adds your "yes" values
  pivot_wider(names_from = names, 
              values_from = values) %>%  # makes the cut labels into columns
  select(-`NA`) %>% # an undesired `NA` column was created and needs to be removed
  mutate(across(large:small, 
                ~replace_na(., "no"))) # replace NA values with "no"

# A tibble: 50 x 7
   LOC_ID     season `2014` `2015` large medium small
   <chr>      <chr>   <int>  <int> <chr> <chr>  <chr>
 1 LOC1002793 Summer     12     NA yes   no     no   
 2 LOC1002793 Winter      6     NA no    yes    no   
 3 LOC1004001 Winter     NA      1 no    no     no   
 4 LOC1004488 Winter      8     NA no    yes    no   
 5 LOC1012349 Summer     12     12 yes   no     no   
 6 LOC1012349 Winter     11     12 yes   no     no   
 7 LOC1019836 Summer     14     10 yes   no     no   
 8 LOC1019836 Winter     12     12 yes   no     no   
 9 LOC1022032 Winter     NA      1 no    no     no   
10 LOC1034172 Summer     13     11 yes   no     no   
# ... with 40 more rows
Ben Norris
  • 5,639
  • 2
  • 6
  • 15
  • shouldn't the NAs trigger a blank " " instead of a "no" in the 3 "size" columns? Elegant answer anyways – flo Oct 20 '20 at 10:53
  • This works perfectly for year 2014, although, with year 2015 it fails to provide `yes` with values given `1`, it also returns many `NAs` on the medium column instead of `no`. I have updated the `dput` code for only 2015 if you want to give it a look. – Lime Oct 20 '20 at 11:02
  • I figured it out using this `year_2015[which(year_2015[,4]== 1, ), 6] <- "yes"` to convert the values `1` in small into `no` and replaced the `NAs` using `data[is.na(data)] <- "no"` – Lime Oct 20 '20 at 22:05