0

I have this in my dataframe after reading and rearranging multiple csv files. Basically I want an if else ladder to refer to the ID column and if it matches a number from the list of concatenates then place a word in a new "group" column

    # of int.   int.     not.int.  ID
1      50      218.41     372.16    1
3      33      134.94     158.17    3

I then made these concatenates to refer to.

veh = as.character(c('1', '5'))
thc1 = as.character(c('2', '6'))
thc2 = as.character(c('3', '7'))
thc3 = as.character(c('4', '8')) 

Then I made an if else ladder to list the corresponding values.

social.dat$group =  if (social.dat$ID == veh) {
     social.dat$group == "veh"
     } else if (social.dat$group == thc1) {
     social.dat$group == "thc1"
     } else if (social.dat$group == thc2) {
     social.dat$group == "thc2"
     } else {
     social.dat$group == "thc3"
     }

However, I then get this warning message.

Warning message:
In if (social.dat$ID == veh) { :
the condition has length > 1 and only the first element will be used

I have looked up this warning message in multiple different variations and have not found anything that really helped. Any help for this would be much appreciated or and alternate options would be good as well. I apologize in advance if there was a solution on stack already if I missed it.

EDIT: I tried using

 social.dat$group = ifelse(social.dat$ID == veh, "veh", "thc")
 social.dat$group = ifelse(social.dat$ID == thc, "thc", "veh")

but it changed the output of the dataframe after each line.

Here is the full code i am using to rearrange the csv files and get the dataframe that I first mentioned above.

#calls packages
library(tidyr)
library( plyr )
library(reshape2)
#make sure to change your working directory to where the files are kept
setwd("C:/Users/callej03/Desktop/test")
wd = "C:/Users/callej03/Desktop/test"
files = list.files(path=wd, pattern="*.csv", full.names=TRUE, 
recursive=FALSE)

################################################################
#this function creates a list of the number of interactions for each file in 
the folder
lap.list = lapply(files, function(x) {
dat = read.csv(x, header= TRUE) 
dat = dat[-c(1),]
dat = as.data.frame(dat)
dat = separate(data = dat, col = dat, into = c("lap", "duration"), sep = "\\      
")
dat$count = 1:nrow(dat) 
y = dat$count
i= y%%2==0
dat$interacting = i
int = dat[which(dat$interacting == TRUE),]
interactions = sum(int$interacting)
})
#########################################################################
#this changes the row name to the name of the file - i.e. the rat ID
lap.list = as.data.frame(lap.list)
lap.list = t(lap.list)
colnames(lap.list) = c("# of int.")
row.names(lap.list) = sub(wd, "", files)
row.names(lap.list) = gsub("([0-9]+).*$", "\\1", rownames(lap.list))
row.names(lap.list) = gsub('/', "", row.names(lap.list), fixed = TRUE)

###########################################################################
#this applies almost the same function as the one listed above except I call 
it a different vector name so it can be manipulated
int.duration = lapply(files, function(x) {
dat2 = read.csv(x, header= TRUE) 
dat2 = dat2[-c(1),]
dat2 = as.data.frame(dat2)
dat2 = separate(data = dat2, col = dat2, into = c("lap", "duration"), sep = 
"\\      ")
dat2$count = 1:nrow(dat2) 
y = dat2$count
i= y%%2==0
dat2$interacting = i
int = dat2[which(dat2$interacting == TRUE),]
})

noint.duration = lapply(files, function(x) {
dat2 = read.csv(x, header= TRUE) 
dat2 = dat2[-c(1),]
dat2 = as.data.frame(dat2)
dat2 = separate(data = dat2, col = dat2, into = c("lap", "duration"), sep = 
"\\      ")
dat2$count = 1:nrow(dat2) 
y = dat2$count
i= y%%2==0
dat2$interacting = i
noint = dat2[which(dat2$interacting == FALSE),]
})
###################################################################
#this splits the output time of minutes, seconds, and milliseconds. 
#then it combines them into a total seconds.milliseconds readout.
#after that, it takes the sum of the times for each file and combines them 
with the total interactions dataframe.
int.duration = melt(int.duration)
int.duration = as.data.frame(int.duration)
int.left = as.data.frame(substr(int.duration$duration, 1, 2))
colnames(int.left) = "min"
int.mid = as.data.frame(substr(int.duration$duration, 4, 4 + 2 - 1))
colnames(int.mid) = "sec"
int.right = as.data.frame(substr(int.duration$duration, 
nchar(int.duration$duration) - (2-1), nchar(int.duration$duration)))
colnames(int.right) = "ms"
int.time = cbind(int.left, int.mid, int.right)
int.time$min = as.numeric(as.character(int.time$min))
int.time$sec = as.numeric(as.character(int.time$sec))
int.time$ms = as.numeric(as.character(int.time$ms))
int.time$ms = int.time$ms/100
int.time$min = ifelse(int.time$min > 0, int.time$min*60,0) 
int.time$sum = rowSums(int.time)
int.file = as.data.frame(int.duration$L1)
colnames(int.file) = "file"
int.time = cbind(int.time, int.file)
int.tot = as.data.frame(tapply(int.time$sum, int.time$file, sum))
colnames(int.tot) = "int."
social.dat = cbind(lap.list, int.tot)

noint.duration = melt(noint.duration)
noint.duration = as.data.frame(noint.duration)
noint.left = as.data.frame(substr(noint.duration$duration, 1, 2))
colnames(noint.left) = "min"
noint.mid = as.data.frame(substr(noint.duration$duration, 4, 4 + 2 - 1))
colnames(noint.mid) = "sec"
noint.right = as.data.frame(substr(noint.duration$duration, 
nchar(noint.duration$duration) - (2-1), nchar(noint.duration$duration)))
colnames(noint.right) = "ms"
noint.time = cbind(noint.left, noint.mid, noint.right)
noint.time$min = as.numeric(as.character(noint.time$min))
noint.time$sec = as.numeric(as.character(noint.time$sec))
noint.time$ms = as.numeric(as.character(noint.time$ms))
noint.time$ms = noint.time$ms/100
noint.time$min = ifelse(noint.time$min > 0, noint.time$min*60,0) 
noint.time$sum = rowSums(noint.time)
noint.file = as.data.frame(noint.duration$L1)
colnames(noint.file) = "file"
noint.time = cbind(noint.time, noint.file)
noint.tot = as.data.frame(tapply(noint.time$sum, noint.time$file, sum))
colnames(noint.tot) = "not.int."
social.dat = cbind(social.dat, noint.tot)
social.dat$ID = rownames(social.dat)

Here is and axample of a csv file I am working with. The words are all in the same column and separated by spaces.

Total time  10:00.61
Lap times
01      00:07.46
02      00:05.64
03      00:01.07
04      00:01.04
05      00:04.71
06      00:06.43
07      00:12.52
08      00:07.34
09      00:05.46
10      00:05.81
11      00:05.52
12      00:06.51
13      00:10.75
14      00:00.83
15      00:03.64
16      00:02.75
17      00:01.20
18      00:06.17
19      00:04.40
20      00:00.75
21      00:00.84
22      00:01.29
23      00:02.31
24      00:03.04
25      00:02.85
26      00:05.86
27      00:05.76
28      00:05.06
29      00:00.96
30      00:06.91
jc2525
  • 141
  • 10
  • 3
    Use `ifelse` instead of `if/else` where length of vector is greater than 1 – akrun Jun 25 '18 at 15:14
  • 1
    Check out section 3.2 of the [R inferno](https://www.burns-stat.com/pages/Tutor/R_inferno.pdf) – MrFlick Jun 25 '18 at 15:17
  • 1
    What's in `social.dat$group`? Is that a vector of length 2? When asking for help, you should include a simple [reproducible example](https://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example) with sample input and desired output that can be used to test and verify possible solutions. – MrFlick Jun 25 '18 at 15:19
  • @MrFlick social.dat is the dataframe i first listed. I want add the group column to it – jc2525 Jun 25 '18 at 15:21
  • So is the `social.dat$group == thc1` part supposed to be `social.dat$ID == thc1`? You seem to be referencing a column that doesn't exist yet. – MrFlick Jun 25 '18 at 15:24
  • @akrun I edited my post, and used ifelse but still got the same error. – jc2525 Jun 25 '18 at 15:46
  • @MrFlick, see my edited post. I also, changed the column name as you suggested but still got the same error. – jc2525 Jun 25 '18 at 15:46
  • If you want to match more than one value you need `social.dat$ID %in% veh`, not `social.dat$ID == veh`. See the `?match` help page. – MrFlick Jun 25 '18 at 15:48

2 Answers2

3

@akrun suggested ifelse, which works great for one or two nestings. Much past that, and my personal preference is to use dplyr::case_when or a separate data.frame in a merge/join of sorts.

If you are using the "simple case" of assigning consistently by the same fields (id in this case), then the merge/join is my preferred method: it makes maintenance much simpler (IMO). (When I say "consistently by the same fields", I mean that you could have a id1 and id2 fields by which you define the individual records and their applicable groups. Likely too much for your example, so I'll keep this answer at one key merging.)

Three methods (data far below):

Base R

dat2a <- merge(dat, groups, by="id", all.x=TRUE)
dat2a
#   id int group
# 1  1  22   veh
# 2  2  33  thc1
# 3  3  44  <NA>

Note that any id not included in the definition of groups will have NA group. You can assign a default group with this:

dat2a$group[is.na(dat2a$group)] <- "somedefaultgroup"
dat2a
#   id int            group
# 1  1  22              veh
# 2  2  33             thc1
# 3  3  44 somedefaultgroup

dplyr, merge/join

Similar concept, but using dplyr-esque verbs.

library(dplyr)
dat2c <- left_join(dat, groups, by="id") %>%
  mutate(group = if_else(is.na(group), "somedefaultgroup", group))

dplyr::case_when

(This does not use groups as I defined for the merge/join cases.)

In case you really want to do some ladder/nesting of if/else-like statements, case_when is easier to read (and debug) and might be faster, depending on your use-case.

Most direct:

library(dplyr)
dat2b <- dat
dat2b$group <- case_when(
  dat2b$id %in% c("1","5") ~ "veh",
  dat2b$id %in% c("2","6") ~ "thc1",
  TRUE                     ~ "somedefaultgroup"
)

A little easier to read than the previous by using with(...), but functionally identical. (If your "ladder" is much longer, then code-golf (number of characters in the code) can be significantly reduced.)

dat2b <- dat
dat2b$group <- with(dat2b, case_when(
  id %in% c("1","5") ~ "veh",
  id %in% c("2","6") ~ "thc1",
  TRUE               ~ "somedefaultgroup"
))

If you want to use some dplyr verbs, then:

dat2b <- dat
dat2b <- dat2b %>%
  mutate(
    group = case_when(
      id %in% c("1","5") ~ "veh",
      id %in% c("2","6") ~ "thc1",
      TRUE               ~ "somedefaultgroup"
    )
  )

Data

When doing merge/join actions, it's important to use stringsAsFactors=FALSE so that the absence of factor levels (of the newly-assigned groups) is not a problem. (This can be worked around, but ...)

dat <- data.frame(id=c("1","2","3"), int=c(22L,33L,44L),
                  stringsAsFactors=FALSE)

Optional use for the merge examples above:

groups <- data.frame(id=c("1","5","2","6"), group=c("veh","veh","thc1","thc1"),
                  stringsAsFactors=FALSE)
groups
#   id group
# 1  1   veh
# 2  5   veh
# 3  2  thc1
# 4  6  thc1

The premise is that you define one row for each unique id.

r2evans
  • 141,215
  • 6
  • 77
  • 149
0

Thanks to @r2evans the following code worked exactly as I wanted it to (using dplyr::case_when)

social.dat$group = case_when(
  social.dat$ID %in% c("1","5") ~ "veh",
  social.dat$ID %in% c("2","6") ~ "thc1",
  social.dat$ID %in% c("3","7") ~ "thc2",
  social.dat$ID %in% c("4","8") ~ "thc3"
  )

This was the final output of the dataframe

    # of int.   int.   not.int.   ID   group
1      50      218.41   372.16    1     veh
3      33      134.94   158.17    3    thc2
jc2525
  • 141
  • 10