1

I have data sets in a list where a classification has been assigned as a word or a phrase. I would like to find the most common classification in a sequence.

Here's my data:

## Create some random data
id <- seq(0, 8, .5)
class <- c("Frequent", "Often","Occasional", "Often", "Not Seen", "Frequent", "Rare", "Occasional", "Very Rare", "Absent",
           "Frequent", "Frequent", "Very Rare", "Often", "Not Seen", "Occasional", "Rare")
df <- data.frame(id, class)
print(df)

## Another two data sets to form a list:
id <- seq(0, 8, .5)
class <- c("Frequent", "Often","Occasional", "Often", "Not Seen", "Frequent", "Rare", "Occasional", "Very Rare", "Absent",
            "Not Seen", "Occasional", "Rare","Occasional", "Often", "Very Rare","Occasional")
df1 <- data.frame(id, class)

id <- seq(0, 8, .5)
class <- c("Frequent", "Often","Occasional", "Often", "Not Seen", "Frequent", "Rare", "Occasional", "Very Rare", "Absent",
           "Frequent", "Often","Occasional", "Often", "Not Seen", "Frequent", "Rare")
df2 <- data.frame(id, class)

## make a list
filez <- list(df, df1, df2)

I get the output:

> print(df)
    id      class
1  0.0   Frequent
2  0.5      Often
3  1.0 Occasional
4  1.5      Often
5  2.0   Not Seen
6  2.5   Frequent
7  3.0       Rare
8  3.5 Occasional
9  4.0  Very Rare
10 4.5     Absent
11 5.0   Frequent
12 5.5   Frequent
13 6.0  Very Rare
14 6.5      Often
15 7.0   Not Seen
16 7.5 Occasional
17 8.0       Rare

I then assign a numerical value to a class (To me it makes more sense to work with numbers than character strings)

for (i in 1:length(filez)) {
filez$classnum <- as.numeric(revalue(df$class, 
                       c("Frequent"= 1, "Often"= 2,"Occasional" =3, "Rare"= 4, 
                         "Very Rare"= 5, "Not Seen" = 6, "Absent" = 7))
}

How would I now go about finding the most common classification per sequence group? I've tried using cut and aggregate and I feel like I'm almost there, but the output returns the groups with the word 'numeric'. What can I do to get it to give me the number between 1-7?

new_seq <- seq(0,8,2)
for(i in 1:length(filez)){
  aggr_func <- function(filez){
    filez$group <- cut(filez[,1], new_seq)
    output <- aggregate(filez[,3], by = list(filez$group), FUN =mode)
    return(output)
  }
  final <- lapply(filez, aggr_func)
}


> print(final[[1]])
  Group.1       x
1   (0,2] numeric
2   (2,4] numeric
3   (4,6] numeric
4   (6,8] numeric
desertnaut
  • 57,590
  • 26
  • 140
  • 166
  • 1
    R's `mode` function returns an object's type, not the statistical mode of its values. To calculate the mode, see: https://stackoverflow.com/questions/2547402/how-to-find-the-statistical-mode – I_O May 16 '23 at 10:52

1 Answers1

0

Use lapply instead of for-loops, and as mentioned in comments we need custom "mode" function.

# rowbind and convert "class" to numeric
out <- do.call(rbind, filez)
out$classnum <- as.integer(
  factor(out$class, levels = c("Frequent", "Often","Occasional", "Rare",
                               "Very Rare", "Not Seen", "Absent")))

Now get the modes, using 2 functions from this post: How to find the statistical mode?

#group by Mode
aggregate(out[ "classnum" ], list(cut(out$id, seq(0, 8, 2))), FUN = Mode)
#    Group.1 classnum
# 1   (0,2]        2
# 2   (2,4]        1
# 3   (4,6]        7
# 4   (6,8]        2
#group by Modes
aggregate(out[ "classnum" ], list(cut(out$id, seq(0, 8, 2))), FUN = Modes)
#   Group.1   classnum
# 1   (0,2]          2
# 2   (2,4] 1, 4, 3, 5
# 3   (4,6]       7, 1
# 4   (6,8]       2, 3
desertnaut
  • 57,590
  • 26
  • 140
  • 166
zx8754
  • 52,746
  • 12
  • 114
  • 209