-1

I have a dataset as below, which has many columns. There are some columns whose headings are :

baked_hamburgur,spinach,mashed_potato,cabbages,jello,rolls,brown,milk,coffee,water,cakes,vanilla,chocolate,fruitsalad

There are other columns as well, but I am only interested as of now in the above columns.

the value in each row of these columns is either: yes, or no.

A screenshot of this data is as under, as I am not able to attach/share this file in the question itself.

My Dataset with many columns for Data Analysis of Ill Patients

The dput(head()) output is as under:

> dput(head(illness_data))

structure(list(Age = structure(c(18L, 26L, 22L, 25L, 29L, 13L
), .Label = c("10", "106", "11", "12", "14", "15", "16", "17", 
"18", "19", "2", "20", "22", "23", "24", "25", "26", "27", "28", 
"30", "31", "32", "33", "34", "36", "38", "39", "4", "42", "43", 
"44", "45", "46", "48", "5", "7", "8", "9", "seven"), class = "factor"), 
    sex = structure(c(3L, 2L, 3L, 3L, 2L, 3L), .Label = c("-1", 
    "Female", "Male"), class = "factor"), timesupper = c(2000L, 
    1830L, 1830L, 1930L, 1930L, 1930L), ill = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L), .Label = "yes", class = "factor"), onsetdate = structure(c(4L, 
    4L, 4L, 1L, 1L, 4L), .Label = c("18-Apr", "18-Jun", "18/4", 
    "19-Apr"), class = "factor"), onsettime = c(30L, 30L, 30L, 
    2230L, 2230L, 200L), baked_hamburgur = structure(c(2L, 2L, 
    2L, 2L, 2L, 1L), .Label = c("no", "yes"), class = "factor"), 
    spinach = structure(c(2L, 2L, 2L, 2L, 2L, 1L), .Label = c("no", 
    "yes"), class = "factor"), mashed_potato = structure(c(2L, 
    2L, 1L, 1L, 2L, 1L), .Label = c("no", "yes"), class = "factor"), 
    cabbages = structure(c(1L, 2L, 1L, 2L, 1L, 1L), .Label = c("no", 
    "yes"), class = "factor"), jello = structure(c(1L, 1L, 1L, 
    2L, 2L, 1L), .Label = c("no", "yes"), class = "factor"), 
    rolls = structure(c(2L, 1L, 1L, 1L, 2L, 1L), .Label = c("no", 
    "yes"), class = "factor"), brown = structure(c(1L, 1L, 1L, 
    1L, 2L, 1L), .Label = c("no", "yes"), class = "factor"), 
    milk = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("no", 
    "yes"), class = "factor"), coffee = structure(c(2L, 2L, 2L, 
    1L, 2L, 1L), .Label = c("no", "yes"), class = "factor"), 
    water = structure(c(1L, 1L, 1L, 2L, 2L, 1L), .Label = c("no", 
    "yes"), class = "factor"), cakes = structure(c(1L, 1L, 2L, 
    1L, 1L, 1L), .Label = c("no", "yes"), class = "factor"), 
    vanilla = structure(c(2L, 2L, 2L, 2L, 2L, 2L), .Label = c("no", 
    "yes"), class = "factor"), chocolate = structure(c(1L, 2L, 
    2L, 1L, 1L, 2L), .Label = c("no", "yes"), class = "factor"), 
    fruitsalad = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("no", 
    "yes", "yes</pre></body></html>Ztext/plain\b\v\035(F]l~Ó_Ý\026R\002\001"
    ), class = "factor")), .Names = c("Age", "sex", "timesupper", 
"ill", "onsetdate", "onsettime", "baked_hamburgur", "spinach", 
"mashed_potato", "cabbages", "jello", "rolls", "brown", "milk", 
"coffee", "water", "cakes", "vanilla", "chocolate", "fruitsalad"
), row.names = c(NA, 6L), class = "data.frame")

A complete dput command output is as under:

> dput(illness_data)

structure(list(Age = structure(c(18L, 26L, 22L, 25L, 29L, 13L, 
36L, 8L, 11L, 7L, 24L, 10L, 8L, 35L, 34L, 6L, 22L, 39L, 12L, 
9L, 36L, 17L, 9L, 20L, 37L, 27L, 32L, 30L, 21L, 24L, 3L, 18L, 
33L, 16L, 5L, 31L, 28L, 14L, 19L, 38L, 2L, 4L, 23L, 1L, 18L, 
15L), .Label = c("10", "106", "11", "12", "14", "15", "16", "17", 
"18", "19", "2", "20", "22", "23", "24", "25", "26", "27", "28", 
"30", "31", "32", "33", "34", "36", "38", "39", "4", "42", "43", 
"44", "45", "46", "48", "5", "7", "8", "9", "seven"), class = "factor"), 
    sex = structure(c(3L, 2L, 3L, 3L, 2L, 3L, 3L, 3L, 2L, 3L, 
    3L, 2L, 3L, 3L, 2L, 2L, 3L, 3L, 3L, 2L, 2L, 1L, 3L, 3L, 3L, 
    2L, 2L, 3L, 2L, 3L, 3L, 3L, 2L, 3L, 2L, 2L, 3L, 3L, 3L, 3L, 
    3L, 3L, 2L, 3L, 2L, 3L), .Label = c("-1", "Female", "Male"
    ), class = "factor"), timesupper = c(2000L, 1830L, 1830L, 
    1930L, 1930L, 1930L, 2200L, 1900L, 1930L, NA, NA, NA, NA, 
    2200L, NA, NA, NA, 2200L, NA, NA, 2200L, 2200L, NA, NA, 2200L, 
    NA, NA, NA, NA, NA, 1900L, NA, 1100L, NA, NA, NA, 2200L, 
    1930L, 1930L, 2200L, NA, NA, 1930L, 1930L, NA, NA), ill = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
    ), .Label = "yes", class = "factor"), onsetdate = structure(c(4L, 
    4L, 4L, 1L, 1L, 4L, 4L, 2L, 4L, 4L, 4L, 1L, 1L, 4L, 1L, 3L, 
    1L, 4L, 1L, 1L, 4L, 4L, 1L, 1L, 4L, 1L, 4L, 4L, 1L, 4L, 4L, 
    1L, 1L, 1L, 1L, 1L, 4L, 4L, 1L, 4L, 4L, 4L, 4L, 1L, 4L, 1L
    ), .Label = c("18-Apr", "18-Jun", "18/4", "19-Apr"), class = "factor"), 
    onsettime = c(30L, 30L, 30L, 2230L, 2230L, 200L, 100L, 2300L, 
    200L, 1030L, 30L, 2215L, 2200L, 100L, 2300L, 2145L, 2145L, 
    100L, 2300L, 2100L, 100L, 100L, 2115L, 2330L, 100L, 2130L, 
    230L, 200L, 2130L, 30L, 100L, 2230L, 1500L, 2400L, 2300L, 
    2230L, 100L, 230L, 2330L, 100L, 30L, 30L, 100L, 2400L, 215L, 
    2300L), baked_hamburgur = structure(c(2L, 2L, 2L, 2L, 2L, 
    1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 
    1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 
    2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L), .Label = c("no", 
    "yes"), class = "factor"), spinach = structure(c(2L, 2L, 
    2L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 
    1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 
    1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 2L), .Label = c("no", 
    "yes"), class = "factor"), mashed_potato = structure(c(2L, 
    2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 
    2L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 
    2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L
    ), .Label = c("no", "yes"), class = "factor"), cabbages = structure(c(1L, 
    2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 
    2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 
    2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 2L
    ), .Label = c("no", "yes"), class = "factor"), jello = structure(c(1L, 
    1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 
    1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 
    1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L
    ), .Label = c("no", "yes"), class = "factor"), rolls = structure(c(2L, 
    1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 
    2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 
    2L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L
    ), .Label = c("no", "yes"), class = "factor"), brown = structure(c(1L, 
    1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 
    2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 
    1L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L
    ), .Label = c("no", "yes"), class = "factor"), milk = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 
    1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
    ), .Label = c("no", "yes"), class = "factor"), coffee = structure(c(2L, 
    2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 
    1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 
    2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L
    ), .Label = c("no", "yes"), class = "factor"), water = structure(c(1L, 
    1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 
    2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 
    1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 1L
    ), .Label = c("no", "yes"), class = "factor"), cakes = structure(c(1L, 
    1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 
    2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 
    1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L
    ), .Label = c("no", "yes"), class = "factor"), vanilla = structure(c(2L, 
    2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
    2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 
    2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L
    ), .Label = c("no", "yes"), class = "factor"), chocolate = structure(c(1L, 
    2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 
    1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, NA, 1L, 1L, 
    2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L
    ), .Label = c("no", "yes"), class = "factor"), fruitsalad = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L
    ), .Label = c("no", "yes", "yes</pre></body></html>Ztext/plain\b\v\035(F]l~Ó_Ý\026R\002\001"
    ), class = "factor")), .Names = c("Age", "sex", "timesupper", 
"ill", "onsetdate", "onsettime", "baked_hamburgur", "spinach", 
"mashed_potato", "cabbages", "jello", "rolls", "brown", "milk", 
"coffee", "water", "cakes", "vanilla", "chocolate", "fruitsalad"
), class = "data.frame", row.names = c(NA, -46L))

R has correctly read these columns as Factor type variable(s).

Now, each of these columns correspond to what every ill patient in a hospital has consumed.

I'd like to know the most consumed food item by ill patients, using R.

Kindly advise a good way to do so. Thanks!

Note, I have not tried any other options, than the ones mentioned in this URL below. However, I could not make it work.

Count Factor Columns Using R

  • 1
    Please show a small reproducible example with `dput` and expected output – akrun Dec 12 '19 at 06:28
  • 1
    Also include what have you tried to code so far and how is it not working – Andreas Dec 12 '19 at 06:29
  • Hello, @Andreas I have not tried any code so far other than what I found from this URL https://stackoverflow.com/questions/26114525/how-to-count-how-many-values-per-level-in-a-given-factor. However, none of the methods mentioned here works in my case. I have updated my question description as well. – Suddhasatwa Bhaumik Dec 13 '19 at 09:37

2 Answers2

1

I am not completely sure what you are looking for, but this will calculate how often foods are consumed (using the tidyverse package):

library(tidyverse)
illness_data_summed <- illness_data %>%
  mutate_at(vars(-Age, -sex, -timesupper,-onsetdate,-onsettime), ~ifelse(. == "yes", 1,0)) %>% 
  summarise_at(vars(-Age, -sex, -timesupper,-onsetdate,-onsettime, -ill), ~sum(., na.rm = TRUE)) 

illness_data_summed[which(illness_data_summed == max(illness_data_summed))]

So first I convert the yes into 1 and no into 0, which makes the sum a representation of the number of times the specific food was eaten. I do it for all columns except those you are not interested in (indicated by the - in vars) but you can also reverse that if that is desirable (e.g., when the number of vars yo uwant to convert is lower than those you do not want to convert).

The last part will result in:

  vanilla
1      43
Annet
  • 846
  • 3
  • 14
1

Since all the rows have ill = 'yes' we can count in each column the number of values with yes in it. A base R approach could be :

head(sort(colSums(illness_data[7:20] == "yes"), decreasing = TRUE), 5)

#        vanilla baked_hamburgur           cakes         spinach   mashed_potato 
#             43              29              27              26              23 

I have selected columns 7 to 20 because those are the only columns where food item is present. Also I have selected only top 5 values, you can select any value by changing the number 5 in head command.

Ronak Shah
  • 377,200
  • 20
  • 156
  • 213
  • This is by far the most simplest and accurate answer! BTW, is there a way to represent the output as a dataframe and then plot it using plot() function in R? – Suddhasatwa Bhaumik Dec 23 '19 at 09:47
  • @SuddhasatwaBhaumik yes, you can do `plot(stack(head(sort(colSums(illness_data[7:20] == "yes"), decreasing = TRUE), 5))[2:1])` – Ronak Shah Dec 23 '19 at 10:03