1

I have a dataframe that looks like:

df<- data.frame(region= c("1","1","1","1","1","2","2","2","2","2","2"),loc=c("104","104","104","105","106","107","108", "109", "110", "110", "111"), interact= c("A_B", "B_C", "A_B", "B_C", "B_C", "A_B", "G_H", "I_J", "J_K", "L_M", "M_O"))

And I would like to change it from long to wide format so that the region number becomes the variable (column headings) and the rows become counts of the levels of interact occurring in that region. The caveat is I would also like the first row to be a count of unique loc levels in that region. I will first illustrate the intermediate df:

df2<- data.frame(interact= c("", "A_B", "B_C", "G_H", "I_J", "J_K", 
 "L_M", "M_O"), region1= c("3", "2", "3", "0","0","0","0","0"), 
 region2= c("5", "1", "0", "1","1","1","1","1"))

You'll notice that there were 3 unique levels in loc for region 1 and 5 unique levles in loc for region2; thus the first row of numbers indicate the unique loc count in that region. All rows to follow represent the frequency of each interaction type among all loc in that region. However, I do not want this interact column in the final dataframe so the final output should look like:

output<- data.frame(region1= c("3", "2", "3", "0","0","0","0","0"), 
region2= c("5", "1", "0", "1","1","1","1","1"))

I have tried the following, but I get stuck adding a row with counts of unique loc in each region and I know my current steps are not the most efficient methods:

library(tidyr)
df<- df %>% 
group_by(region, interact) %>% 
summarise(freq = n()) 
data_wide <- spread(df, region, freq)
data_wide<- data_wide[,-1]
Danielle
  • 785
  • 7
  • 15

1 Answers1

1

We can do this in two steps using data.table

library(data.table)
d1 <- dcast(setDT(df)[, .(interact = "", uniqueN(loc)), region], 
         interact ~ paste0('region', region))
rbind(d1, dcast(df, interact ~ paste0('region', region), length))
#   interact region1 region2
#1:                3       5
#2:      A_B       2       1
#3:      B_C       3       0
#4:      G_H       0       1
#5:      I_J       0       1
#6:      J_K       0       1
#7:      L_M       0       1
#8:      M_O       0       1

Or using tidyverse

library(tidyverse)
bind_rows(df %>%
            group_by(region = paste0('region', region)) %>% 
            summarise(interact = "", V1 = n_distinct(loc)) %>% 
            spread(region, V1),
          df %>% 
            group_by(region = paste0('region', region),
                    interact = as.character(interact)) %>%
            summarise(V1 = n()) %>% 
            spread(region, V1, fill = 0))
# A tibble: 8 x 3
#  interact region1 region2
#     <chr>   <dbl>   <dbl>
#1                3       5
#2      A_B       2       1
#3      B_C       3       0
#4      G_H       0       1
#5      I_J       0       1
#6      J_K       0       1
#7      L_M       0       1
#8      M_O       0       1
akrun
  • 874,273
  • 37
  • 540
  • 662