1

I have a dataset with multiple tables in the same Excel sheet that I'd like to combine. The sheet's data is formatted like this:

group_no      group     variable       marker1     marker2     marker3
1             GR-01-1   1              -           -           A
1             GR-01-1   2              B           A           B
1             GR-01-1   3              A           B           -
1             GR-01-1   4              -           -           -
1             GR-01-1   5              B           -           B
group_no.     group     variable       marker 4    marker5     marker6    marker7
2             GR-01-1a  1              B           -           A          -
2             GR-01-1a  2              B           A           -          B
2             GR-01-1a  3              A           B           -          B
2             GR-01-1a  4              A           A           A          A
group_no      group     variable       marker8
3             GR-01-2   1              B
3             GR-01-2   2              A
3             GR-01-2   3              -
group_no      group     variable       marker9     marker10
4             GR-02-1   1              B           A
4             GR-02-1   2              A           A
4             GR-02-1   3              -           B
4             GR-02-1   4              B           B
4             GR-02-1   5              A           B
4             GR-02-1   6              -           A
group_no      group     variable       marker11    marker12    marker13
5             GR-02-2   d              B           B           A
5             GR-02-2   e              A           B           B
5             GR-02-2   f              B           -           -
5             GR-02-2   g              -           B           -

As can be seen above, the supplier of the data did not separate out the names of the markers into unique columns. All the tables are stacked on top of one another. What I want to do is transform the data into this:

marker_name    group_no  group          1    2    3    4    5    6    a    b    c    d    e    f    g
marker1        GR-01-1   1              -    B    A    -    B
marker2        GR-01-1   1              -    A    B    -    -
marker3        GR-01-1   1              A    B    -    -    B
marker4        GR-01-1a  2              B    B    A    A
marker5        GR-01-1a  2              -    A    B    A
marker6        GR-01-1a  2              A    -    -    A
marker7        GR-01-1a  2              -    B    B    A
marker8        GR-01-2   3              B    A    -
marker9        GR-02-1   4              B    A    -    B    A    -
marker10       GR-02-1   4              A    A    B    B    B    A
marker11       GR-02-2   5                                                           B    A    B    -
marker12       GR-02-2   5                                                           B    B    -    B
marker13       GR-02-2   5                                                           A    B    -    -

Is there a way to do this in R? A loop maybe? I started off with transposing in R, but that didn't work and I wonder if there may have been a way to read the data in by group_no. But no luck. I am still a novice, I guess. There are 917 tables in my data set in the single sheet. I'm hoping I don't have to assign each to a new R object one-by-one and then rbind or cbind.

M--
  • 25,431
  • 8
  • 61
  • 93
  • 1
    You should provide a [reproducible example](https://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example). Can you provide the data when you read it into r (using `dput` function)? – M-- Feb 20 '20 at 02:50
  • 1
    Thanks for the link M--. I'll try my best to give reproducible examples in the future. But I appreciate that answer you gave below. – Daniel Garza Feb 20 '20 at 22:06

1 Answers1

1
#Reading data into R
library(readr) 
df1 <- read_csv("C:/Users/userm/Desktop/df1.csv", 
                col_names = FALSE)
#Loading libraries:
library(plyr)
library(tidyr)
spl <- diff(c(which(df1$X1 == 'group_no'), nrow(df1))) #defining the split for each table

#Splitting to a list of dataframes, where each dataframe is one of the tables of interest:
dl2 <- split(df1, 
            c(rep(1, spl[1]),
              rep(2, spl[2]),
              rep(3, spl[3]),
              rep(4, spl[4]),
              rep(5, spl[5])))
#Assigning the first row as the dataframe header:
dl3 <- lapply(dl2, function(d){colnames(d) <- d[1,]; d <- d[-1,]})

#Removing columns with All NAs:
dl4 <- lapply(dl3, function(d){d <- d[,colSums(is.na(d))<nrow(d)]})

#Binding dataframes with different number of columns (plyr):
df5 <- rbind.fill(dl4)
#Converting from Wide to Long (tidyr):
df6 <- pivot_longer(df5, -c("group_no", "group", "variable"), 
                          names_to = "marker_name", values_to = "value")

#Convert from Long to Wide with a different arrangement (tidyr):
df7 <- pivot_wider(df6, names_from = "variable", values_from = "value")
#> # A tibble: 65 x 16
#>    group_no group marker_name a     b     c     `4`   `5`   g     `1`  
#>    <chr>    <chr> <chr>       <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#>  1 1        GR-0~ marker1     -     B     A     -     B     <NA>  <NA> 
#>  2 1        GR-0~ marker2     -     A     B     -     -     <NA>  <NA> 
#>  3 1        GR-0~ marker3     A     B     -     -     B     <NA>  <NA> 
#>  4 1        GR-0~ marker4     <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
#>  5 1        GR-0~ marker5     <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
#>  6 1        GR-0~ marker6     <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
#>  7 1        GR-0~ marker7     <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
#>  8 1        GR-0~ marker8     <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
#>  9 1        GR-0~ marker9     <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
#> 10 1        GR-0~ marker10    <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
#> # ... with 55 more rows, and 6 more variables: `2` <chr>, `3` <chr>,
#> #   `6` <chr>, d <chr>, e <chr>, f <chr>

Data:

df1 <- structure(list(X1 = c("group_no", "1", "1", "1", "1", "1", "group_no", 
"2", "2", "2", "2", "group_no", "3", "3", "3", "group_no", "4", 
"4", "4", "4", "4", "4", "group_no", "5", "5", "5", "5"), X2 = c("group", 
"GR-01-1", "GR-01-1", "GR-01-1", "GR-01-1", "GR-01-1", "group", 
"GR-01-1a", "GR-01-1a", "GR-01-1a", "GR-01-1a", "group", "GR-01-2", 
"GR-01-2", "GR-01-2", "group", "GR-02-1", "GR-02-1", "GR-02-1", 
"GR-02-1", "GR-02-1", "GR-02-1", "group", "GR-02-2", "GR-02-2", 
"GR-02-2", "GR-02-2"), X3 = c("variable", "a", "b", "c", "4", 
"5", "variable", "1", "2", "3", "4", "variable", "1", "2", "3", 
"variable", "1", "2", "3", "4", "5", "6", "variable", "d", "e", 
"f", "g"), X4 = c("marker1", "-", "B", "A", "-", "B", "marker4", 
"B", "B", "A", "A", "marker8", "B", "A", "-", "marker9", "B", 
"A", "-", "B", "A", "-", "marker11", "B", "A", "B", "-"), X5 = c("marker2", 
"-", "A", "B", "-", "-", "marker5", "-", "A", "B", "A", NA, NA, 
NA, NA, "marker10", "A", "A", "B", "B", "B", "A", "marker12", 
"B", "B", "-", "B"), X6 = c("marker3", "A", "B", "-", "-", "B", 
"marker6", "A", "-", "-", "A", NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, "marker13", "A", "B", "-", "-"), X7 = c(NA, NA, NA, 
NA, NA, NA, "marker7", "-", "B", "B", "A", NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), class = c("spec_tbl_df", 
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -27L), spec = structure(list(
    cols = list(X1 = structure(list(), class = c("collector_character", 
    "collector")), X2 = structure(list(), class = c("collector_character", 
    "collector")), X3 = structure(list(), class = c("collector_character", 
    "collector")), X4 = structure(list(), class = c("collector_character", 
    "collector")), X5 = structure(list(), class = c("collector_character", 
    "collector")), X6 = structure(list(), class = c("collector_character", 
    "collector")), X7 = structure(list(), class = c("collector_character", 
    "collector"))), default = structure(list(), class = c("collector_guess", 
    "collector")), skip = 0), class = "col_spec"))
   X1       X2       X3       X4      X5      X6      X7     
 1 group_no group    variable marker1 marker2 marker3 NA     
 2 1        GR-01-1  a        -       -       A       NA     
 3 1        GR-01-1  b        B       A       B       NA     
 4 1        GR-01-1  c        A       B       -       NA     
 5 1        GR-01-1  4        -       -       -       NA     
 6 1        GR-01-1  5        B       -       B       NA     
 7 group_no group    variable marker4 marker5 marker6 marker7
 8 2        GR-01-1a 1        B       -       A       -      
 9 2        GR-01-1a 2        B       A       -       B      
10 2        GR-01-1a 3        A       B       -       B    
# ... with 17 more rows
M--
  • 25,431
  • 8
  • 61
  • 93