0

I am trying to unnest a dataset that I have produced with pivot_wider, in which there are multiple columns that need to be unnested. On the full dataset, the unnest function is not working (I get the error `Error: Incompatible lengths: 3, 2.`), so I tried a workaround. Part of the dataset:

 # Reproducible subset of the data (dput() output): an 88-row tibble with the
 # character columns RNAcentral_id, Database, RNA_type and gene_name.
 # NOTE: the trailing `spec` attribute still lists external_id and
 # NCBI_taxon_id, which are not among the columns of this subset.
 my_data <-  structure(list(RNAcentral_id = c("URS000000C731", "URS000000C731", 
"URS000000C731", "URS000000C731", "URS000001F3AA", "URS000001F3AA", 
"URS000001F3AA", "URS000001F3AA", "URS000001F3AA", "URS000001F3AA", 
"URS000001F3AA", "URS000001F3AA", "URS000001F3AA", "URS000001F3AA", 
"URS0000023ED8", "URS0000023ED8", "URS0000023ED8", "URS0000023ED8", 
"URS0000023ED8", "URS0000023ED8", "URS0000023ED8", "URS0000023ED8", 
"URS0000023ED8", "URS0000023ED8", "URS0000023ED8", "URS0000023ED8", 
"URS0000050C72", "URS0000050C72", "URS0000050C72", "URS0000050C72", 
"URS0000050C72", "URS0000050C72", "URS0000050C72", "URS0000050C72", 
"URS0000050C72", "URS0000050C72", "URS0000050C72", "URS0000050C72", 
"URS0000050C72", "URS0000050C72", "URS0000050C72", "URS0000050C72", 
"URS00000527A6", "URS00000527A6", "URS00000527A6", "URS00000527A6", 
"URS00000527A6", "URS00000527A6", "URS00000527A6", "URS00000527A6", 
"URS00000527A6", "URS000007CAC8", "URS000007CAC8", "URS000007CAC8", 
"URS000007CAC8", "URS000007CAC8", "URS000007DA54", "URS000007DA54", 
"URS000007DA54", "URS000007DA54", "URS000007DA54", "URS000007DA54", 
"URS000007DA54", "URS000007DA54", "URS000007F1D7", "URS000007F1D7", 
"URS000007F1D7", "URS000007F1D7", "URS000007F1D7", "URS000007F1D7", 
"URS000007F1D7", "URS000007F1D7", "URS000007F1D7", "URS000007F1D7", 
"URS0000088F47", "URS0000088F47", "URS0000088F47", "URS0000088F47", 
"URS0000088F47", "URS0000088F47", "URS0000088F47", "URS00000B589B", 
"URS00000B589B", "URS00000B589B", "URS00000B589B", "URS00000B589B", 
"URS00000B589B", "URS00000B589B"), Database = c("ENSEMBL", "ENSEMBL", 
"ENSEMBL", "GENCODE", "ENSEMBL", "ENSEMBL", "ENSEMBL", "ENSEMBL", 
"ENSEMBL", "ENSEMBL", "ENSEMBL", "ENSEMBL", "GENCODE", "LNCIPEDIA", 
"ENSEMBL", "ENSEMBL", "ENSEMBL", "ENSEMBL", "GENECARDS", "LNCBOOK", 
"LNCIPEDIA", "NONCODE", "NONCODE", "NONCODE", "NONCODE", "NONCODE", 
"ENSEMBL", "ENSEMBL", "ENSEMBL", "ENSEMBL", "ENSEMBL", "ENSEMBL", 
"ENSEMBL", "GENCODE", "LNCBOOK", "NONCODE", "NONCODE", "NONCODE", 
"NONCODE", "NONCODE", "NONCODE", "NONCODE", "ENSEMBL", "ENSEMBL", 
"ENSEMBL", "GENCODE", "GENECARDS", "GENECARDS", "LNCBOOK", "LNCIPEDIA", 
"NONCODE", "ENSEMBL", "ENSEMBL", "ENSEMBL", "GENCODE", "NONCODE", 
"ENSEMBL", "ENSEMBL", "ENSEMBL", "ENSEMBL", "ENSEMBL", "GENCODE", 
"LNCBOOK", "NONCODE", "ENSEMBL", "ENSEMBL", "ENSEMBL", "ENSEMBL", 
"GENCODE", "LNCBOOK", "NONCODE", "NONCODE", "NONCODE", "NONCODE", 
"ENSEMBL", "ENSEMBL", "ENSEMBL", "ENSEMBL", "GENCODE", "GENECARDS", 
"LNCIPEDIA", "ENA", "ENSEMBL", "ENSEMBL", "ENSEMBL", "ENSEMBL", 
"ENSEMBL", "GENCODE"), RNA_type = c("lncRNA", "lncRNA", "lncRNA", 
"lncRNA", "lncRNA", "lncRNA", "lncRNA", "lncRNA", "lncRNA", "lncRNA", 
"lncRNA", "lncRNA", "lncRNA", "lncRNA", "lncRNA", "lncRNA", "lncRNA", 
"lncRNA", "lncRNA", "lncRNA", "lncRNA", "lncRNA", "lncRNA", "lncRNA", 
"lncRNA", "lncRNA", "lncRNA", "lncRNA", "lncRNA", "lncRNA", "lncRNA", 
"lncRNA", "lncRNA", "lncRNA", "lncRNA", "lncRNA", "lncRNA", "lncRNA", 
"lncRNA", "lncRNA", "lncRNA", "lncRNA", "lncRNA", "lncRNA", "lncRNA", 
"lncRNA", "lncRNA", "lncRNA", "lncRNA", "lncRNA", "lncRNA", "lncRNA", 
"lncRNA", "lncRNA", "lncRNA", "lncRNA", "lncRNA", "lncRNA", "lncRNA", 
"lncRNA", "lncRNA", "lncRNA", "lncRNA", "lncRNA", "lncRNA", "lncRNA", 
"lncRNA", "lncRNA", "lncRNA", "lncRNA", "lncRNA", "lncRNA", "lncRNA", 
"lncRNA", "lncRNA", "lncRNA", "lncRNA", "lncRNA", "lncRNA", "lncRNA", 
"lncRNA", "snoRNA", "snoRNA", "snoRNA", "snoRNA", "snoRNA", "snoRNA", 
"snoRNA"), gene_name = c("ENSG00000250666.1", "ENSG00000281830.1", 
"ENSG00000281377.1", "LINC01596", "ENSG00000242086.8", "ENSG00000280512.2", 
"ENSG00000281603.2", "ENSG00000281060.2", "ENSG00000281794.2", 
"ENSG00000281915.2", "ENSG00000280993.2", "ENSG00000282953.1", 
"MUC20-OT1", "lnc-MUC20-67", "ENSG00000235273.1", "ENSG00000233950.1", 
"ENSG00000230089.1", "ENSG00000225188.1", "LOC101929006", "HSALNG0049045", 
"lnc-OR14J1-2", "NONHSAG043350.2", "NONHSAG045640.2", "NONHSAG045830.2", 
"NONHSAG046018.2", "NONHSAG046538.2", "ENSG00000231860.1", "ENSG00000224328.1", 
"ENSG00000236766.1", "ENSG00000224508.1", "ENSG00000236522.1", 
"ENSG00000229681.1", "ENSG00000233883.1", "MDC1-AS1", "HSALNG0049184", 
"NONHSAG043427.2", "NONHSAG045580.2", "NONHSAG045701.2", "NONHSAG045891.2", 
"NONHSAG046074.2", "NONHSAG046228.2", "NONHSAG046589.2", "ENSG00000249981.1", 
"ENSG00000276297.1", "ENSG00000280619.1", "AC145141.1", "LOC107987420", 
"LOC107987434", "HSALNG0042531", "lnc-BDP1-1", "NONHSAG040656.2", 
"ENSG00000242086.8", "ENSG00000280512.2", "ENSG00000281794.2", 
"MUC20-OT1", "NONHSAG037073.2", "ENSG00000242086.8", "ENSG00000280512.2", 
"ENSG00000281794.2", "ENSG00000281060.2", "ENSG00000282953.1", 
"MUC20-OT1", "HSALNG0031832", "NONHSAG037073.2", "ENSG00000224835.1", 
"ENSG00000227198.1", "ENSG00000233169.1", "ENSG00000225390.1", 
"C6orf47-AS1", "HSALNG0049305", "NONHSAG043504.2", "NONHSAG046125.2", 
"NONHSAG046270.2", "NONHSAG046461.2", "ENSG00000272566.1", "ENSG00000280590.1", 
"ENSG00000280853.1", "ENSG00000281916.1", "AF250324.1", "ENSG00000272566", 
"lnc-FRG2-13", "ACA38 snoRNA", "ENSG00000200816.1", "ENSG00000266847.1", 
"ENSG00000263994.1", "ENSG00000264153.1", "ENSG00000263879.1", 
"SNORA38")), row.names = c(NA, -88L), class = c("tbl_df", "tbl", 
"data.frame"), spec = structure(list(cols = list(RNAcentral_id = structure(list(), class = c("collector_character", 
"collector")), Database = structure(list(), class = c("collector_character", 
"collector")), external_id = structure(list(), class = c("collector_character", 
"collector")), NCBI_taxon_id = structure(list(), class = c("collector_double", 
"collector")), RNA_type = structure(list(), class = c("collector_character", 
"collector")), gene_name = structure(list(), class = c("collector_character", 
"collector"))), default = structure(list(), class = c("collector_guess", 
"collector")), delim = "\t"), class = "col_spec"))

The call that produces the error:

# Widen to one column per Database value, then unnest without naming any
# columns. This is the call reported to fail with "Incompatible lengths" --
# presumably because the widened columns hold a different number of
# gene names per row and so cannot be expanded side by side.
my_data %>% 
          pivot_wider(names_from = Database, values_from = c(gene_name)) %>% 
  unnest()

My attempted workaround:

# Widen only (no unnesting yet): one column per Database value, filled
# with the matching gene_name entries.
mynested_data <- pivot_wider(
  my_data,
  names_from = Database,
  values_from = gene_name
)

# For each database name: spread that column of mynested_data into
# <name>_1, <name>_2, ... columns, then try to paste them back together
# into a single ";"-separated column. The result is a named list with one
# tibble per database name.
c("ENSEMBL", "GENCODE", "NONCODE", "ENA", "GENECARDS", "LNCBOOK", 
  "LNCIPEDIA") %>% 
   set_names(.) %>% 
   map(~ mynested_data %>%  
         unnest_wider(.x, names_sep = "_") %>%
         # NOTE(review): vars() hands unite() a list of quosures instead of a
         # plain selection; this looks like the subscript of type `quosures`
         # that the error message below complains about.
         unite(col = !!.x, vars(starts_with(!!quo(.x))), sep = ";"))

Error: Must subset columns with a valid subscript vector.
x Subscript has the wrong type `quosures`.
ℹ It must be numeric or character.
Run `rlang::last_error()` to see where the error occurred.

In unite I also tried col = .x and col = !!quo(.x), but I get the same error.

Edit 1: What I expect to get as a result. I'm doing all this in order to get a tibble that has one RNAcentral_id per row (entry), with the list "columns" turned into strings in which the several entries are concatenated with the separator ";": one column for ENSEMBL, one column for GENCODE, etc.

K Y
  • 198
  • 2
  • 16

1 Answer

1

We can use pivot_wider directly here :

# Widen to one column per Database value and collapse every group of gene
# names that lands in the same cell into a single string. The question asks
# for ";" as the separator, so paste(collapse = ";") is used instead of
# toString(), which would join with ", ". The result has one row per
# RNAcentral_id / RNA_type pair and plain character columns, so no unnest()
# step is needed afterwards.
tidyr::pivot_wider(
  my_data,
  names_from = Database,
  values_from = gene_name,
  values_fn = function(x) paste(x, collapse = ";")
)

Or in data.table with dcast :

library(data.table)
# as.data.table() works on a copy, so my_data itself stays a tibble;
# setDT() would convert it to a data.table by reference as a side effect.
# Gene names sharing a cell are joined with ";" (the separator requested in
# the question) rather than the ", " that toString() produces.
dcast(as.data.table(my_data), RNA_type + RNAcentral_id ~ Database,
      value.var = "gene_name",
      fun.aggregate = function(x) paste(x, collapse = ";"))
Ronak Shah
  • 377,200
  • 20
  • 156
  • 213
  • Sorry, I didn't describe the expected result. I need to have one row per RNAcentral_id; that's why I wanted to use unnest_wider, so that afterwards I could apply unite to the columns it creates and recreate the starting columns. – K Y Sep 16 '20 at 11:10
  • 1
    @KGeles Ohh..I see. So maybe you need `tidyr::pivot_wider(my_data, names_from = Database, values_from = gene_name, values_fn = toString)` ? – Ronak Shah Sep 16 '20 at 12:07
  • My goodness, what kind of wizardry is this :P. That's it... Could you also include it in your answer as an edit? – K Y Sep 16 '20 at 12:13