0

Converting factors to numeric has been dealt with multiple times, but my issue is when I have multiple numbers within the factor. For instance, here is a small subset of my data.frame:

                    AF      AC   AN   EAS_AF         AMR_AF
1          0.000199681       1 5008    0.001            0.0
2           0.00319489      16 5008      0.0            0.0
3 0.024361, 0.00479233 122, 24 5008 0.0, 0.0 0.0043, 0.0014
4           0.00439297      22 5008      0.0         0.0014
5          0.000798722       4 5008      0.0            0.0

Normally I would use the as.numeric and levels functions in combination to convert these factors into numbers. However, row three has two numbers in each entry, and therefore I get an NA when attempting this method on these variables. Is there any way to get around this? I have too many such cases to pluck them out manually.

My overall objective is to test whether each entry in each of these columns is greater than 0 (so if there are two numbers, I would test both), which is why I am attempting to convert into numeric in the first place. If there is any other smarter way around this problem I'd be willing to try it.

As requested, below is the dput of a reduced version of my data frame (taking only the first 9 rows).

structure(list(CHROM = c(10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 
10L), POS = c(180109L, 209892L, 221335L, 239445L, 246927L, 246928L, 
246933L, 246955L, 246970L), ID = structure(c(6L, 4L, 1L, 3L, 
5L, 9L, 2L, 7L, 8L), .Label = c("rs143013573", "rs1431845", "rs145483680", 
"rs151111729", "rs547339499", "rs547699134", "rs556577288", "rs575589407", 
"rs72770983"), class = "factor"), REF = structure(c(3L, 2L, 2L, 
3L, 1L, 1L, 3L, 2L, 1L), .Label = c("A", "C", "G"), class = "factor"), 
    ALT = structure(c(1L, 2L, 3L, 1L, 2L, 2L, 1L, 4L, 2L), .Label = c("A", 
    "G", "G, T", "T"), class = "factor"), AF = structure(c(1L, 
    5L, 7L, 6L, 2L, 4L, 8L, 3L, 1L), .Label = c("0.000199681", 
    "0.000798722", "0.000998403", "0.00239617", "0.00319489", 
    "0.00439297", "0.024361, 0.00479233", "0.220248"), class = "factor"), 
    AC = structure(c(1L, 5L, 4L, 6L, 7L, 3L, 2L, 8L, 1L), .Label = c("1", 
    "1103", "12", "122, 24", "16", "22", "4", "5"), class = "factor"), 
    AN = c(5008L, 5008L, 5008L, 5008L, 5008L, 5008L, 5008L, 5008L, 
    5008L), EAS_AF = structure(c(3L, 1L, 2L, 1L, 1L, 3L, 4L, 
    1L, 1L), .Label = c("0.0", "0.0, 0.0", "0.001", "0.248"), class = "factor"), 
    AMR_AF = structure(c(1L, 1L, 3L, 2L, 1L, 2L, 4L, 1L, 2L), .Label = c("0.0", 
    "0.0014", "0.0043, 0.0014", "0.1599"), class = "factor"), 
    AFR_AF = structure(c(1L, 3L, 5L, 4L, 2L, 1L, 6L, 1L, 1L), .Label = c("0.0", 
    "0.003", "0.0121", "0.0159", "0.09, 0.0", "0.1611"), class = "factor"), 
    EUR_AF = structure(c(1L, 1L, 2L, 1L, 1L, 3L, 4L, 1L, 1L), .Label = c("0.0", 
    "0.0, 0.0089", "0.0089", "0.2495"), class = "factor"), SAS_AF = structure(c(1L, 
    1L, 2L, 1L, 1L, 3L, 5L, 4L, 1L), .Label = c("0.0", "0.0, 0.0143", 
    "0.001", "0.0051", "0.2843"), class = "factor"), consequence = structure(c(2L, 
    1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("stop_gained", 
    "synonymous_variant"), class = "factor"), gene = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "ZMYND11", class = "factor"), 
    accession = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
    ), .Label = "NM_006624.5", class = "factor"), gene_type = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "protein_coding", class = "factor")), .Names = c("CHROM", 
"POS", "ID", "REF", "ALT", "AF", "AC", "AN", "EAS_AF", "AMR_AF", 
"AFR_AF", "EUR_AF", "SAS_AF", "consequence", "gene", "accession", 
"gene_type"), class = "data.frame", row.names = c(NA, -9L)) 
Jaap
  • 81,064
  • 34
  • 182
  • 193
spiral01
  • 545
  • 2
  • 17

1 Answer

1

Here's how to do that with separate_rows from tidyr:

library(tidyr)
# Expand every multi-valued cell (e.g. "G, T") in the listed columns into one
# row per value; the columns not listed are repeated for each new row.
# convert = TRUE asks tidyr to re-type the split columns afterwards, so the
# allele-frequency and count columns come back as numbers rather than text.
separate_rows(
  df,
  ALT, AF, AC, EAS_AF, AMR_AF, AFR_AF, EUR_AF, SAS_AF,
  convert = TRUE
)

   CHROM    POS          ID REF   AN        consequence    gene   accession      gene_type ALT          AF   AC EAS_AF AMR_AF AFR_AF
1     10 180109 rs547699134   G 5008 synonymous_variant ZMYND11 NM_006624.5 protein_coding   A 0.000199681    1  0.001 0.0000 0.0000
2     10 209892 rs151111729   C 5008        stop_gained ZMYND11 NM_006624.5 protein_coding   G 0.003194890   16  0.000 0.0000 0.0121
3     10 221335 rs143013573   C 5008 synonymous_variant ZMYND11 NM_006624.5 protein_coding   G 0.024361000  122  0.000 0.0043 0.0900
4     10 221335 rs143013573   C 5008 synonymous_variant ZMYND11 NM_006624.5 protein_coding   T 0.004792330   24  0.000 0.0014 0.0000
5     10 239445 rs145483680   G 5008 synonymous_variant ZMYND11 NM_006624.5 protein_coding   A 0.004392970   22  0.000 0.0014 0.0159
6     10 246927 rs547339499   A 5008 synonymous_variant ZMYND11 NM_006624.5 protein_coding   G 0.000798722    4  0.000 0.0000 0.0030
7     10 246928  rs72770983   A 5008 synonymous_variant ZMYND11 NM_006624.5 protein_coding   G 0.002396170   12  0.001 0.0014 0.0000
8     10 246933   rs1431845   G 5008 synonymous_variant ZMYND11 NM_006624.5 protein_coding   A 0.220248000 1103  0.248 0.1599 0.1611
9     10 246955 rs556577288   C 5008 synonymous_variant ZMYND11 NM_006624.5 protein_coding   T 0.000998403    5  0.000 0.0000 0.0000
10    10 246970 rs575589407   A 5008 synonymous_variant ZMYND11 NM_006624.5 protein_coding   G 0.000199681    1  0.000 0.0014 0.0000
   EUR_AF SAS_AF
1  0.0000 0.0000
2  0.0000 0.0000
3  0.0000 0.0000
4  0.0089 0.0143
5  0.0000 0.0000
6  0.0000 0.0000
7  0.0089 0.0010
8  0.2495 0.2843
9  0.0000 0.0051
10 0.0000 0.0000
Pierre Lapointe
  • 16,017
  • 2
  • 43
  • 56