How do I calculate semi-colon separated values within a cell?

Question

I have a table that looks like this:

> head(dt)
               variant_id           transcript_id        HH     HNL  NLNL
1: chr10_60842447_A_G_b38 chr10_60871326_60871443 32968;685 1440;20 337;1
2: chr10_60846892_G_A_b38 chr10_60871326_60871443 33157;690 1251;15 337;1
3: chr10_60847284_C_T_b38 chr10_60871326_60871443 33157;690 1251;15 337;1
4: chr10_60849980_T_C_b38 chr10_60871326_60871443 33157;690 1251;15 337;1
5: chr10_60850566_A_T_b38 chr10_60871326_60871443 33157;690 1251;15 337;1
6: chr10_60852394_C_A_b38 chr10_60871326_60871443 33157;690 1251;15 337;1

What I would like to do is take values in column HH and divide the numbers before the semi-colon by the numbers after the semi-colon. For example, for the first row, I would like to do 32968/685 (which would be 48.13). Then, I would like to do the same for the values in column NLNL (so for the first row that would be 337), and then I would like to subtract the value found from column HH from the value in column NLNL, so 337-48.13 = 228.87. I would then like to take that value in place it into a new column called diff for all rows.

How would I go about doing this? I can pretty easily figure out how to divide the values of one column with another and put the result in a new column, but I don't know how to extract semi-colon separated values from within a cell and manipulate them.

Do none of [these](https://stackoverflow.com/q/4350440/5325862) work for splitting your columns? — camille, Jan 06 '20 at 18:30

akrun · Answer 1 · 2020-01-06T21:41:22.123

One option is to split by ; with strsplit, convert to numeric and divide the first element by the second

dt[, new := unlist(lapply(strsplit(HH, ";"), 
        function(x) as.numeric(x[1])/as.numeric(x[2])))]

Or another option is reading the column as a data.table with fread and then do the division

dt[, new := fread(text = .SD[["HH"]], sep=";")[, V1/V2]]

For multiple columns, specify the columns in .SDcols, loop over the columns and do the same thing

dt[, paste0("new", 1:3) := lapply(.SD, function(x) 
         fread(text = x, sep=";")[, V1/V2]), .SDcols = HH:NLNL]
dt
#               variant_id           transcript_id        HH     HNL  NLNL     new1 new2 new3
#1: chr10_60842447_A_G_b38 chr10_60871326_60871443 32968;685 1440;20 337;1 48.12847 72.0  337
#2: chr10_60846892_G_A_b38 chr10_60871326_60871443 33157;690 1251;15 337;1 48.05362 83.4  337
#3: chr10_60847284_C_T_b38 chr10_60871326_60871443 33157;690 1251;15 337;1 48.05362 83.4  337
#4: chr10_60849980_T_C_b38 chr10_60871326_60871443 33157;690 1251;15 337;1 48.05362 83.4  337
#5: chr10_60850566_A_T_b38 chr10_60871326_60871443 33157;690 1251;15 337;1 48.05362 83.4  337
#6: chr10_60852394_C_A_b38 chr10_60871326_60871443 33157;690 1251;15 337;1 48.05362 83.4  337

Or an option with tidyverse

library(dplyr)
library(tidyr)
dt %>%
    mutate(rn = row_number()) %>% 
    separate_rows(HH, HNL, NLNL, convert = TRUE) %>% 
    group_by(rn, variant_id, transcript_id) %>% 
    summarise_at(vars(HH:NLNL), ~ first(.)/last(.)) %>%
    ungroup %>% 
    select(-rn)
# A tibble: 6 x 5
#  variant_id             transcript_id              HH   HNL  NLNL
#  <chr>                  <chr>                   <dbl> <dbl> <dbl>
#1 chr10_60842447_A_G_b38 chr10_60871326_60871443  48.1  72     337
#2 chr10_60846892_G_A_b38 chr10_60871326_60871443  48.1  83.4   337
#3 chr10_60847284_C_T_b38 chr10_60871326_60871443  48.1  83.4   337
#4 chr10_60849980_T_C_b38 chr10_60871326_60871443  48.1  83.4   337
#5 chr10_60850566_A_T_b38 chr10_60871326_60871443  48.1  83.4   337
#6 chr10_60852394_C_A_b38 chr10_60871326_60871443  48.1  83.4   337

It can be also done in a compact way with

library(purrr)
library(dplyr)# v 0.8.99.9000
dt %>%
     mutate(across(HH:NLNL, ~ fread(text = ., sep=";") %>% 
                      reduce(`/`)))
#              variant_id           transcript_id       HH  HNL NLNL
#1 chr10_60842447_A_G_b38 chr10_60871326_60871443 48.12847 72.0  337
#2 chr10_60846892_G_A_b38 chr10_60871326_60871443 48.05362 83.4  337
#3 chr10_60847284_C_T_b38 chr10_60871326_60871443 48.05362 83.4  337
#4 chr10_60849980_T_C_b38 chr10_60871326_60871443 48.05362 83.4  337
#5 chr10_60850566_A_T_b38 chr10_60871326_60871443 48.05362 83.4  337
#6 chr10_60852394_C_A_b38 chr10_60871326_60871443 48.05362 83.4  337

data

dt <- structure(list(variant_id = c("chr10_60842447_A_G_b38", 
      "chr10_60846892_G_A_b38", 
"chr10_60847284_C_T_b38", "chr10_60849980_T_C_b38", "chr10_60850566_A_T_b38", 
"chr10_60852394_C_A_b38"), transcript_id = c("chr10_60871326_60871443", 
"chr10_60871326_60871443", "chr10_60871326_60871443", "chr10_60871326_60871443", 
"chr10_60871326_60871443", "chr10_60871326_60871443"), HH = c("32968;685", 
"33157;690", "33157;690", "33157;690", "33157;690", "33157;690"
), HNL = c("1440;20", "1251;15", "1251;15", "1251;15", "1251;15", 
"1251;15"), NLNL = c("337;1", "337;1", "337;1", "337;1", "337;1", 
"337;1")), class = c("data.table", "data.frame"), row.names = c(NA, 
-6L))

score 2 · Answer 2 · answered Jan 06 '20 at 18:29

One option involving dplyr and gsubfn could be:

df %>%
 mutate_at(vars(HH, HNL, NLNL), ~ gsubfn("([0-9]+);([0-9]+)", function(x, y) as.numeric(x)/as.numeric(y), .))

             variant_id           transcript_id               HH  HNL NLNL
1 chr10_60842447_A_G_b38 chr10_60871326_60871443 48.1284671532847   72  337
2 chr10_60846892_G_A_b38 chr10_60871326_60871443 48.0536231884058 83.4  337
3 chr10_60847284_C_T_b38 chr10_60871326_60871443 48.0536231884058 83.4  337
4 chr10_60849980_T_C_b38 chr10_60871326_60871443 48.0536231884058 83.4  337
5 chr10_60850566_A_T_b38 chr10_60871326_60871443 48.0536231884058 83.4  337
6 chr10_60852394_C_A_b38 chr10_60871326_60871443 48.0536231884058 83.4  33

Note that `function(x, y) as.numeric(x)/as.numeric(y)` could be written as `~as.numeric(x)/as.numeric(y)` — G. Grothendieck, Jan 08 '20 at 23:55

AndS. · Accepted Answer · 2020-01-06T20:08:10.377

Here is another option. We split the data in a nested cell, then map out the values.

library(tidyverse)

dt %>%
  mutate_at(vars(HH:NLNL), list(~str_split(., ";") )) %>%
  mutate_at(vars(HH:NLNL), list(~map_dbl(., ~as.numeric(.x) %>% {.[[1]]/.[[2]]}))) 

#>   variant_id             transcript_id              HH   HNL  NLNL
#>   <chr>                  <chr>                   <dbl> <dbl> <dbl>
#> 1 chr10_60842447_A_G_b38 chr10_60871326_60871443  48.1  72     337
#> 2 chr10_60846892_G_A_b38 chr10_60871326_60871443  48.1  83.4   337
#> 3 chr10_60847284_C_T_b38 chr10_60871326_60871443  48.1  83.4   337
#> 4 chr10_60849980_T_C_b38 chr10_60871326_60871443  48.1  83.4   337
#> 5 chr10_60850566_A_T_b38 chr10_60871326_60871443  48.1  83.4   337
#> 6 chr10_60852394_C_A_b38 chr10_60871326_60871443  48.1  83.4   337

EDIT: All you have to do to have new variables is to name them in the list:

dt %>%
  mutate_at(vars(HH:NLNL), list(new = ~str_split(., ";") )) %>%
  mutate_at(vars(HH_new:NLNL_new), list(~map_dbl(., ~as.numeric(.x) %>% {.[[1]]/.[[2]]})))

#>   variant_id       transcript_id      HH     HNL   NLNL  HH_new HNL_new NLNL_new
#>   <chr>            <chr>              <chr>  <chr> <chr>  <dbl>   <dbl>    <dbl>
#> 1 chr10_60842447_… chr10_60871326_60… 32968… 1440… 337;1   48.1    72        337
#> 2 chr10_60846892_… chr10_60871326_60… 33157… 1251… 337;1   48.1    83.4      337
#> 3 chr10_60847284_… chr10_60871326_60… 33157… 1251… 337;1   48.1    83.4      337
#> 4 chr10_60849980_… chr10_60871326_60… 33157… 1251… 337;1   48.1    83.4      337
#> 5 chr10_60850566_… chr10_60871326_60… 33157… 1251… 337;1   48.1    83.4      337
#> 6 chr10_60852394_… chr10_60871326_60… 33157… 1251… 337;1   48.1    83.4      337

This is great. Can you break it down a little bit for me? For example, what does the tilde do at `~str_split` and `~map_dbl` and `~as.numeric`? What about the dot before `.x` and `.[[1]]` etc? I'm guessing the dot as an argument references the first argument in `mutate_at`? — CelineDion, Jan 06 '20 at 20:23
Correct. I'll probably use the incorrect vocabulary or explain it half correctly, but the `.` is a reference to the variable in `vars` or the result of the previous pipe (i.e., `%>%`). The `.x` is a reference in `map` to the `.` which is a reference to the variable in `var`. That sounds kind of confusing, but I recommend reading at little on the `dplyr` `*_at()` and `*_all()` functions, as well as the `purrr` `map_*()` functions. — AndS., Jan 06 '20 at 20:28

ThomasIsCoding · Answer 4 · 2020-01-06T20:25:11.183

Here is a base R solution using strsplit to deal with semicolon separated values in a cell:

df <- cbind(df,`colnames<-`(sapply(c("HH","HNL","NLNL"),
                             function(v) sapply(strsplit(df[,v],split = ";"),
                                                function(x) Reduce("/",as.numeric(x)))),c("HHnew","HNLnew","NLNLnew")))
df$diff <- with(df,NLNLnew - HHnew)

such that

> df
              variant_id           transcript_id        HH     HNL  NLNL    HHnew HNLnew NLNLnew     diff
1 chr10_60842447_A_G_b38 chr10_60871326_60871443 32968;685 1440;20 337;1 48.12847   72.0     337 288.8715
2 chr10_60846892_G_A_b38 chr10_60871326_60871443 33157;690 1251;15 337;1 48.05362   83.4     337 288.9464
3 chr10_60847284_C_T_b38 chr10_60871326_60871443 33157;690 1251;15 337;1 48.05362   83.4     337 288.9464
4 chr10_60849980_T_C_b38 chr10_60871326_60871443 33157;690 1251;15 337;1 48.05362   83.4     337 288.9464
5 chr10_60850566_A_T_b38 chr10_60871326_60871443 33157;690 1251;15 337;1 48.05362   83.4     337 288.9464
6 chr10_60852394_C_A_b38 chr10_60871326_60871443 33157;690 1251;15 337;1 48.05362   83.4     337 288.9464

DATA

df <- structure(list(variant_id = c("chr10_60842447_A_G_b38", "chr10_60846892_G_A_b38", 
"chr10_60847284_C_T_b38", "chr10_60849980_T_C_b38", "chr10_60850566_A_T_b38", 
"chr10_60852394_C_A_b38"), transcript_id = c("chr10_60871326_60871443", 
"chr10_60871326_60871443", "chr10_60871326_60871443", "chr10_60871326_60871443", 
"chr10_60871326_60871443", "chr10_60871326_60871443"), HH = c("32968;685", 
"33157;690", "33157;690", "33157;690", "33157;690", "33157;690"
), HNL = c("1440;20", "1251;15", "1251;15", "1251;15", "1251;15", 
"1251;15"), NLNL = c("337;1", "337;1", "337;1", "337;1", "337;1", 
"337;1")), class = "data.frame", row.names = c(NA, -6L))

How do I calculate semi-colon separated values within a cell?

4 Answers4

data