split column into three columns based on the under score "_" in the levels

Question

I have a data.frame that I want to plot using ggplot2

df <- read.table(text = c("
scenario    value 
measured_raw21  55
measured_raw22  60
simulated_raw21 54
measured_raw21_drain    55
measured_raw22_drain    60
measured_adj21  23
simulated_raw22 59
simulated_raw21_drain   54.5
simulated_raw22_drain   60.2
measured_adj21_drain    23
measured_adj22  27
simulated_adj21 22
measured_adj22_drain    27
simulated_adj21_drain   23.4
simulated_adj22 27.5
simulated_adj22_drain   27.2
measured_0.5    40
measured_0.8    55
measured_0.5_drain  40
measured_0.8_drain  55
simulated_0.5   41
simulated_0.8   56
simulated_0.5_drain 39.9
simulated_0.8_drain 55.3"), header =T)

The scenario column has information about whether it is simulated or measured, the scenarioID (raw21, raw22) and whether it has drain or nodrain

I want to split the scenario column into 3 columns as below

                scenario       cat  scenario1   drain value
1         measured_raw21  measured      raw21 nodrain  55.0
2         measured_raw22  measured      raw22 nodrain  60.0
3        simulated_raw21 simulated      raw21 nodrain  54.0
4   measured_raw21_drain  measured      raw21   drain  55.0
5   measured_raw22_drain  measured      raw22   drain  60.0
6         measured_adj21  measured      adj21 nodrain  23.0
7        simulated_raw22 simulated      raw22 nodrain  59.0
8  simulated_raw21_drain simulated      raw21   drain  54.5
9  simulated_raw22_drain simulated      raw22   drain  60.2
10  measured_adj21_drain  measured      adj21   drain  23.0
11        measured_adj22  measured      adj22 nodrain  27.0
12       simulated_adj21 simulated      adj21 nodrain  22.0
13  measured_adj22_drain   meaured      adj22   drain  27.0
14 simulated_adj21_drain simulated      adj21   drain  23.4
15       simulated_adj22 simulated      adj22 nodrain  27.5
16 simulated_adj22_drain simulated      adj22   drain  27.2
17           meaured_0.5  measured        0.5 nodrain  40.0
18           meaured_0.8  measured        0.8 nodrain  55.0
19     meaured_0.5_drain  measured        0.5   drain  40.0
20     meaured_0.8_drain  measured        0.8   drain  55.0
21         simulated_0.5 simulated        0.5 nodrain  41.0
22         simulated_0.8 simulated        0.8 nodrain  56.0
23   simulated_0.5_drain simulated        0.5   drain  39.9
24   simulated_0.8_drain simulated        0.8   drain  55.3

I did it as below

df$cat <- c("measured", "measured", "simulated","measured", "measured", "measured","simulated", 
            "simulated", "simulated", "measured", "measured", "simulated",  "measured", "simulated", "simulated", "simulated", 
            "measured", "measured", "measured", "measured", "simulated",  "simulated", "simulated", "simulated")     

df$scenario1 <- c("raw21","raw22","raw21","raw21","raw22","adj21","raw22",'raw21',"raw22","adj21","adj22",
"adj21","adj22",'adj21',"adj22", 'adj22', "0.5", "0.8", "0.5", "0.8", "0.5", "0.8", "0.5", "0.8")

df$drain <- c("nodrain", "nodrain", "nodrain", "drain", "drain", "nodrain", "nodrain" ,"drain", 
"drain", "drain", "nodrain", "nodrain", "drain", "drain", "nodrain", "drain", "nodrain", "nodrain", 
"drain", "drain", "nodrain", "nodrain", "drain", "drain")

Here is the final plot that I want

library(tidyr)
library(dplyr)
library(ggplot2)
df_fin <-  df %>% 
  select(scenario1, drain, cat, value) %>% 
  spread(cat, value)


ggplot(df_fin, aes(measured, simulated, col = scenario1))+
  geom_point()+
  facet_wrap(~drain)

I splitted the scenario column in df into 3 columns manually. However, there is only 24 observations in the reproducible example but in my original data.frame, I have many observations which will make it time taking to split the scenario column the way I did above.

I will appreciate any help for an efficient way to split the scenario column into 3 columns?

@akrun This is not a duplicate of that other question. Here he wants to split a variable into three variables when some of the strings only contain two parts. — Mark Miller, Jan 13 '17 at 05:19
Here is a base `R` solution: `df$scenario <- ifelse(grepl("_drain", df$scenario) == FALSE, paste0(df$scenario, '_nodrain'), df$scenario); x = strsplit(df$scenario, "_", fixed = TRUE); do.call(rbind, x)` — Mark Miller, Jan 13 '17 at 06:03

score 3 · Accepted Answer · answered Jan 13 '17 at 04:12

You can use separate:

library(tidyr)
df %>% 
    separate(scenario, into = c("cat", "scenario1", "drain"), sep = "_", remove = FALSE) %>% 
    replace_na(list(drain = "nodrain"))        
    # if there is no drain, the cell will be <NA>, repalce it with nodrain

#                scenario       cat scenario1   drain value
#1         measured_raw21  measured     raw21 nodrain  55.0
#2         measured_raw22  measured     raw22 nodrain  60.0
#3        simulated_raw21 simulated     raw21 nodrain  54.0
#4   measured_raw21_drain  measured     raw21   drain  55.0
#5   measured_raw22_drain  measured     raw22   drain  60.0
#6         measured_adj21  measured     adj21 nodrain  23.0
#7        simulated_raw22 simulated     raw22 nodrain  59.0
#8  simulated_raw21_drain simulated     raw21   drain  54.5
#9  simulated_raw22_drain simulated     raw22   drain  60.2
#10  measured_adj21_drain  measured     adj21   drain  23.0
#11        measured_adj22  measured     adj22 nodrain  27.0
#12       simulated_adj21 simulated     adj21 nodrain  22.0
#13  measured_adj22_drain  measured     adj22   drain  27.0
# ...

your answer worked well with my large data.frame. However, I got this warning `Warning message: Too few values at 345 locations: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ... ` Any idea why I'm getting it? — shiny, Jan 13 '17 at 05:09
It's a warning message which tells you not all strings can be split into three columns, if you want to suppress the warning message, you can specify `fill = "right"` in the separate function. — Psidom, Jan 13 '17 at 14:28

score 1 · Answer 2 · answered Jan 13 '17 at 15:03

Here is a possible solution in base R that I think will work provided _drain only appears at the end of the string. In that case I simply added _nodrain to strings that do not contain _drain then split on the _.:

df <- read.table(text = c("
scenario    value 
measured_raw21  55
measured_raw22  60
simulated_raw21 54
measured_raw21_drain    55
measured_raw22_drain    60
measured_adj21  23
simulated_raw22 59
simulated_raw21_drain   54.5
simulated_raw22_drain   60.2
measured_adj21_drain    23
measured_adj22  27
simulated_adj21 22
measured_adj22_drain    27
simulated_adj21_drain   23.4
simulated_adj22 27.5
simulated_adj22_drain   27.2
measured_0.5    40
measured_0.8    55
measured_0.5_drain  40
measured_0.8_drain  55
simulated_0.5   41
simulated_0.8   56
simulated_0.5_drain 39.9
simulated_0.8_drain 55.3"), header =T, stringsAsFactors = FALSE)

df$scenario <- ifelse(grepl("_drain", df$scenario) == FALSE,
               paste0(df$scenario, '_nodrain'), df$scenario)

x = strsplit(df$scenario, "_", fixed = TRUE)
do.call(rbind, x)

#      [,1]        [,2]    [,3]     
# [1,] "measured"  "raw21" "nodrain"
# [2,] "measured"  "raw22" "nodrain"
# [3,] "simulated" "raw21" "nodrain"
# [4,] "measured"  "raw21" "drain"  
# [5,] "measured"  "raw22" "drain"  
# [6,] "measured"  "adj21" "nodrain"
# [7,] "simulated" "raw22" "nodrain"
# [8,] "simulated" "raw21" "drain"  
# [9,] "simulated" "raw22" "drain"  
#[10,] "measured"  "adj21" "drain"  
#[11,] "measured"  "adj22" "nodrain"
#[12,] "simulated" "adj21" "nodrain"
#[13,] "measured"  "adj22" "drain"  
#[14,] "simulated" "adj21" "drain"  
#[15,] "simulated" "adj22" "nodrain"
#[16,] "simulated" "adj22" "drain"  
#[17,] "measured"  "0.5"   "nodrain"
#[18,] "measured"  "0.8"   "nodrain"
#[19,] "measured"  "0.5"   "drain"  
#[20,] "measured"  "0.8"   "drain"  
#[21,] "simulated" "0.5"   "nodrain"
#[22,] "simulated" "0.8"   "nodrain"
#[23,] "simulated" "0.5"   "drain"  
#[24,] "simulated" "0.8"   "drain"

split column into three columns based on the under score "_" in the levels

2 Answers2