2

I have a data.frame like this:

names
---------------
car
truck or lorry
bike

I need this output, splitting each string on a separator (" or " in this example):

names          |norm
---------------|-------
car            |car
truck or lorry |truck
truck or lorry |lorry    <-- "truck or lorry" has to be repeated
bike           |bike

My attempt was to use strsplit, but I can't work out how to repeat the first column's value for each piece to get that result:

# Example data: a single character column in which some entries hold several
# alternatives joined by " or ".
df <- data.frame(
  names = c("car", "truck or lorry", "bike"),
  stringsAsFactors = FALSE
)

# strsplit() returns a list with one character vector per input string, so
# `norm` becomes a list-column here; no rows are duplicated.
df$norm <- strsplit(df$names, split = " or ")
Emeeus
  • 5,072
  • 2
  • 25
  • 37

2 Answers

3

We can use separate_rows

library(dplyr)
library(tidyr)

# Copy `names` into a new `norm` column, then let separate_rows() expand
# `norm` into one row per piece found between the "\\sor\\s" separators.
separate_rows(
  mutate(df, norm = names),
  norm,
  sep = "\\sor\\s"
)

Or in base R with stack/strsplit

# Name each split result after its source string, stack the named list into
# a data frame with columns `values` and `ind`, then put `ind` first.
stack(
  setNames(strsplit(df$names, split = " or "), df$names)
)[c("ind", "values")]
#            ind values
#1            car    car
#2 truck or lorry  truck
#3 truck or lorry  lorry
#4           bike   bike
akrun
  • 874,273
  • 37
  • 540
  • 662
1

using base R

# Input strings; some entries contain several alternatives joined by " or ".
# (Named so that base::names() and base::list() are not shadowed.)
vehicle_names <- c("car", "truck or lorry", "bike")

# One character vector of pieces per input string.
split_names <- strsplit(vehicle_names, "\\sor\\s")

# Build the output rows for the x-th input string.
#
# x:        index into `original` / `pieces`.
# original: character vector of unsplit strings.
# pieces:   list of character vectors, one per element of `original`.
#
# Returns a character matrix with columns "names" and "norm"; cbind()
# recycles the original string so that it appears once per piece.
stitch <- function(x, original = vehicle_names, pieces = split_names) {
  cbind(names = original[x], norm = unlist(pieces[x]))
}

stitched_data <- lapply(seq_along(split_names), stitch)

# rbind() of character matrices yields a matrix; convert it so that `df`
# really is a data.frame with character columns.
df <- as.data.frame(do.call(rbind, stitched_data), stringsAsFactors = FALSE)

I just wanted to test the speed, so I benchmarked this against the code from the dplyr example:

dplyr 16.09784 secs
lapply 5.410267 secs

speed test code

# Benchmark: base R lapply/rbind versus dplyr/tidyr on one million strings.
# Packages are loaded up front so that loading is never part of a timing.
library(dplyr)
library(tidyr)

# dummy data: one million draws from the three example strings
vehicle_levels <- levels(as.factor(c("car", "truck or lorry", "bike")))
test <- sample(1:3, 1000000, replace = TRUE)
vehicle_names <- vehicle_levels[test]
x <- data.frame(names = vehicle_names, stringsAsFactors = FALSE)

# lapply
# Build the output rows for the x-th input string: a character matrix with
# columns "names" and "norm", the original string repeated once per piece.
stitch <- function(x, original = vehicle_names, pieces = split_names) {
  cbind(names = original[x], norm = unlist(pieces[x]))
}

start_time <- Sys.time()
# The split is timed as well, because the dplyr pipeline below also has to
# split the strings inside its own timed region.
split_names <- strsplit(vehicle_names, "\\sor\\s")
stitched_data <- lapply(seq_along(split_names), stitch)
df <- do.call(rbind, stitched_data)
end_time <- Sys.time()

# dplyr
start_time1 <- Sys.time()
# Assign the result: when run interactively, a bare top-level pipeline is
# auto-printed, and that printing would be counted in the timing.
df_dplyr <- x %>%
  mutate(norm = names) %>%
  separate_rows(norm, sep = "\\sor\\s")
end_time1 <- Sys.time()

# time
# Force both differences to seconds so they are comparable, and format()
# them so the unit is kept in the message; print() also works under source().
lapply_secs <- difftime(end_time, start_time, units = "secs")
dplyr_secs <- difftime(end_time1, start_time1, units = "secs")
print(paste("lapply: ", format(lapply_secs)))
print(paste("dplyr: ", format(dplyr_secs)))