I like tidyr::unnest() for this:
library(dplyr)
library(tidyr)
# Split every entry of `names` on single spaces into a list-column called
# `tokens`, then expand that list-column so each token gets a row of its own
# (the other columns are repeated alongside it).
df %>%
  mutate(tokens = strsplit(as.character(names), split = " ")) %>%
  # Say which column to expand: tidyr >= 1.0.0 warns when unnest() is called
  # without one. Passing it positionally is also accepted by older tidyr.
  unnest(tokens)
# names tokens
# 1 perform data cleansing perform
# 2 perform data cleansing data
# 3 perform data cleansing cleansing
# 4 information categorisation information
# 5 information categorisation categorisation
But you can also do it all in base R:
# Break each entry of `names` into its space-separated pieces; this gives a
# list holding one character vector per row of `df`.
tokens <- strsplit(as.character(df$names), split = " ")

# Assemble the long table: every original name is repeated once for each token
# it produced, so the two columns line up element for element.
result <- data.frame(
  names = rep(df$names, times = lengths(tokens)),
  tokens = unlist(tokens),
  stringsAsFactors = FALSE
)
# names tokens
# 1 perform data cleansing perform
# 2 perform data cleansing data
# 3 perform data cleansing cleansing
# 4 information categorisation information
# 5 information categorisation categorisation
A version that comes with extra features for text analysis is tidytext::unnest_tokens():
# Convert `names` to character in place before handing it to the tokenizer.
df[["names"]] <- as.character(df[["names"]])

# Tokenise `names` into a new `tokens` column, one row per token;
# drop = FALSE keeps the original `names` column in the result.
tidytext::unnest_tokens(
  df,
  output = tokens,
  input = names,
  drop = FALSE
)
# names tokens
# 1 perform data cleansing perform
# 1.1 perform data cleansing data
# 1.2 perform data cleansing cleansing
# 2 information categorisation information
# 2.1 information categorisation categorisation