You have an XML file (<<== that link is likely exactly the file you have, too). Note that the link points to an example file from the tmparallel
package, and many places in that package have code that works with it.
Work with XML as XML. Do not regex it.
xdf$places
in the following snippet has what you are looking for, but since this is likely a file being used in a class on text mining, you may eventually need all the other bits extracted into the data frame as well.
library(xml2)
library(tidyverse)
# Local path where the sample Reuters-21578 XML file is stored.
reuters_path <- "~/Data/reuters-21578.xml"

# download.file() does not create missing directories, so make sure the
# destination directory exists first (a no-op when it is already there).
dir.create(dirname(reuters_path), recursive = TRUE, showWarnings = FALSE)

download.file(
  "https://raw.githubusercontent.com/noahhl/tmparallel/master/pkg/inst/texts/reuters-21578.xml",
  reuters_path
)

# Parse the downloaded file into an xml2 document.
reut <- read_xml(reuters_path)
# Trimmed text of the first node under `node` matching `xpath`.
node_text <- function(node, xpath) {
  xml_find_first(node, xpath) %>% xml_text(trim = TRUE)
}

# Trimmed text of every node under `node` matching `xpath`, wrapped in a
# list so the whole character vector fits in a single list-column cell.
node_texts <- function(node, xpath) {
  list(xml_find_all(node, xpath) %>% xml_text(trim = TRUE))
}

# Build one row per <REUTERS> element and row-bind them into `text_df`.
text_df <- xml_find_all(reut, "//REUTERS") %>%
  map_df(~{
    # The element's attributes become the leading columns of the row.
    # as_tibble() replaces the deprecated as_data_frame().
    xdf <- xml_attrs(.x) %>%
      as.list() %>%
      as_tibble()

    xdf$date <- node_text(.x, ".//DATE")

    #### NOTE THAT THIS FOLLOWING LINE IS THE DATA YOU ASKED FOR IN THE EXAMPLE
    xdf$places <- node_texts(.x, ".//PLACES/D")

    # Remaining multi-valued fields, stored the same way as list-columns.
    xdf$people <- node_texts(.x, ".//PEOPLE/D")
    xdf$orgs <- node_texts(.x, ".//ORGS/D")
    xdf$exchanges <- node_texts(.x, ".//EXCHANGES/D")
    xdf$companies <- node_texts(.x, ".//COMPANIES/D")

    # Column name `uknown` (sic) is kept as-is so the result matches the
    # printed output shown below.
    xdf$uknown <- node_text(.x, ".//UNKNOWN")

    xdf$text_title <- node_text(.x, ".//TEXT/TITLE")
    xdf$text_dateline <- node_text(.x, ".//TEXT/DATELINE")
    xdf$text_body <- node_text(.x, ".//TEXT/BODY")

    xdf
  })
Output:
text_df
## # A tibble: 10 x 15
## TOPICS LEWISSPLIT CGISPLIT OLDID NEWID date places people orgs
## <chr> <chr> <chr> <chr> <chr> <chr> <list> <list> <lis>
## 1 YES TRAIN TRAINING… 5544 1 26-FEB-1… <chr [… <chr [… <chr…
## 2 NO TRAIN TRAINING… 5545 2 26-FEB-1… <chr [… <chr [… <chr…
## 3 NO TRAIN TRAINING… 5546 3 26-FEB-1… <chr [… <chr [… <chr…
## 4 NO TRAIN TRAINING… 5547 4 26-FEB-1… <chr [… <chr [… <chr…
## 5 YES TRAIN TRAINING… 5548 5 26-FEB-1… <chr [… <chr [… <chr…
## 6 YES TRAIN TRAINING… 5549 6 26-FEB-1… <chr [… <chr [… <chr…
## 7 NO TRAIN TRAINING… 5550 7 26-FEB-1… <chr [… <chr [… <chr…
## 8 YES TRAIN TRAINING… 5551 8 26-FEB-1… <chr [… <chr [… <chr…
## 9 YES TRAIN TRAINING… 5552 9 26-FEB-1… <chr [… <chr [… <chr…
## 10 YES TRAIN TRAINING… 5553 10 26-FEB-1… <chr [… <chr [… <chr…
## # ... with 6 more variables: exchanges <list>, companies <list>,
## # uknown <chr>, text_title <chr>, text_dateline <chr>, text_body <chr>
glimpse(text_df)
## Observations: 10
## Variables: 15
## $ TOPICS <chr> "YES", "NO", "NO", "NO", "YES", "YES", "NO", "YE...
## $ LEWISSPLIT <chr> "TRAIN", "TRAIN", "TRAIN", "TRAIN", "TRAIN", "TR...
## $ CGISPLIT <chr> "TRAINING-SET", "TRAINING-SET", "TRAINING-SET", ...
## $ OLDID <chr> "5544", "5545", "5546", "5547", "5548", "5549", ...
## $ NEWID <chr> "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"
## $ date <chr> "26-FEB-1987 15:01:01.79", "26-FEB-1987 15:02:20...
## $ places <list> [<"el-salvador", "usa", "uruguay">, "usa", "usa...
## $ people <list> [<>, <>, <>, <>, <>, <>, <>, <>, <>, <>]
## $ orgs <list> [<>, <>, <>, <>, <>, <>, <>, <>, <>, <>]
## $ exchanges <list> [<>, <>, <>, <>, <>, <>, <>, <>, <>, <>]
## $ companies <list> [<>, <>, <>, <>, <>, <>, <>, <>, <>, <>]
## $ uknown <chr> "C T\nf0704reute\nu f BC-BAHIA-COCOA-REVIEW 02...
## $ text_title <chr> "BAHIA COCOA REVIEW", "STANDARD OIL <SRD> TO FOR...
## $ text_dateline <chr> "SALVADOR, Feb 26 -", "CLEVELAND, Feb 26 -", "HO...
## $ text_body <chr> "Showers continued throughout the week in\nthe B...
str(head(text_df, 2))
## Classes 'tbl_df', 'tbl' and 'data.frame': 2 obs. of 15 variables:
## $ TOPICS : chr "YES" "NO"
## $ LEWISSPLIT : chr "TRAIN" "TRAIN"
## $ CGISPLIT : chr "TRAINING-SET" "TRAINING-SET"
## $ OLDID : chr "5544" "5545"
## $ NEWID : chr "1" "2"
## $ date : chr "26-FEB-1987 15:01:01.79" "26-FEB-1987 15:02:20.00"
## $ places :List of 2
## ..$ : chr "el-salvador" "usa" "uruguay"
## ..$ : chr "usa"
## $ people :List of 2
## ..$ : chr
## ..$ : chr
## $ orgs :List of 2
## ..$ : chr
## ..$ : chr
## $ exchanges :List of 2
## ..$ : chr
## ..$ : chr
## $ companies :List of 2
## ..$ : chr
## ..$ : chr
## $ uknown : chr "C T\nf0704reute\nu f BC-BAHIA-COCOA-REVIEW 02-26 0105" "F Y\nf0708reute\nd f BC-STANDARD-OIL-<SRD>-TO 02-26 0082"
## $ text_title : chr "BAHIA COCOA REVIEW" "STANDARD OIL <SRD> TO FORM FINANCIAL UNIT"
## $ text_dateline: chr "SALVADOR, Feb 26 -" "CLEVELAND, Feb 26 -"
## $ text_body : chr "Showers continued throughout the week in\nthe Bahia cocoa zone, alleviating the drought since early\nJanuary an"| __truncated__ "Standard Oil Co and BP North America\nInc said they plan to form a venture to manage the money market\nborrowin"| __truncated__