Here's an approach using the tidyverse and purrr:
library(tidyverse)
# Two sample strings with irregular whitespace (including embedded newlines).
useful <- c(" Busti
169 425 Total 2,786 5,259 Franklin
256 410", "Total 1,399 2,915 Arkwright 154 320 Smyrna 179 319 Deposit 110 169")

# Collapse every run of whitespace to a single space (and trim the ends), then
# split each string into its individual tokens.
# map_chr() is used instead of map() so that str_split() receives a character
# vector rather than a list it has to coerce. str_squish() is itself
# vectorised, so str_squish(useful) would give the same result.
map_chr(useful, str_squish) %>%
  str_split("\\s+")
# [[1]]
# [1] "Busti" "169" "425" "Total" "2,786" "5,259" "Franklin" "256" "410"
#
# [[2]]
# [1] "Total" "1,399" "2,915" "Arkwright" "154" "320" "Smyrna" "179" "319" "Deposit" "110"
# [12] "169"
Alternatively:
# Split only at whitespace that is immediately followed by a letter, so every
# piece starts with a name and carries the numbers that follow it.
# Note: the lookahead fires before *every* alphabetic word, so a multi-word
# name (e.g. "Albany City") would be split into separate pieces.
# map_chr() (not map()) hands str_split() a character vector, not a list.
map_chr(useful, str_squish) %>%
  str_split("\\s+(?=[[:alpha:]])")
# [[1]]
# [1] "Busti 169 425" "Total 2,786 5,259" "Franklin 256 410"
#
# [[2]]
# [1] "Total 1,399 2,915" "Arkwright 154 320" "Smyrna 179 319" "Deposit 110 169"
And then you may want to consider...
# Same split as above, then reshape the list into a long tibble:
# `name` is the index of the input string, `value` is one "<name> <numbers>"
# piece per row.
map_chr(useful, str_squish) %>%
  str_split("\\s+(?=[[:alpha:]])") %>%
  enframe() %>%
  # Name the list-column explicitly: calling unnest() without a column is
  # deprecated (tidyr >= 1.0.0 warns that `cols` is required).
  unnest(value)
# # A tibble: 7 x 2
# name value
# <int> <chr>
# 1 1 Busti 169 425
# 2 1 Total 2,786 5,259
# 3 1 Franklin 256 410
# 4 2 Total 1,399 2,915
# 5 2 Arkwright 154 320
# 6 2 Smyrna 179 319
# 7 2 Deposit 110 169
Or even...
# Long tibble of pieces, then one column per field with the numeric fields
# converted to numbers.
map_chr(useful, str_squish) %>%
  str_split("\\s+(?=[[:alpha:]])") %>%
  enframe() %>%
  # Name the list-column explicitly; bare unnest() is deprecated.
  unnest(value) %>%
  # Each piece is "<Group> <Item1> <Item2>", separated by single spaces
  # (guaranteed by the earlier str_squish()).
  separate(value, c("Group", "Item1", "Item2"), sep = "\\s") %>%
  # Strip *every* thousands separator before converting: str_replace() would
  # remove only the first comma, turning e.g. "1,234,567" into NA.
  mutate(across(starts_with("Item"), ~ as.numeric(str_remove_all(.x, ","))))
# # A tibble: 7 x 4
# name Group Item1 Item2
# <int> <chr> <dbl> <dbl>
# 1 1 Busti 169 425
# 2 1 Total 2786 5259
# 3 1 Franklin 256 410
# 4 2 Total 1399 2915
# 5 2 Arkwright 154 320
# 6 2 Smyrna 179 319
# 7 2 Deposit 110 169
And finally, if the number of "items" is unknown or varies from row to row, you'll want to do something like the following (and/or refer to this question):
# Variant for an unknown / varying number of fields per piece: split each piece
# into tokens, number the tokens within their piece, and widen so that token
# k lands in column `k` (missing trailing tokens become NA).
map_chr(useful, str_squish) %>%
  str_split("\\s+(?=[[:alpha:]])") %>%
  enframe() %>%
  # Name the list-column explicitly; bare unnest() is deprecated.
  unnest(value) %>%
  # `piece` uniquely identifies each row before it is exploded into tokens.
  # Grouping on `value` alone would merge pieces whose text happens to repeat,
  # so their tokens would be numbered 4, 5, 6, ... and end up in extra columns.
  mutate(piece = row_number(), to_sep = str_split(value, "\\s")) %>%
  unnest(to_sep) %>%
  group_by(piece) %>%
  mutate(row = row_number()) %>%
  # Drop the grouping so the result is a plain tibble for downstream steps.
  ungroup() %>%
  # pivot_wider() replaces the superseded spread() and keeps the rows in their
  # original order instead of sorting them.
  pivot_wider(names_from = row, values_from = to_sep) %>%
  select(-piece)
# # A tibble: 7 x 5
# name value `1` `2` `3`
# <int> <chr> <chr> <chr> <chr>
# 1 1 Busti 169 425 Busti 169 425
# 2 1 Total 2,786 5,259 Total 2,786 5,259
# 3 1 Franklin 256 410 Franklin 256 410
# 4 2 Total 1,399 2,915 Total 1,399 2,915
# 5 2 Arkwright 154 320 Arkwright 154 320
# 6 2 Smyrna 179 319 Smyrna 179 319
# 7 2 Deposit 110 169 Deposit 110 169
You may want to consider breaking this off into a more specific question, especially now that you are providing the PDF and asking more directly about what you are trying to achieve. That being said, I'm not sure the blanks are relevant here, as you could use the following pipeline.
library(pdftools)
library(tidyverse)
# Read the report's text from the PDF (presumably one string per page --
# confirm against the pdftools documentation).
text <- pdf_text("https://www.dec.ny.gov/docs/wildlife_pdf/09deerrpt.pdf")

# Squish whitespace, keep elements 14 to 17, and glue them into one long
# string that the cleanup steps below operate on.
clean_text <- str_squish(text)[14:17] %>%
  paste(collapse = " ") %>%
  # Drop the header text, up to and including the "TOTAL TAKE" label
  str_replace("New York State Department of Environmental.*TOTAL TAKE. ", "") %>%
  # Drop two-digit page markers such as "Page 14", "Page 15"
  str_replace_all("Page [[:digit:]]{2}", "") %>%
  # Drop every run of two or more capitals (the COUNTY labels, which would
  # not line up with the rows anyway)
  str_replace_all("[A-Z]{2,}", "") %>%
  # Drop everything from "Statewide Totals" onwards
  str_replace("Statewide Totals.*", "") %>%
  # Strip thousands separators from the numbers
  str_replace_all(",", "") %>%
  # Tidy up the whitespace left behind by the removals above
  str_squish()
# Turn the cleaned report text into a tibble with one row per location and its
# two take counts.
clean_text %>%
  # Remove the individual total lines ("Total <word> <word>")
  str_remove_all("Total\\s\\w+\\s\\w+") %>%
  str_squish() %>%
  # Pull out every "<letters and spaces> <number> <number>" run
  str_extract_all("[A-Za-z ]+\\s\\d+\\s\\d+") %>%
  unlist() %>%
  str_squish() %>%
  # tibble() replaces the deprecated data_frame()
  tibble(by_line = .) %>%
  # Namespaced on purpose: magrittr also exports an extract(), and which one a
  # bare `extract` resolves to depends on what else is attached.
  tidyr::extract(
    by_line,
    c("location", "adult_take", "total_take"),
    regex = "([A-Za-z ]+\\s?)(\\d+\\s?)(\\d+\\s?)"
  ) %>%
  # The capture groups may include a trailing space, so squish before
  # converting the counts to numbers.
  mutate(
    location = str_squish(location),
    adult_take = as.numeric(str_squish(adult_take)),
    total_take = as.numeric(str_squish(total_take))
  )
# # A tibble: 943 x 3
# location adult_take total_take
# <chr> <dbl> <dbl>
# 1 Carroll 103 215
# 2 Albany City 24 41
# 3 Allegany 115 231
# 4 Charlotte 116 248
# 5 Altona 50 87
# 6 Berne 163 292
# 7 Ashford 338 721
# 8 Chautauqua 242 613
# 9 Ausable 18 21
# 10 Bethlehem 141 280
# # ... with 933 more rows