Here is an attempt. It's quite likely I've not thought of some words or phrases, but it gets the right answer on the question asker's input:
#' Remove spelled-out number words from a phrase
#'
#' Splits `phrase` on single spaces, drops every token that is a number word
#' (ignoring case and punctuation) and joins the remaining tokens back
#' together with single spaces. Connector words ("minus", "and", "point") are
#' dropped only when the token right after them is itself a number word or
#' another connector word. When a dropped token ended with a period, that
#' period is moved onto the closest preceding token that was kept, unless
#' that token already ends with one.
#'
#' @param phrase A character string; only the first element is processed.
#' @return A single character string with the number words removed.
number_word_remover <- function(phrase) {
  # Words we know for sure are numbers.
  number_words <- c(
    "zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
    "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
    "sixteen", "seventeen", "eighteen", "nineteen", "twenty", "thirty",
    "forty", "fifty", "sixty", "seventy", "eighty", "ninety", "hundred",
    "thousand", "million", "billion", "trillion", "half", "quarter", "third",
    "fourth", "fifth", "sixth", "seventh", "eighth", "ninth", "tenth",
    "eleventh", "twelfth", "thirteenth", "fourteenth", "fifteenth",
    "sixteenth", "seventeenth", "eighteenth", "nineteenth", "twentieth",
    "thirtieth", "fortieth", "fiftieth", "sixtieth", "seventieth",
    "eightieth", "ninetieth", "hundredth", "thousandth", "millionth",
    "billionth", "trillionth", "ones", "twos", "threes", "fours", "fives",
    "sixes", "sevens", "eights", "nines", "tens", "elevens", "twelves",
    "thirteens", "fourteens", "fifteens", "sixteens", "seventeens",
    "eighteens", "nineteens", "twenties", "thirties", "forties", "fifties",
    "sixties", "seventies", "eighties", "nineties", "hundreds", "thousands",
    "millions", "billions", "trillions", "halves", "quarters", "thirds",
    "fourths", "fifths", "sixths", "sevenths", "eighths", "ninths", "tenths",
    "elevenths", "twelfths", "thirteenths", "fourteenths", "fifteenths",
    "sixteenths", "seventeenths", "eighteenths", "nineteenths", "twentieths",
    "thirtieths", "fortieths", "fiftieths", "sixtieths", "seventieths",
    "eightieths", "ninetieths", "hundredths", "thousandths", "millionths",
    "billionths", "trillionths", "zeroes", "nought", "naught", "nil",
    "fourty" # common misspelling of "forty"
  )
  # Words that are only part of a number when the NEXT word is a number word
  # or another connector word (e.g. "two hundred and five", "minus three").
  possible_number_words <- c("minus", "and", "point")

  # Base strsplit() drops a trailing empty token; appending one separator
  # first keeps the tokens identical to stringr::str_split(phrase, " ")
  # without requiring stringr to be attached.
  tokens <- strsplit(paste0(phrase, " "), " ", fixed = TRUE)[[1]]

  # Compare on a punctuation-free, lower-case copy so that "Three." and
  # "three," are both recognised; the original tokens are what gets returned.
  cleaned <- tolower(gsub("[[:punct:]]", "", tokens))

  is_number <- cleaned %in% number_words
  is_connector <- cleaned %in% possible_number_words
  numberish <- is_number | is_connector
  # Look-ahead flag: the last token has no successor, so it gets FALSE.
  next_numberish <- c(numberish[-1L], FALSE)
  drop <- is_number | (is_connector & next_numberish)

  # Move a sentence-ending period from a dropped token onto the most recent
  # kept token, but never produce ".." and never index before the first token.
  last_kept <- 0L
  for (i in seq_along(tokens)) {
    if (!drop[i]) {
      last_kept <- i
    } else if (last_kept > 0L &&
                 endsWith(tokens[i], ".") &&
                 !endsWith(tokens[last_kept], ".")) {
      tokens[last_kept] <- paste0(tokens[last_kept], ".")
    }
  }

  paste(tokens[!drop], collapse = " ")
}
# Sample paragraph mixing ordinary prose with spelled-out numbers; the pieces
# are joined with single spaces into one string.
example <- paste(
  "This is an example text which contains some numbers written in words",
  "such as one, two, three.",
  "The text one continues to text two and text three.",
  "I also have five hundred fifty dollars.",
  "But I am looking for five hundred thousands three hundred fourty seven",
  "more to invest into some stocks."
)
# Strip the number words from the sample and show the result.
number_word_remover(example)
[1] "This is an example text which contains some numbers written in words such as. The text continues to text and text. I also have dollars. But I am looking for more to invest into some stocks."