24

Does anybody know a function to convert a text representation of a number into an actual number, e.g. 'twenty thousand three hundred and five' into 20305. I have written numbers in dataframe rows and want to convert them to numbers.

In package qdap, you can replace numeric represented numbers with words (e.g., 1001 becomes one thousand one), but not the other way around:

library(qdap)
replace_number("I like 346457 ice cream cones.")
[1] "I like three hundred forty six thousand four hundred fifty seven ice cream cones."
Paul Hiemstra
  • 59,984
  • 12
  • 142
  • 149
Henk
  • 3,634
  • 5
  • 28
  • 54
  • @Henk I rewrote your question a bit to make it more clear that you need to convert words to number and not vice-versa. – Paul Hiemstra Aug 20 '13 at 10:58
  • 2
    I think the best thing to do is shoot the person who submitted a file with numbers written out as words. OK, seriously, I doubt there's any way to do this other than to write a rather detailed parsing algorithm that has a huge database of all number-words ('one', 'two',...'hundred','thousand,'...'googol') as well as some sort of tree-sorter for precedence. E.g., in your example, there are two "hundred"s, but they have different meanings based on the words which follow them in sequence. – Carl Witthoft Aug 20 '13 at 11:30

3 Answers3

21

Here's a start that should get you to hundreds of thousands.

word2num <- function(word){
    wsplit <- strsplit(tolower(word)," ")[[1]]
    one_digits <- list(zero=0, one=1, two=2, three=3, four=4, five=5,
                       six=6, seven=7, eight=8, nine=9)
    teens <- list(eleven=11, twelve=12, thirteen=13, fourteen=14, fifteen=15,
                  sixteen=16, seventeen=17, eighteen=18, nineteen=19)
    ten_digits <- list(ten=10, twenty=20, thirty=30, forty=40, fifty=50,
                       sixty=60, seventy=70, eighty=80, ninety=90)
    doubles <- c(teens,ten_digits)
    out <- 0
    i <- 1
    while(i <= length(wsplit)){
        j <- 1
        if(i==1 && wsplit[i]=="hundred")
            temp <- 100
        else if(i==1 && wsplit[i]=="thousand")
            temp <- 1000
        else if(wsplit[i] %in% names(one_digits))
            temp <- as.numeric(one_digits[wsplit[i]])
        else if(wsplit[i] %in% names(teens))
            temp <- as.numeric(teens[wsplit[i]])
        else if(wsplit[i] %in% names(ten_digits))
            temp <- (as.numeric(ten_digits[wsplit[i]]))
        if(i < length(wsplit) && wsplit[i+1]=="hundred"){
            if(i>1 && wsplit[i-1] %in% c("hundred","thousand"))
                out <- out + 100*temp
            else
                out <- 100*(out + temp)
            j <- 2
        }
        else if(i < length(wsplit) && wsplit[i+1]=="thousand"){
            if(i>1 && wsplit[i-1] %in% c("hundred","thousand"))
                out <- out + 1000*temp
            else
                out <- 1000*(out + temp)
            j <- 2
        }
        else if(i < length(wsplit) && wsplit[i+1] %in% names(doubles)){
            temp <- temp*100
            out <- out + temp
        }
        else{
            out <- out + temp
        }
        i <- i + j
    }
    return(list(word,out))
}

Results:

> word2num("fifty seven")
[[1]]
[1] "fifty seven"

[[2]]
[1] 57

> word2num("four fifty seven")
[[1]]
[1] "four fifty seven"

[[2]]
[1] 457

> word2num("six thousand four fifty seven")
[[1]]
[1] "six thousand four fifty seven"

[[2]]
[1] 6457

> word2num("forty six thousand four fifty seven")
[[1]]
[1] "forty six thousand four fifty seven"

[[2]]
[1] 46457

> word2num("forty six thousand four hundred fifty seven")
[[1]]
[1] "forty six thousand four hundred fifty seven"

[[2]]
[1] 46457

> word2num("three forty six thousand four hundred fifty seven")
[[1]]
[1] "three forty six thousand four hundred fifty seven"

[[2]]
[1] 346457

I can tell you already that this won't work for word2num("four hundred thousand fifty"), because it doesn't know how to handle consecutive "hundred" and "thousand" terms, but the algorithm can be modified probably. Anyone should feel free to edit this if they have improvements or build on them in their own answer. I just thought this was a fun problem to play with (for a little while).

Edit: Apparently Bill Venables has a package called english that may achieve this even better than the above code.

Thomas
  • 43,637
  • 12
  • 109
  • 140
3

I wrote an R package to do this a few years back, https://github.com/fsingletonthorn/words_to_numbers, which works for numbers up to the decillions.

devtools::install_github("fsingletonthorn/words_to_numbers")

library(wordstonumbers)

example_input <- "twenty thousand three hundred and five"

words_to_numbers(example_input)

[1] "20305"

It also works for much more complex cases similar to those included in the qdap example:

words_to_numbers('I like three hundred forty six thousand four hundred fifty seven ice cream cones.')
[1] "I like 346457 ice cream cones."
FelixST
  • 303
  • 2
  • 8
  • When I try to install the package, I'm getting an error that says "namespace ‘rlang’ 1.0.2 is already loaded, but >= 1.0.3 is required." I already tried uninstalling and re-installing rlang, but am still getting the same error. Any idea what I should do? – Rasputin Nov 03 '22 at 22:46
  • 1
    @Rasputin ~ I can't reproduce your error, but it sounds like you aren't successfully updating rlang to v. >=1.0.3. I would try closing down and reopening a fresh R session (i.e., without any objects in memory), and then `install.packages("rlang")`, check that it updates properly, restart R again after rlang >1.0.3 has been installed, and then see if you still get the error on installation of words_to_numbers. – FelixST Dec 11 '22 at 22:54
-2

Here's what I think is a better solution.

    library(stringdist)
    library(gdata)
    #Convert numeric words to digits
isNumericWord=function(string, dist=1, method="dl"){
  nums=c("zero","one","two","three","four","five","six","seven","eight","nine",
         "ten","eleven","twelve","thirteen","fourteen","fifteen","sixteen","seventeen","eighteen","nineteen",
         "twenty","thirty","forty","fifty","sixty","seventy","eighty","ninety",
         "hundred","thousand","million","billion","trillion")
  return(any(stringdist(tolower(string),nums,method=method)<=dist))
}
numberTypes=function(string, dist=1, method="dl"){
  nums=c("zero","one","two","three","four","five","six","seven","eight","nine",
         "ten","eleven","twelve","thirteen","fourteen","fifteen","sixteen","seventeen","eighteen","nineteen",
         "twenty","thirty","forty","fifty","sixty","seventy","eighty","ninety",
         "hundred","thousand","million","billion","trillion")
  string=gsub("[[:punct:]]"," ",string)
  wrdsplit=strsplit(string,split=" ")[[1]]
  wrdsplit=wrdsplit[wrdsplit!=""]
  #Handle number types
  wrdsplit=ifelse(stringdist("first",tolower(wrdsplit),method=method)<=dist,"one st",wrdsplit)
  wrdsplit=ifelse(stringdist("second",tolower(wrdsplit),method=method)<=dist,"two nd",wrdsplit)
  wrdsplit=ifelse(stringdist("third",tolower(wrdsplit),method=method)<=dist &
                    tolower(substr(wrdsplit,nchar(wrdsplit),nchar(wrdsplit)))!="y","three rd",wrdsplit)
  wrdsplit=ifelse(stringdist("fourth",tolower(wrdsplit),method=method)<=dist & 
                    tolower(substr(wrdsplit,nchar(wrdsplit),nchar(wrdsplit)))!="y","four th",wrdsplit)
  wrdsplit=ifelse(stringdist("fifth",tolower(wrdsplit),method=method)<=dist & 
                    tolower(substr(wrdsplit,nchar(wrdsplit),nchar(wrdsplit)))!="y","five th",wrdsplit)
  wrdsplit=ifelse(stringdist("sixth",tolower(wrdsplit),method=method)<=dist & 
                    tolower(substr(wrdsplit,nchar(wrdsplit),nchar(wrdsplit)))!="y","six th",wrdsplit)
  wrdsplit=ifelse(stringdist("seventh",tolower(wrdsplit),method=method)<=dist &
                    tolower(substr(wrdsplit,nchar(wrdsplit),nchar(wrdsplit)))!="y","seven th",wrdsplit)
  wrdsplit=ifelse(stringdist("eighth",tolower(wrdsplit),method=method)<=dist &
                    tolower(substr(wrdsplit,nchar(wrdsplit),nchar(wrdsplit)))!="y","eight th",wrdsplit)
  wrdsplit=ifelse(stringdist("ninth",tolower(wrdsplit),method=method)<=dist &
                    tolower(substr(wrdsplit,nchar(wrdsplit),nchar(wrdsplit)))!="y","nine th",wrdsplit)
  wrdsplit=ifelse(stringdist("tenth",tolower(wrdsplit),method=method)<=dist,"ten th",wrdsplit)
  wrdsplit=ifelse(stringdist("twentieth",tolower(wrdsplit),method=method)<=dist,"twenty th",wrdsplit)
  wrdsplit=ifelse(stringdist("thirtieth",tolower(wrdsplit),method=method)<=dist,"thirty th",wrdsplit)
  wrdsplit=ifelse(stringdist("fortieth",tolower(wrdsplit),method=method)<=dist,"forty th",wrdsplit)
  wrdsplit=ifelse(stringdist("fiftieth",tolower(wrdsplit),method=method)<=dist,"fifty th",wrdsplit)
  wrdsplit=ifelse(stringdist("sixtieth",tolower(wrdsplit),method=method)<=dist,"sixty th",wrdsplit)
  wrdsplit=ifelse(stringdist("seventieth",tolower(wrdsplit),method=method)<=dist,"seventy th",wrdsplit)
  wrdsplit=ifelse(stringdist("eightieth",tolower(wrdsplit),method=method)<=dist,"eighty th",wrdsplit)
  wrdsplit=ifelse(stringdist("ninetieth",tolower(wrdsplit),method=method)<=dist,"ninety th",wrdsplit)
  #Handle other number words that end in "th"
  if(length(wrdsplit)>0){
    for(i in 1:length(wrdsplit)){
      substr_end=substr(wrdsplit[i],(nchar(wrdsplit[i])-1),nchar(wrdsplit[i]))
      substr_beg=substr(wrdsplit[i],1,(nchar(wrdsplit[i])-2))
      if(substr_end=="th" & nchar(wrdsplit[i])!=2 & any(stringdist(tolower(substr_beg),nums,method=method)<=dist)){
        wrdsplit[i]=paste(substr_beg, substr_end,sep=" ")
      }
    }
    return(gsub("  "," ",paste(wrdsplit,collapse=" ")))
  }else{
    return("")
  }
}

#Convert number words to digits
Word2Num=function(string, dist=1, method="dl"){
  original=string
  #Define numbers
  one_digits = list(zero=0, one=1, two=2, three=3, four=4, five=5,
                    six=6, seven=7, eight=8, nine=9)
  teens = list(eleven=11, twelve=12, thirteen=13, fourteen=14, fifteen=15,
               sixteen=16, seventeen=17, eighteen=18, nineteen=19)
  ten_digits = list(ten=10, twenty=20, thirty=30, forty=40, fifty=50,
                    sixty=60, seventy=70, eighty=80, ninety=90)
  large_digits = list(hundred=100, thousand=1000, million=1e6, billion=1e9, trillion=1e12)
  double_digits = c(teens,ten_digits)

  #Split the string into words
  string=gsub("-"," ",gsub(" & ", " and ",string,ignore.case=T))
  string=numberTypes(string)
  wrdsplit=strsplit(tolower(string)," ")[[1]]
  wrdsplit=wrdsplit[wrdsplit!=""]
  isNumber=apply(data.frame(wrdsplit),1,isNumericWord)

  #Find groups of numbers
  if(exists("groups")){
    suppressWarnings(rm(groups))
  }
  i=1
  while(i <= length(wrdsplit)){
    if(isNumber[i]==T){
      if(!exists("groups")){
        groups=list(wrdsplit[i])
      }else if(exists("groups")){
        groups=c(groups, wrdsplit[i])
      }
      for(j in (i+1):length(wrdsplit)){
        if(isNumber[j]){
          groups[[length(groups)]]=c(groups[[length(groups)]],wrdsplit[j])
          i=j+1
        }else{
          i=i+1
          break
        }
      }
    }else{
      i=i+1
    }
  }

  #Convert numeric words to numbers
  if(exists("groups")){
    groupNums=groups
    for(j in 1:length(groups)){
      for(i in 1:length(groups[[j]])){
        #If word is a single digit number
        if(any(stringdist(groups[[j]][i],names(one_digits),method=method)<=dist & 
               tolower(substr(groups[[j]][i],nchar(groups[[j]][i]),nchar(groups[[j]][i])))!="y")){
          #If word is a single digit number
          groupNums[[j]][i]=one_digits[stringdist(groups[[j]][i],names(one_digits),method=method)<=dist][[1]]
        }else if(any(stringdist(groups[[j]][i],names(double_digits),method=method)<=dist)){
          #If word is a double digit number
          groupNums[[j]][i]=double_digits[stringdist(groups[[j]][i],names(double_digits),method=method)<=dist][[1]]
        }else if(any(stringdist(groups[[j]][i],names(large_digits),method=method)<=dist)){
          #If word is a large digit number
          groupNums[[j]][i]=large_digits[stringdist(groups[[j]][i],names(large_digits),method=method)<=dist][[1]]
        }
      }
    }

    #Convert the separated numbers to a single number
    defscipen=options("scipen")[[1]]
    options(scipen=999)
    for(i in 1:length(groups)){
      if(length(groupNums[[i]])==1){
        groupNums[[i]]=as.numeric(groupNums[[i]][1])
      }else{
        while(length(groupNums[[i]])>=2){
          if(nchar(groupNums[[i]][2])>nchar(groupNums[[i]][1])){
            #If the next word has more digits than the current word, multiply them
            temp=as.numeric(groupNums[[i]][1])*as.numeric(groupNums[[i]][2])
          }else if(nchar(groupNums[[i]][2])<nchar(groupNums[[i]][1])){
            #if the next word has less digits than the current word, add them
            temp=as.numeric(groupNums[[i]][1])+as.numeric(groupNums[[i]][2])
          }
          #Combine the results
          if(length(groupNums[[i]])>2){
            groupNums[[i]]=c(temp, groupNums[[i]][3:length(groupNums[[i]])])
          }else{
            groupNums[[i]]=temp
          }
        }
      }
    }
    #Recreate the original string
    groupNums=lapply(groupNums, as.character)
    options(scipen=defscipen)
    for(i in 1:length(groups)){
      wrdsplit[which(wrdsplit==groups[[i]][1])]=groupNums[[i]][1]
      if(length(groups[[i]]>1)){
        wrdsplit[which(wrdsplit==groups[[i]][2:length(groups)])]=""
      }
    }
    #Combine numbers with their endings
    wrdsplit=wrdsplit[wrdsplit!=""]
    if(any(wrdsplit[which(wrdsplit %in% unlist(groupNums))+1] %in% c("rd","th","st","nd"))){
      locs=which(wrdsplit %in% unlist(groupNums))
      for(i in length(locs):1){
        wrdsplit[locs[i]]=paste(wrdsplit[c(locs[i],(locs[i]+1))],collapse="")
        wrdsplit=wrdsplit[-(locs[i]+1)]
      }
    }
    return(trim(paste(wrdsplit,collapse=" ")))
  }else{
    return(original)
  }
}
ajhubb
  • 1
  • 1
  • This code doesn't work unfortunately. Here are some tests (after running it): > isNumericWord("one hundred") [1] FALSE > Word2Num("one hundred") Error in groups[[j]][i] : object of type 'closure' is not subsettable > isNumericWord("100") [1] FALSE > Word2Num("five thousand") Error in groups[[j]][i] : object of type 'closure' is not subsettable – Rasputin Nov 03 '22 at 22:51