2

I've built a spell check function for a sample of 1000 rows to ensure its efficiency, using the 'hunspell' package and the Australian English dictionary. The spell-checker ignores abbreviations. My actual data has close to 2 million lines, I therefore need to convert the 'for' loops into the 'apply' family functions.

I'm almost there I, but the last part isn't working. Below are the original for loop functions:

for(i in 1:nrow(data_words))
{
  print(i)
  
  string1 <- data_words$title[i]
  string2 <- ""
  
  for(j in 1:sapply(strsplit(string1, " "), length))
  {
    w <- word(string1, j)
    
    # if word is not an abbreviation
    if (!isAbbreviation(w))
    {
      # correct word
      w <- correctText(w)
    }
    
    string2 <- paste0(string2, w, sep = " ")
    
    # add word in new column 'spell_check'
    data_words$spell_check[i] <- string2
    
  }
}

isAbbreviation <- function(x)
{
  abb = FALSE
  
  # all capitalised letters
  if(str_detect(x, "^[:upper:]+$"))
  {
    abb = TRUE
  }
  
  # dealing with abbs that end in an 's'
  b = str_extract_all(x, "(\\b[A-Z]+\\b)|\\b[A-Z]+s+\\b")
  list_empty = rlang::is_empty(unlist(b))
  
  if(!list_empty)
  {
    abb = TRUE
  }
  return(abb)
}

correctText = function(x)
{
  sapply(1:length(x), function(y)
  {
    # get misspelled words
    bad_words = hunspell(x[y], dict = "en_AU")[[1]]
    
    # if list of misspelled words is not empty
    if(length(bad_words))
    {
      for (i in 1:length(bad_words))
      {
        list_empty = rlang::is_empty(unlist(hunspell_suggest(bad_words[i], 
                                                             dict = "en_AU")))
        # if suggestion list is not empty
        if(!list_empty)
        {
          # correct word
          good = unlist(lapply(hunspell_suggest(bad_words[i], dict = "en_AU"), `[[`, 1))
        }
        else
        {
          # else leave word is it is
          good = bad_words[i]
        }
        # replace mispelled words with corrected ones
        x[y] <<- gsub(bad_words[i], good, x[y])
      }
    }
  })
  x
}

Reproducible sample of phrases to be corrected:

library(dplyr)
library(stringr)
library(hunspell)
library(textclean)

sample <- 
  c("Paaediatrics AsseSssing Febrile Infant Child", "Manuual Handling Traain Trainer", "Catheterise CTHs", "Labelinsfbsbinsajectables", "Mentouring", "techhnical", "Basic Life Support BSL", "BloodSafe cliniiical transfusion practice", "Astthma", "Zika virus preegnancy update")

data_words <- data.frame(matrix(nrow = length(sample), ncol = 1))
names(data_words) <- "title"
data_words$title <- sample
data_words <- as_tibble(data_words)

I had a go at it, please refer to the below functions:

# the abbreviation function remains the same

# function to correct a misspelled word
correctTheWord <- function(bad_word)
{
  # print(bad_word)
  
  if (!isAbbreviation(bad_word))
  {
    list_empty = rlang::is_empty(unlist(hunspell_suggest(bad_word,
                                                         dict = "en_AU")))
    
    if (!list_empty)
    {
      good = unlist(
        lapply(hunspell_suggest(bad_word, dict = "en_AU"),
               `[[`,
               1
        ))
    }
    else
    {
      good = bad_word
    }
  }
  
  else
  {
    good = bad_word
  }
}

# correct whole row function
correctText = function(x)
{
  sapply(1:length(x), function(y)
  {
    bad = hunspell(x[y], dict = "en_AU")[[1]]
    
    if (length(bad))
    {
      return(mgsub(x, bad, lapply(bad, correctTheWord)))
    }
    else
    {
      return(x)
    }
  })
}


# testing the first 2 titles
correctText("Paaediatrics AsseSssing Febrile Infant Child")
correctText("Manuual Handling Traain Trainer")


# this is not working 
data_words$spell_check <- 
  apply(data_words[, 1], 2,  correctText)

Also, can my functions can be simplified further?

Yeshyyy
  • 669
  • 6
  • 21

1 Answers1

2

This will identify and replace incorrectly spelt words with the correct spelling. Note that it will ignore abbreviations as desired, and it assumes all words are separated by a space.


# First, define isAbbreviation

isAbbreviation <- function(x)
{
  abb = FALSE
  
  # all capitalised letters
  if(str_detect(x, "^[:upper:]+$"))
  {
    abb = TRUE
  }
  
  # dealing with abbs that end in an 's'
  b = str_extract_all(x, "(\\b[A-Z]+\\b)|\\b[A-Z]+s+\\b")
  list_empty = rlang::is_empty(unlist(b))
  
  if(!list_empty)
  {
    abb = TRUE
  }
  return(abb)
}


sample <- 
  c("Paaediatrics AsseSssing Febrile Infant Child", "Manuual Handling Traain Trainer", 
    "Catheterise CTHs", "Labelinsfbsbinsajectables", "Mentouring", "techhnical", 
    "Basic Life Support BSL", "BloodSafe cliniiical transfusion practice", "Astthma", 
    "Zika virus preegnancy update", "Basic Labelinsfbsbinsajectables technical")

data_words <- data.frame(matrix(nrow = length(sample), ncol = 1))
names(data_words) <- "title"
data_words$title <- sample
data_words <- as_tibble(data_words)


correct_spelling <- function(text) {
  
  words <- text %>% 
  str_split(" ") %>% 
  .[[1]]

  abbreviation <- words %>% sapply(isAbbreviation) %>% 
    unname
  
  # Abbreviations return false here, which is inconsequential since we don't replace them 
  correct <- words %>% 
    sapply(function(x) {hunspell_check(x, dict = dictionary("en_AU")) } ) %>% 
    unname
  
  # Correct the word if incorrect and not abbreviation
  if(!any(!(!abbreviation) & (!correct))) {
  
    misspelled_and_not_abbreviation <- words[(!abbreviation) & (!correct)] 
  
  
    suggestions <- misspelled_and_not_abbreviation %>% 
      hunspell_suggest(dict = dictionary("en_AU")) 
    
    suggested_words <- sapply(seq_along(suggestions), function(y, i) 
      { ifelse(length(y[[1]]) == 0, misspelled_and_not_abbreviation[i], y[[i]][1]) }, 
      y=suggestions)

    words[as.logical((!abbreviation) * (!correct))] <- suggested_words 
  
  }
  
  words %>% paste0(collapse = " ")

  
}

data_words$spell_check2 <- data_words$title %>% sapply(correct_spelling) %>% unname

which gives

data_words

#    title                                        spell_check2                              
#    <chr>                                        <chr>                                     
#  1 Paaediatrics AsseSssing Febrile Infant Child Paediatrics Assessing Febrile Infant Child
#  2 Manuual Handling Traain Trainer              Manual Handling Train Trainer             
#  3 Catheterise CTHs                             Catheterise CTHs                          
#  4 Labelinsfbsbinsajectables                    Labelinsfbsbinsajectables                 
#  5 Mentouring                                   Mentoring                                 
#  6 techhnical                                   technical                                 
#  7 Basic Life Support BSL                       Basic Life Support BSL                    
#  8 BloodSafe cliniiical transfusion practice    Blood Safe clinical transfusion practice  
#  9 Astthma                                      Asthma                                    
# 10 Zika virus preegnancy update                 Erika virus pregnancy update              
# 11 Basic Labelinsfbsbinsajectables technical    Basic Labelinsfbsbinsajectables technical

stevec
  • 41,291
  • 27
  • 223
  • 311
  • 1
    Thank you! I think you skipped the part where, if hunspell suggestion list is empty for a misspelled word, skip correcting that word. Would you mind editing it please so I can accept your answer? – Yeshyyy Jul 08 '20 at 06:02
  • 1
    @Yeshyyy I will take a look. Do you have example input that I can test on? – stevec Jul 08 '20 at 06:03
  • 1
    I edited the sample. The 4th row word doesn't have a suggestion list. It should prompt an error there. – Yeshyyy Jul 08 '20 at 06:08
  • 1
    @Yeshyyy if the input was multiple words and hunspell had no suggestion for one of them, do you want the good words returned? E.g. "Basic Labelinsfbsbinsajectables technical" would return "Basic techincal" - is this correct? – stevec Jul 08 '20 at 06:10
  • 1
    Sorry forgot to precise that. I want the misspelled word returned too, therefore "Basic Labelinsfbsbinsajectables technical". If 'Basic' or 'technical' are mispelled, they should be corrected. E.g "Basic Labelinsfbsbinsajectables techhnical" returns "Basic Labelinsfbsbinsajectables technical" – Yeshyyy Jul 08 '20 at 06:13
  • 1
    @Yeshyyy I updated (using a technique [here](https://stackoverflow.com/questions/9950144/access-lapply-index-names-inside-fun) to access the index inside s/lapply) – stevec Jul 08 '20 at 12:10
  • thank you, did you say you've put the index counter in the function? Thanks for providing the link, I'll try adding it on my own. – Yeshyyy Jul 09 '20 at 10:59
  • 1
    @Yeshyyy yep, this `sapply(seq_along(suggestions), function(y, i) etc` allows us to access the index inside the `sapply` (index is `i`). I got curious to see if there's a better way, and it turns out there is. See [here](https://stackoverflow.com/a/62795861/5783745). I will use that way in the future! – stevec Jul 09 '20 at 11:02
  • @stevec, can you edit your code to include `isAbbreviation()` custom function? I have similar situation but `isAbbreviation()` in the original function is not clear. Thanks. – William Oct 09 '20 at 10:05
  • @William no problems. Done – stevec Oct 09 '20 at 10:09
  • Hi @stevec, many thanks. However, there is an error message: `Error in if (str_detect(x, "^[:upper:]+$")) { : missing value where TRUE/FALSE needed`, due to NA. What I did was to remove `NA`, and it was fine, but that is not what I want. But, is it possible for `if (str_detect(x, "^[:upper:]+$"))` to skip `NA` values? I want to keep the `NA` because it is linked to other columns. – William Oct 12 '20 at 02:12