Fetching the data from the URL:
suppressMessages(library(readr))
suppressMessages(library(RCurl))
amazon_url <- getURL('http://s3.amazonaws.com/assets.datacamp.com/production/course_935/datasets/500_amzn.csv',
                     ssl.verifyhost = FALSE, ssl.verifypeer = FALSE)
amazon <- read.csv(textConnection(amazon_url), header = TRUE)
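A quick check that the file was parsed into a data frame with the expected columns:
str(amazon)  # structure of the imported data frame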
Create amazon_cons:
amazon_cons <- amazon$cons
Build cleaning function based on the qdap package for text organization:
suppressWarnings(library(qdap))
qdap_clean <- function(x) {
  x <- replace_abbreviation(x)
  x <- replace_contraction(x)
  x <- replace_number(x)
  x <- replace_ordinal(x)
  x <- replace_symbol(x)
  x <- tolower(x)
  return(x)
}
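As a sanity check, the cleaner can be tried on a single made-up string (the sentence below is only an illustration):
qdap_clean("I'd say the 2nd shift isn't bad & pays $15")  # expands contractions, ordinals and symbols, then lowercases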
Build cleaning function based on the tm package for text organization:
suppressWarnings(library(tm))
tm_clean <- function(corpus) {
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, stripWhitespace)
  corpus <- tm_map(corpus, removeWords,
                   c(stopwords("en"), "Amazon", "company"))
  return(corpus)
}
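For illustration, the same function applied to a tiny one-document corpus (the sample sentence is made up):
sample_corp <- VCorpus(VectorSource("Amazon is a great company, but the pay is low."))
sample_corp <- tm_clean(sample_corp)
content(sample_corp[[1]])  # cleaned text of the single document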
Word cleaning:
amzn_cons <- qdap_clean(amazon_cons)
amzn_cons <- VCorpus(VectorSource(amzn_cons))
amzn_cons_corp <- tm_clean(amzn_cons)
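To see what the cleaned corpus looks like before building the term-document matrix:
amzn_cons_corp                # corpus summary
content(amzn_cons_corp[[1]])  # text of the first cleaned document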
Build custom function to extract bigram features:
suppressWarnings(library(RWeka))
tokenizer <- function(x)
  NGramTokenizer(x, Weka_control(min = 2, max = 2))
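Called directly on a plain character string, the tokenizer can be checked like this (the sentence is made up):
NGramTokenizer("slow career growth and long hours", Weka_control(min = 2, max = 2))  # should return the bigrams of the sentence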
Apply the tokenization function to get bigrams:
amzn_c_tdm <- TermDocumentMatrix(
  amzn_cons_corp,
  control = list(tokenize = tokenizer)
)
This results in the following error:
Error in .jcall("RWekaInterfaces", "[S", "tokenize", .jcast(tokenizer, :
java.lang.NullPointerException
How can I solve this error?