I have been strugling with NaiveBayes classification for many hours now. The file labeledTrainData.tsv consists of 3 columns and 25000 rows. The columns are id (chr), sentiment (int) and review (chr).
I try to predict the sentiment of the test data, but no matter what I do, it returns some 50/50 guesswork. The truth tables produced always look like:
prediction 0 1
0 199 201
1 0 0
What am I doing wrong? Code below
library(e1071)
library(tm)
trainData <- read.table("data/labeledTrainData.tsv", header = TRUE, stringsAsFactors = FALSE)
trainData <- trainData[sample(nrow(trainData)),]
range.train <- c(1:1600)
range.test <- c(1601:2000)
df <- trainData[1:20000,]
df$sentiment <- as.factor(df$sentiment)
df.train <- df[range.train,]
df.test <- df[range.test,]
corpus <- Corpus(VectorSource(df$review))
corpus.clean <- cleanText(corpus)
dtm <- DocumentTermMatrix(corpus.clean)
trainNB <- as.matrix(dtm[range.train,])
testNB <- as.matrix(dtm[range.test,])
system.time( classifier <- naiveBayes(trainNB, df.train$sentiment) )
system.time( prediction <- predict(classifier, testNB[,-1]) )
table(prediction, df.test$sentiment)
convert_count <- function(x) {
y <- ifelse(x > 0, 1, 0) # 1 might have to be changed to x
y <- factor(y, levels=c(0,1), labels=c("No", "Yes"))
}
cleanText <- function(text) {
text %>%
tm_map(content_transformer(tolower)) %>%
tm_map(removeWords, stopwords('en')) %>%
tm_map(toSpace, "<br />") %>%
tm_map(toSpace, "/") %>%
tm_map(removePunctuation) %>%
tm_map(removeNumbers) %>%
tm_map(stemDocument)
}
toSpace <- content_transformer(function(x, pattern) {
return (
gsub(pattern, " ", x)
)
})