I am using a piece of code that produces a histogram of word frequencies for a .txt file converted into a corpus, but I want to search for specific keywords and plot their frequencies. The following code, after loading the required packages, loads the text into a corpus, does pre-processing/cleaning, and generates a histogram of the 10 most frequent words:
# Read a text file and wrap its lines in a tm corpus.
#
# Args:
#   filepath: path of the text file to read.
#
# Returns:
#   The object produced by Corpus() for a VectorSource built from the lines
#   of the file (one element per line read).
createCorpus <- function(filepath) {
  conn <- file(filepath, "r")
  # Close the connection even when readLines() or a later step fails.
  on.exit(close(conn), add = TRUE)
  fulltext <- readLines(conn)
  vs <- VectorSource(fulltext)
  # NOTE(review): readPlain is passed unnamed and `load` may not be a
  # recognised readerControl entry in the installed tm version - confirm.
  Corpus(vs, readerControl=list(readPlain, language="en", load=TRUE))
}
# Path of the text file analysed by this script. It is defined once so that
# the corpus built here and the countWords() calls that reference
# `news_file` all read the same file.
news_file <- "sample.txt"
# Load the file into a corpus.
news_corpus <- createCorpus(news_file)
# Lower-case the text; content_transformer() wraps tolower for use with
# tm_map().
news_corpus_proc <- tm_map(news_corpus, content_transformer(tolower))
# Count the matches of a regular expression in a text file.
#
# Args:
#   filepath: path of the text file to scan.
#   pattern:  regular expression handed to gregexpr().
#
# Returns:
#   A single number: the total count of matches of `pattern`, summed over all
#   lines of the file. Returns 0 for an empty file or when nothing matches.
countWords <- function(filepath, pattern) {
  conn <- file(filepath, "r")
  # Close the connection even when readLines() fails.
  on.exit(close(conn), add = TRUE)
  fulltext <- readLines(conn)
  # gregexpr() is vectorised over the lines; it reports -1 for a line without
  # a match, so only positive start positions are counted.
  hits <- gregexpr(pattern, fulltext)
  per_line <- vapply(hits, function(pos) sum(pos > 0), numeric(1))
  sum(per_line)
}
# Approximate the total number of words: the pattern " * " matches each run
# of one or more spaces in the file.
# NOTE(review): 10148 is an unexplained hard-coded offset - presumably
# specific to the original data set; confirm before using another file.
totwords <- countWords(news_file, " * ") + 10148
# Stop words to count. Each pattern requires a space on both sides, so an
# occurrence at the very start or end of a line is not matched.
mystopwords <- c(" [Aa]nd ", " [Ff]or ", " [Ii]n ", " [Ii]s ", " [Ii]t ",
                 " [Nn]ot ", " [Oo]n ", " [Tt]he ", " [Tt]o ")
# Each stop-word pattern is passed as countWords()'s `pattern` argument;
# `filepath` is forwarded by name.
totstops <- sum(vapply(mystopwords, countWords, numeric(1),
                       filepath = news_file))
# Fraction of the estimated word total made up by the stop words above.
totstops/totwords
# Clean the corpus step by step; the order of the steps is kept as is.
# 1. Drop the English stop words supplied by stopwords().
news_corpus_proc <- tm_map(x = news_corpus_proc, FUN = removeWords,
                           stopwords(kind = "en"))
# 2. Strip punctuation.
news_corpus_proc <- tm_map(x = news_corpus_proc, FUN = removePunctuation)
# 3. Strip digits.
news_corpus_proc <- tm_map(x = news_corpus_proc, FUN = removeNumbers)
# 4. Tidy up whitespace after the removals above.
news_corpus_proc <- tm_map(x = news_corpus_proc, FUN = stripWhitespace)
# Build the document-term matrix from the cleaned corpus.
dtm <- DocumentTermMatrix(x = news_corpus_proc)
# Convert to an ordinary matrix so colSums() can total each term's column.
dtm.matrix <- as.matrix(dtm)
wordcount <- colSums(dtm.matrix)
# Keep the (at most) ten terms with the highest totals, largest first.
topten <- head(sort(wordcount, decreasing = TRUE), n = 10L)
# Turn the top-ten counts into a data frame for plotting.
# NOTE(review): the code below relies on melt() producing a `value` column
# and keeping the terms as row names - confirm for the melt() in use.
dfplot <- as.data.frame(melt(topten))
# Use the data frame's row names as the word labels.
dfplot$word <- rownames(dfplot)
# Order the factor levels by descending count so the bars follow that order.
dfplot$word <- factor(
  dfplot$word,
  levels = dfplot$word[order(dfplot$value, decreasing = TRUE)]
)
# Bar chart of count per word; stat = "identity" plots the values as given.
fig <- ggplot(dfplot, aes(x = word, y = value)) +
  geom_bar(stat = "identity") +
  xlab("Word in Corpus") +
  ylab("Count")
print(fig)