I have calculated the bag of words for 'yelp.csv', 'yelpp.csv', 'yelpn.csv' and created the matrix of individuals dataset's word frequency. Now, I want to compare the bag of words of yelp with yelpn and check how many words in yelp appears in yelpn and their frequency and store it in a variable as matrix, then same for yelpp. The yelp contains both the positive and negative. yelpp, only the positive and yelpn, only the negative. can anyone complete the code? i donno whether this code is relevant,i hope so.
getwd()
setwd("/Users/ash/RProjects/exc")
getwd()
df <- read.csv("yelp.CSV",header = TRUE,quote="\"",stringsAsFactors= TRUE,
strip.white = TRUE)
df
dfd<-as.character(df[,2])
dfd
df2<-as.character(df[,1])
df2
words <- readLines(system.file("stopwords", "english.dat",
package = "tm"))
s<-remove_stopwords(dfd, words, lines = TRUE)
s
print(paste("****Stopwords are removed successfully****"))
n<-removeNumbers(s)
n
t<-removePunctuation(n, preserve_intra_word_dashes = FALSE)
t
#pos
dfp <- read.csv("yelpp.CSV",header = TRUE,quote="\"",stringsAsFactors= TRUE,
strip.white = TRUE)
dfp
dfdp<-as.character(dfp[,2])
dfdp
df2p<-as.character(dfp[,1])
df2p
wordsp <- readLines(system.file("stopwords", "english.dat",
package = "tm"))
sp<-remove_stopwords(dfdp, words, lines = TRUE)
sp
print(paste("****Stopwords are removed successfully****"))
np<-removeNumbers(sp)
np
tp<-removePunctuation(np, preserve_intra_word_dashes = FALSE)
tp
#neg
dfn <- read.csv("yelpn.CSV",header = TRUE,quote="\"",stringsAsFactors= TRUE,
strip.white = TRUE)
dfn
dfdn<-as.character(dfn[,2])
dfdn
df2n<-as.character(dfn[,1])
df2n
wordsn <- readLines(system.file("stopwords", "english.dat",
package = "tm"))
sn<-remove_stopwords(dfdn, words, lines = TRUE)
sn
print(paste("****Stopwords are removed successfully****"))
nn<-removeNumbers(sn)
nn
tn<-removePunctuation(nn, preserve_intra_word_dashes = FALSE)
tn
#bag
b<-bag_o_words(t, apostrophe.remove = TRUE)
b
b.mat = as.matrix(b)
b.mat
bp<-bag_o_words(tp, apostrophe.remove = TRUE)
bp
bp.mat = as.matrix(bp)
bp.mat
bn<-bag_o_words(tn, apostrophe.remove = TRUE)
bn
bn.mat = as.matrix(bn)
bn.mat
#frequent terms
frequent_terms <- freq_terms(b.mat, 2000)
frequent_terms
frequent_termsp <- freq_terms(tp, 2000)
frequent_termsp
frequent_termsn <- freq_terms(tn, 2000)
frequent_termsn