I have seen this problem on stack before but the solutions didnt work for me, see Save and Load testing classify Naive Bayes Classifier in NLTK in another method. I am baffled as to why my accuracy is far different when I load the pickled classifier as opposed to just training and classifying in the same program. The first code block is calling the pickled classifier, the second is doing all the training and classifying together. The second method gives an accuracy of 99% while the first gives 81%...
Academic_classifier= None
Academic_classifier=pickle.load(open('Academic_classifier.pickle','rb'))
tweets=[]
readdata=csv.reader(open('C:\Users\Troy\Documents\Data\Gold_test.csv','r'))
for row in readdata:
tweets.append(row)
Header = tweets[0]
tweets.pop(0)
Academic_test_tweets=tweets[:]
Tweets=[]
for (words, sentiment) in tweets:
bigram=[]
bigram_list=[]
words_filtered = [e.lower() for e in WordPunctTokenizer().tokenize(words) if len(e) >= 3]
words_filtered=[re.sub(r'(.)\1+', r'\1\1', e) for e in words_filtered if len(e)>=3]
bigram_words=bigrams(words_filtered)
for x in bigram_words:
bigram.append(x)
for (bi) in bigram:
bigram_word=bi[0]+bi[1]
bigram_list.append(bigram_word)
list_to_append=words_filtered+bigram_list
Tweets.append((list_to_append, sentiment))
Academic_test_tweets_words=Tweets[:]
word_features = get_word_features(get_words_in_tweets(Academic_test_tweets_words))
Academic_test_set = nltk.classify.apply_features(extract_features,Academic_test_tweets_words)
print(nltk.classify.accuracy(Academic_classifier, Academic_test_set), 'tweet corpus used in academic paper Sentiment Analysis on the Social Networks Using Stream Algorithms Authors: Nathan Aston, Timothy Munson, Jacob Liddle, Garrett Hartshaw, Dane Livingston, Wei Hu *compare to their accuracy of 87.5%')
As opposed to this code where I train and test the accuracy. I use the same definitions for everything so I know the problem isn't with the definitions. The only difference is the pickled classifier... what is happening?
tweets=[]
readdata=csv.reader(open('C:\Users\Troy\Documents\Data\Gold_test.csv','r'))
for row in readdata:
tweets.append(row)
Header = tweets[0]
tweets.pop(0)
Academic_test_tweets=tweets[:]
Tweets=[]
for (words, sentiment) in tweets:
bigram=[]
bigram_list=[]
words_filtered = [e.lower() for e in WordPunctTokenizer().tokenize(words) if len(e) >= 3]
words_filtered=[re.sub(r'(.)\1+', r'\1\1', e) for e in words_filtered if len(e)>=3]
bigram_words=bigrams(words_filtered)
for x in bigram_words:
bigram.append(x)
for (bi) in bigram:
bigram_word=bi[0]+bi[1]
bigram_list.append(bigram_word)
list_to_append=words_filtered+bigram_list
Tweets.append((list_to_append, sentiment))
Academic_test_tweets_words=Tweets[:]
word_features = get_word_features(get_words_in_tweets(Academic_test_tweets_words))
Academic_test_set = nltk.classify.apply_features(extract_features,Academic_test_tweets_words)
tweets=[]
readdata=csv.reader(open('C:\Users\Troy\Documents\Data\Gold_train.csv','r'))
for row in readdata:
tweets.append(row)
Header = tweets[0]
tweets.pop(0)
AcademicTweets=tweets[:]
Tweets=[]
for (words, sentiment) in tweets:
bigram=[]
bigram_list=[]
words_filtered = [e.lower() for e in WordPunctTokenizer().tokenize(words) if len(e) >= 3]
words_filtered=[re.sub(r'(.)\1+', r'\1\1', e) for e in words_filtered if len(e)>=3]
bigram_words=bigrams(words_filtered)
for x in bigram_words:
bigram.append(x)
for (bi) in bigram:
bigram_word=bi[0]+bi[1]
bigram_list.append(bigram_word)
list_to_append=words_filtered+bigram_list
Tweets.append((list_to_append, sentiment))
AcademicWords=Tweets[:]
word_features = get_word_features(get_words_in_tweets(AcademicWords))
Academic_training_set = nltk.classify.apply_features(extract_features,AcademicWords)
Academic_classifier = nltk.NaiveBayesClassifier.train(Academic_training_set)
#Negative_classifier.show_most_informative_features(1)
print(nltk.classify.accuracy(Academic_classifier, Academic_test_set), 'tweet corpus used in academic paper Sentiment Analysis on the Social Networks Using Stream Algorithms Authors: Nathan Aston, Timothy Munson, Jacob Liddle, Garrett Hartshaw, Dane Livingston, Wei Hu *compare to their accuracy of 87.5%')
pickle.dump(Academic_classifier, open('Academic_classifier.pickle','wb'))