I ran into the same problem as described here, but their solution doesn't seem to work for me. Could anyone help me with it? Thanks.
from SentimentAnalyzer import TweetTokenizer
from SentimentAnalyzer import DataSet
import json
import re
import collections
import nltk.metrics
import nltk.classify
import pickle
tweetsTokenizer = TweetTokenizer()
featureList = []
tweets = []
dataset = DataSet()
train_data = dataset.getTrainData()
test_data = dataset.getTestData()
def extract_features(tweet):
    tweet_words = set(tweet)
    features = {}
    for word in featureList:
        features['contains(%s)' % word] = (word in tweet_words)
    return features
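# For illustration only (a toy example I made up, not my real data): if
# featureList were ['love', 'hate'], then extract_features(['i', 'love', 'it'])
# would return {'contains(love)': True, 'contains(hate)': False}.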
trainsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)
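# trainsets maps each labelled (reference) sentiment to the set of test-tweet
# indices that carry that label; testsets does the same for the predicted
# sentiment, so the two can be compared with nltk.metrics further down.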
NBClassifier = None
train = True
if train:
    # ... some preprocessing code omitted here ...
    # Generate the training set
    print 'Extracting features...'
    training_set = nltk.classify.util.apply_features(extract_features, tweets)
    # Train the Naive Bayes classifier
    print 'Training dataset...'
    NBClassifier = nltk.NaiveBayesClassifier.train(training_set)
    print 'Saving model...'
    f = open('NaiveBayesClassifier.pickle', 'wb')
    pickle.dump(NBClassifier, f)
    f.close()
else:
    f = open('NaiveBayesClassifier.pickle', 'rb')
    NBClassifier = pickle.load(f)
    f.close()
# Test the classifier
print 'Testing the model...'
for i, line in enumerate(test_data):
    tweetJson = json.loads(line)
    labelledSentiment = dataset.getTestSentiment(tweetJson['id_str']).encode('utf-8')
    trainsets[labelledSentiment].add(i)
    testTweet = tweetJson['text'].encode('utf-8')
    processedTestTweet = tweetsTokenizer.preprocess(testTweet)
    sentiment = NBClassifier.classify(extract_features(tweetsTokenizer.getFeatureVector(processedTestTweet)))
    testsets[sentiment].add(i)
    print "testTweet = %s, classified sentiment = %s, labelled sentiment = %s\n" % (testTweet, sentiment, labelledSentiment)
# print "testTweet = %s, classified sentiment = %s, labelled sentiment = %s\n" % (testTweet, sentiment, labelledSentiment)
print 'Positive precision:', nltk.metrics.precision(trainsets['positive'], testsets['positive'])
print 'Positive recall:', nltk.metrics.recall(trainsets['positive'], testsets['positive'])
print 'Positive F-measure:', nltk.metrics.f_measure(trainsets['positive'], testsets['positive'])
print 'Negative precision:', nltk.metrics.precision(trainsets['negative'], testsets['negative'])
print 'Negative recall:', nltk.metrics.recall(trainsets['negative'], testsets['negative'])
print 'Negative F-measure:', nltk.metrics.f_measure(trainsets['negative'], testsets['negative'])
print 'Neutral precision:', nltk.metrics.precision(trainsets['neutral'], testsets['neutral'])
print 'Neutral recall:', nltk.metrics.recall(trainsets['neutral'], testsets['neutral'])
print 'Neutral F-measure:', nltk.metrics.f_measure(trainsets['neutral'], testsets['neutral'])
print 'done'
When I train the classifier and then test it in the same run, I get different results from when I load the previously saved classifier from the pickle file and test it without retraining. I could not figure out why. Thanks.
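In case it helps, here is a minimal sanity check I sketched (the toy training data and feature dicts below are made up, not my real dataset) to see whether pickling and reloading the classifier by itself changes its output:

import pickle
import nltk

# toy training set: list of (feature dict, label) pairs
toy_train = [({'contains(love)': True}, 'positive'),
             ({'contains(hate)': True}, 'negative')]
clf = nltk.NaiveBayesClassifier.train(toy_train)

# round-trip the trained classifier through pickle
with open('toy.pickle', 'wb') as f:
    pickle.dump(clf, f)
with open('toy.pickle', 'rb') as f:
    clf2 = pickle.load(f)

sample = {'contains(love)': True, 'contains(hate)': False}
print clf.classify(sample), clf2.classify(sample)  # I expect the same label twice

This toy round trip should give the same label both times, so my guess is that something outside the pickled object differs between my two runs (for example, featureList is presumably only filled by the preprocessing step that runs when train is True), but I have not been able to confirm that.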