
I've run into the same problem as described here, but the solution there doesn't seem to work for me. Could anyone help me figure out what's going on? Thanks.

from SentimentAnalyzer import TweetTokenizer
from SentimentAnalyzer import DataSet
import json
import re
import collections
import nltk.metrics
import nltk.classify
import pickle

tweetsTokenizer = TweetTokenizer()
featureList = []
tweets = []
dataset = DataSet()
train_data = dataset.getTrainData()
test_data = dataset.getTestData()

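# Build a binary bag-of-words feature dict: one boolean per word in the
# global featureList, marking whether the tweet contains that word.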
def extract_features(tweet):
    tweet_words = set(tweet)
    features = {}
    for word in featureList:
        features['contains(%s)' % word] = (word in tweet_words)
    return features

trainsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)
NBClassifier = None

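# Toggle between retraining from scratch and loading a previously pickled model.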
train = True
if train:
    # ... some preprocessing code here (populates featureList and tweets) ...
    # Generate the training set
    print 'Extracting features...'
    training_set = nltk.classify.util.apply_features(extract_features, tweets)

    # Train the Naive Bayes classifier
    print 'Training dataset...'
    NBClassifier = nltk.NaiveBayesClassifier.train(training_set)

    print 'Saving model...'
    with open('NaiveBayesClassifier.pickle', 'wb') as f:
        pickle.dump(NBClassifier, f)
else:
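    # Load the previously pickled classifier instead of retraining.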
    with open('NaiveBayesClassifier.pickle', 'rb') as f:
        NBClassifier = pickle.load(f)

# Test the classifier
print 'Testing the model...'
for i, line in enumerate(test_data):
    tweetJson = json.loads(line)
    labelledSentiment = dataset.getTestSentiment(tweetJson['id_str']).encode('utf-8')
    trainsets[labelledSentiment].add(i)

    testTweet = tweetJson['text'].encode('utf-8')
    processedTestTweet = tweetsTokenizer.preprocess(testTweet)
    sentiment = NBClassifier.classify(extract_features(tweetsTokenizer.getFeatureVector(processedTestTweet)))
    testsets[sentiment].add(i)
    print "testTweet = %s, classified sentiment = %s, labelled sentiment = %s\n" % (testTweet, sentiment, labelledSentiment)
    # print "testTweet = %s, classified sentiment = %s, labelled sentiment = %s\n" % (testTweet, sentiment, labelledSentiment)

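# trainsets maps each labelled (gold) sentiment to the set of tweet indices
# carrying that label; testsets maps each predicted sentiment to its indices.
# nltk.metrics compares the two sets per class.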
print 'Positive precision:', nltk.metrics.precision(trainsets['positive'], testsets['positive'])
print 'Positive recall:', nltk.metrics.recall(trainsets['positive'], testsets['positive'])
print 'Positive F-measure:', nltk.metrics.f_measure(trainsets['positive'], testsets['positive'])
print 'Negative precision:', nltk.metrics.precision(trainsets['negative'], testsets['negative'])
print 'Negative recall:', nltk.metrics.recall(trainsets['negative'], testsets['negative'])
print 'Negative F-measure:', nltk.metrics.f_measure(trainsets['negative'], testsets['negative'])
print 'Neutral precision:', nltk.metrics.precision(trainsets['neutral'], testsets['neutral'])
print 'Neutral recall:', nltk.metrics.recall(trainsets['neutral'], testsets['neutral'])
print 'Neutral F-measure:', nltk.metrics.f_measure(trainsets['neutral'], testsets['neutral'])

print 'done'

When I train the classifier and then test it in the same run, I get different results from when I skip training and load the pickled classifier instead. I could not figure out why. Thanks.
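One thing I have been meaning to check: extract_features reads the global featureList, which as far as I can tell is only populated by the preprocessing inside the training branch. Below is a minimal sketch (names are illustrative, not my actual code) of how I would pickle featureList together with the classifier and verify that a freshly loaded copy agrees with the in-memory one:

# Hypothetical check: persist featureList alongside the classifier, since
# extract_features depends on featureList being populated.
with open('NaiveBayesClassifier.pickle', 'wb') as f:
    pickle.dump((featureList, NBClassifier), f)

with open('NaiveBayesClassifier.pickle', 'rb') as f:
    loadedFeatureList, loadedClassifier = pickle.load(f)

# The loaded copy should classify an identical feature dict the same way.
sample = extract_features(['some', 'example', 'tokens'])
print NBClassifier.classify(sample) == loadedClassifier.classify(sample)

Does that sound like a reasonable way to narrow this down, or is there something else about pickling NLTK classifiers that I am missing?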
