NLTK Naive Bayes Classifier Sentiment Incorrect Feature Selection

Question

I've been reusing the following code; however, in my 'Most informative features' output I get incorrectly labelled features. Do you think this is a data-encoding problem with my (self-made) corpus?

import csv
import nltk
from nltk.classify.util import apply_features
from nltk.corpus import stopwords
import math
import re
import sys
import os
import codecs
reload(sys)
sys.setdefaultencoding('utf-8')

customstopwords = ['show', 'they', 'them','He','She','We','i','are','this','the','so','to','me','for','and','was','in','as','about']

#Loads the sentiment files
p = open('Positivetweets50.txt', 'r')
postxt = p.readlines()

n = open('Negativetweets50.txt', 'r')
negtxt = n.readlines()

neglist = []
poslist = []

#creates a list of sentiment files with the same length of the sentiment tweet list.

for i in range(0,len(negtxt)):
    neglist.append('negative')


for i in range(0,len(postxt)):
    poslist.append('positive')

#creates a tuple list with sentiment tagged at the end of sentences.
postagged = zip(postxt, poslist)
negtagged = zip(negtxt, neglist)

#appends all the tagged tweets to a common list
taggedtweets = postagged + negtagged

print taggedtweets 

tweets = []

#creates a list of words with sentiments.
for (word, sentiment) in taggedtweets:
    word_filter = [i.lower() for i in word.split()]
    tweets.append((word_filter, sentiment))

#Pulls out all the words in a list of tagged tweets.
def getwords(tweets):
    """Flatten the word lists of all tagged tweets into one list.

    ``tweets`` is an iterable of ``(words, sentiment)`` pairs; the sentiment
    is ignored. Words are returned in encounter order, duplicates included.
    """
    return [token for (tokens, _label) in tweets for token in tokens]

#uses nltk library to order the list of tweets words pulled out by their frequency.
def getwordfeatures(listoftweets):
    """Return the keys of an NLTK ``FreqDist`` built from ``listoftweets``.

    ``listoftweets`` is a flat sequence of words; the result holds each
    distinct word once.

    NOTE(review): the surrounding comments expect the words to come back
    ordered by frequency; whether ``FreqDist.keys()`` guarantees that depends
    on the installed NLTK version -- confirm before relying on the order.
    """
    return nltk.FreqDist(listoftweets).keys()

#calls the baove functions to provide the list of words excluding the custom and stop words, ordered by frequency

print getwordfeatures(getwords(tweets))

wordlist = getwordfeatures(getwords(tweets))

def feature_extractor(doc):
    """Build the boolean feature dict for one tokenised document.

    For every word in the module-level ``wordlist`` the result maps the key
    ``'contains(<word>)'`` to ``True`` when that word occurs in ``doc`` and
    to ``False`` otherwise.
    """
    present = set(doc)  # set membership instead of scanning the token list
    return dict(('contains(%s)' % word, word in present) for word in wordlist)

#creates the training set to classify on the basis of distribution of true and false in the input.
training_set = nltk.classify.util.apply_features(feature_extractor, tweets)
classifier = nltk.NaiveBayesClassifier.train(training_set) 

print classifier.show_most_informative_features(n=1000)
print 'accuracy:', nltk.classify.util.accuracy(classifier, training_set)

The output:

Most Informative Features
           contains(tom) = True           negati : positi =      1.0 : 1.0
        contains(thrown) = True           negati : positi =      1.0 : 1.0
     contains("""joined) = True           negati : positi =      1.0 : 1.0
         contains(tokyo) = True           negati : positi =      1.0 : 1.0
 contains(@christophery) = True           negati : positi =      1.0 : 1.0
         contains(won't) = True           negati : positi =      1.0 : 1.0
contains("""@edisonneil) = True           negati : positi =      1.0 : 1.0
     contains(husband's) = True           negati : positi =      1.0 : 1.0
        contains(come!!) = True           negati : positi =      1.0 : 1.0
       contains(hair!!!) = True           negati : positi =      1.0 : 1.0
    contains(accountant) = True           negati : positi =      1.0 : 1.0
       contains(giggles) = True           negati : positi =      1.0 : 1.0
        contains(bigger) = True           negati : positi =      1.0 : 1.0
         contains(that?) = True           negati : positi =      1.0 : 1.0
        contains(they'd) = True           negati : positi =      1.0 : 1.0
 contains("""@jerinelay) = True           negati : positi =      1.0 : 1.0
      contains(launched) = True           negati : positi =      1.0 : 1.0
          contains(nina) = True           negati : positi =      1.0 : 1.0
           contains(htc) = True           negati : positi =      1.0 : 1.0
         contains(hmmmm) = True           negati : positi =      1.0 : 1.0
   contains("""@chele76) = True           negati : positi =      1.0 : 1.0
        contains(buying) = True           negati : positi =      1.0 : 1.0
       contains(teaches) = True           negati : positi =      1.0 : 1.0
        contains(heaven) = True           negati : positi =      1.0 : 1.0
          contains(old!) = True           negati : positi =      1.0 : 1.0
      contains(flipping) = True           negati : positi =      1.0 : 1.0
           contains(cal) = True           negati : positi =      1.0 : 1.0
     contains(roosevelt) = True           negati : positi =      1.0 : 1.0
           contains(wat) = True           negati : positi =      1.0 : 1.0
         contains(tribe) = True           negati : positi =      1.0 : 1.0
           contains(be!) = True           negati : positi =      1.0 : 1.0
    contains("""amazing) = True           negati : positi =      1.0 : 1.0
        contains(stairs) = True           negati : positi =      1.0 : 1.0
      contains(podcasts) = True           negati : positi =      1.0 : 1.0
         contains(pound) = True           negati : positi =      1.0 : 1.0
   contains(tomorrow...) = True           negati : positi =      1.0 : 1.0
       contains(months!) = True           negati : positi =      1.0 : 1.0
          contains(wana) = True           negati : positi =      1.0 : 1.0
        contains(impact) = True           negati : positi =      1.0 : 1.0
        contains(texted) = True           negati : positi =      1.0 : 1.0
       contains(vampire) = True           negati : positi =      1.0 : 1.0
contains("""@dionrodrigues) = True           negati : positi =      1.0 : 1.0
          contains(kind) = True           negati : positi =      1.0 : 1.0
       contains(sheesh.) = True           negati : positi =      1.0 : 1.0
     contains(pictures.) = True           negati : positi =      1.0 : 1.0
        contains(breeze) = True           negati : positi =      1.0 : 1.0
    contains(@amrosario) = True           negati : positi =      1.0 : 1.0
        contains(wells.) = True           negati : positi =      1.0 : 1.0
          contains(gave) = True           negati : positi =      1.0 : 1.0
         contains(soul.) = True           negati : positi =      1.0 : 1.0
          contains(addy) = True           negati : positi =      1.0 : 1.0
       contains(soooooo) = True           negati : positi =      1.0 : 1.0
        contains("""@j") = True           negati : positi =      1.0 : 1.0
           contains(coz) = True           negati : positi =      1.0 : 1.0
         contains(quick) = True           negati : positi =      1.0 : 1.0
          contains(did.) = True           negati : positi =      1.0 : 1.0
        contains(humor.) = True           negati : positi =      1.0 : 1.0
       contains(@b_club) = True           negati : positi =      1.0 : 1.0
contains("""@julieunplugged) = True           negati : positi =      1.0 : 1.0
          contains(fire) = True           negati : positi =      1.0 : 1.0
       contains(@angusi) = True           negati : positi =      1.0 : 1.0
          contains(bff.) = True           negati : positi =      1.0 : 1.0
         contains(page.) = True           negati : positi =      1.0 : 1.0
       contains(took""") = True           negati : positi =      1.0 : 1.0
      contains(returned) = True           negati : positi =      1.0 : 1.0
        contains(hello!) = True           negati : positi =      1.0 : 1.0
    contains(friday!!!!) = True           negati : positi =      1.0 : 1.0
     contains(creepy""") = True           negati : positi =      1.0 : 1.0
   contains(farewell""") = True           negati : positi =      1.0 : 1.0
     contains(awsome""") = True           negati : positi =      1.0 : 1.0
        contains(late..) = True           negati : positi =      1.0 : 1.0
   contains(@calmbanana) = True           negati : positi =      1.0 : 1.0
          contains(huge) = True           negati : positi =      1.0 : 1.0
        contains(window) = True           negati : positi =      1.0 : 1.0
      contains(complete) = True           negati : positi =      1.0 : 1.0
     contains(question?) = True           negati : positi =      1.0 : 1.0
       contains(from""") = True           negati : positi =      1.0 : 1.0
       contains("""baby) = True           negati : positi =      1.0 : 1.0
        contains(right.) = True           negati : positi =      1.0 : 1.0
     contains(delicious) = True           negati : positi =      1.0 : 1.0
     contains(unreal""") = True           negati : positi =      1.0 : 1.0
         contains(voted) = True           negati : positi =      1.0 : 1.0
        contains(@bk_ii) = True           negati : positi =      1.0 : 1.0
contains(@coolcatteacher) = True           negati : positi =      1.0 : 1.0
    contains(assessment) = True           negati : positi =      1.0 : 1.0
     contains(malaysian) = True           negati : positi =      1.0 : 1.0
     contains(french""") = True           negati : positi =      1.0 : 1.0
     contains(definitly) = True           negati : positi =      1.0 : 1.0
    contains("""@tvorse) = True           negati : positi =      1.0 : 1.0
  contains(m&amp""""""") = True           negati : positi =      1.0 : 1.0
contains("""@lewisstanson) = True           negati : positi =      1.0 : 1.0
       contains(warm""") = True           negati : positi =      1.0 : 1.0
   contains(@chrishealy) = True           negati : positi =      1.0 : 1.0
        contains(@_dznr) = True           negati : positi =      1.0 : 1.0
  contains(@awesomekong) = True           negati : positi =      1.0 : 1.0
        contains(broken) = True           negati : positi =      1.0 : 1.0
          contains(get!) = True           negati : positi =      1.0 : 1.0
          contains(some) = True           negati : positi =      1.0 : 1.0
       contains(friends) = True           negati : positi =      1.0 : 1.0
       contains(ipod""") = True           negati : positi =      1.0 : 1.0
contains("""@jlsofficial) = True           negati : positi =      1.0 : 1.0
       contains(@dayngr) = True           negati : positi =      1.0 : 1.0
     contains("""headed) = True           negati : positi =      1.0 : 1.0
           contains(:-p) = True           negati : positi =      1.0 : 1.0
None
accuracy: 1.0

Corpus: https://www.dropbox.com/s/rh2bykig7eh1zq6/Positivetweets50.txt?dl=0 and https://www.dropbox.com/s/bvy2libmen57n25/Negativetweets50.txt?dl=0

Any help would be appreciated.

possible duplicate of [nltk NaiveBayesClassifier training for sentiment analysis](http://stackoverflow.com/questions/20827741/nltk-naivebayesclassifier-training-for-sentiment-analysis) — alvas, Sep 29 '14 at 13:34

score 0 · Answer 1 · answered Sep 29 '14 at 13:33

try:

from nltk import NaiveBayesClassifier as nbc
from nltk.tokenize import word_tokenize
from itertools import chain

training_data = [('I love this sandwich.', 'pos'),
('This is an amazing place!', 'pos'),
('I feel very good about these beers.', 'pos'),
('This is my best work.', 'pos'),
("What an awesome view", 'pos'),
('I do not like this restaurant', 'neg'),
('I am tired of this stuff.', 'neg'),
("I can't deal with this", 'neg'),
('He is my sworn enemy!', 'neg'),
('My boss is horrible.', 'neg')]

vocabulary = set(chain(*[word_tokenize(i[0].lower()) for i in training_data]))

feature_set = [({i:(i in word_tokenize(sentence.lower())) for i in vocabulary},tag) for sentence, tag in training_data]

classifier = nbc.train(feature_set)

test_sentence = "This is the best band I've ever heard!"
featurized_test_sentence =  {i:(i in word_tokenize(test_sentence.lower())) for i in vocabulary}

print "test_sent:",test_sentence
print "tag:",classifier.classify(featurized_test_sentence)

NLTK Naive Bayes Classifier Sentiment Incorrect Feature Selection

1 Answer