Ive been reusing the following code, however for my output 'Most informative features' I get incorrectly labelled features. Do you think this is a data-encoding problem from my (self-made) corpus?
import csv
import nltk
from nltk.classify.util import apply_features
from nltk.corpus import stopwords
import math
import re
import sys
import os
import codecs
reload(sys)
sys.setdefaultencoding('utf-8')
customstopwords = ['show', 'they', 'them','He','She','We','i','are','this','the','so','to','me','for','and','was','in','as','about']
#Loads the sentiment files
p = open('Positivetweets50.txt', 'r')
postxt = p.readlines()
n = open('Negativetweets50.txt', 'r')
negtxt = n.readlines()
neglist = []
poslist = []
#creates a list of sentiment files with the same length of the sentiment tweet list.
for i in range(0,len(negtxt)):
neglist.append('negative')
for i in range(0,len(postxt)):
poslist.append('positive')
#creates a tuple list with sentiment tagged at the end of sentences.
postagged = zip(postxt, poslist)
negtagged = zip(negtxt, neglist)
#appends all the tagged tweets to a common list
taggedtweets = postagged + negtagged
print taggedtweets
tweets = []
#creates a list of words with sentiments.
for (word, sentiment) in taggedtweets:
word_filter = [i.lower() for i in word.split()]
tweets.append((word_filter, sentiment))
#Pulls out all the words in a list of tagged tweets.
def getwords(tweets):
allwords = []
for (words, sentiment) in tweets:
allwords.extend(words)
return allwords
#uses nltk library to order the list of tweets words pulled out by their frequency.
def getwordfeatures(listoftweets):
wordfreq = nltk.FreqDist(listoftweets)
words = wordfreq.keys()
return words
#calls the baove functions to provide the list of words excluding the custom and stop words, ordered by frequency
print getwordfeatures(getwords(tweets))
wordlist = getwordfeatures(getwords(tweets))
def feature_extractor(doc):
docwords = set(doc)
features = {}
for i in wordlist:
features['contains(%s)' % i] = (i in docwords)
return features
#creates the training set to classify on the basis of distribution of true and false in the input.
training_set = nltk.classify.util.apply_features(feature_extractor, tweets)
classifier = nltk.NaiveBayesClassifier.train(training_set)
print classifier.show_most_informative_features(n=1000)
print 'accuracy:', nltk.classify.util.accuracy(classifier, training_set)
The output:
Most Informative Features
contains(tom) = True negati : positi = 1.0 : 1.0
contains(thrown) = True negati : positi = 1.0 : 1.0
contains("""joined) = True negati : positi = 1.0 : 1.0
contains(tokyo) = True negati : positi = 1.0 : 1.0
contains(@christophery) = True negati : positi = 1.0 : 1.0
contains(won't) = True negati : positi = 1.0 : 1.0
contains("""@edisonneil) = True negati : positi = 1.0 : 1.0
contains(husband's) = True negati : positi = 1.0 : 1.0
contains(come!!) = True negati : positi = 1.0 : 1.0
contains(hair!!!) = True negati : positi = 1.0 : 1.0
contains(accountant) = True negati : positi = 1.0 : 1.0
contains(giggles) = True negati : positi = 1.0 : 1.0
contains(bigger) = True negati : positi = 1.0 : 1.0
contains(that?) = True negati : positi = 1.0 : 1.0
contains(they'd) = True negati : positi = 1.0 : 1.0
contains("""@jerinelay) = True negati : positi = 1.0 : 1.0
contains(launched) = True negati : positi = 1.0 : 1.0
contains(nina) = True negati : positi = 1.0 : 1.0
contains(htc) = True negati : positi = 1.0 : 1.0
contains(hmmmm) = True negati : positi = 1.0 : 1.0
contains("""@chele76) = True negati : positi = 1.0 : 1.0
contains(buying) = True negati : positi = 1.0 : 1.0
contains(teaches) = True negati : positi = 1.0 : 1.0
contains(heaven) = True negati : positi = 1.0 : 1.0
contains(old!) = True negati : positi = 1.0 : 1.0
contains(flipping) = True negati : positi = 1.0 : 1.0
contains(cal) = True negati : positi = 1.0 : 1.0
contains(roosevelt) = True negati : positi = 1.0 : 1.0
contains(wat) = True negati : positi = 1.0 : 1.0
contains(tribe) = True negati : positi = 1.0 : 1.0
contains(be!) = True negati : positi = 1.0 : 1.0
contains("""amazing) = True negati : positi = 1.0 : 1.0
contains(stairs) = True negati : positi = 1.0 : 1.0
contains(podcasts) = True negati : positi = 1.0 : 1.0
contains(pound) = True negati : positi = 1.0 : 1.0
contains(tomorrow...) = True negati : positi = 1.0 : 1.0
contains(months!) = True negati : positi = 1.0 : 1.0
contains(wana) = True negati : positi = 1.0 : 1.0
contains(impact) = True negati : positi = 1.0 : 1.0
contains(texted) = True negati : positi = 1.0 : 1.0
contains(vampire) = True negati : positi = 1.0 : 1.0
contains("""@dionrodrigues) = True negati : positi = 1.0 : 1.0
contains(kind) = True negati : positi = 1.0 : 1.0
contains(sheesh.) = True negati : positi = 1.0 : 1.0
contains(pictures.) = True negati : positi = 1.0 : 1.0
contains(breeze) = True negati : positi = 1.0 : 1.0
contains(@amrosario) = True negati : positi = 1.0 : 1.0
contains(wells.) = True negati : positi = 1.0 : 1.0
contains(gave) = True negati : positi = 1.0 : 1.0
contains(soul.) = True negati : positi = 1.0 : 1.0
contains(addy) = True negati : positi = 1.0 : 1.0
contains(soooooo) = True negati : positi = 1.0 : 1.0
contains("""@j") = True negati : positi = 1.0 : 1.0
contains(coz) = True negati : positi = 1.0 : 1.0
contains(quick) = True negati : positi = 1.0 : 1.0
contains(did.) = True negati : positi = 1.0 : 1.0
contains(humor.) = True negati : positi = 1.0 : 1.0
contains(@b_club) = True negati : positi = 1.0 : 1.0
contains("""@julieunplugged) = True negati : positi = 1.0 : 1.0
contains(fire) = True negati : positi = 1.0 : 1.0
contains(@angusi) = True negati : positi = 1.0 : 1.0
contains(bff.) = True negati : positi = 1.0 : 1.0
contains(page.) = True negati : positi = 1.0 : 1.0
contains(took""") = True negati : positi = 1.0 : 1.0
contains(returned) = True negati : positi = 1.0 : 1.0
contains(hello!) = True negati : positi = 1.0 : 1.0
contains(friday!!!!) = True negati : positi = 1.0 : 1.0
contains(creepy""") = True negati : positi = 1.0 : 1.0
contains(farewell""") = True negati : positi = 1.0 : 1.0
contains(awsome""") = True negati : positi = 1.0 : 1.0
contains(late..) = True negati : positi = 1.0 : 1.0
contains(@calmbanana) = True negati : positi = 1.0 : 1.0
contains(huge) = True negati : positi = 1.0 : 1.0
contains(window) = True negati : positi = 1.0 : 1.0
contains(complete) = True negati : positi = 1.0 : 1.0
contains(question?) = True negati : positi = 1.0 : 1.0
contains(from""") = True negati : positi = 1.0 : 1.0
contains("""baby) = True negati : positi = 1.0 : 1.0
contains(right.) = True negati : positi = 1.0 : 1.0
contains(delicious) = True negati : positi = 1.0 : 1.0
contains(unreal""") = True negati : positi = 1.0 : 1.0
contains(voted) = True negati : positi = 1.0 : 1.0
contains(@bk_ii) = True negati : positi = 1.0 : 1.0
contains(@coolcatteacher) = True negati : positi = 1.0 : 1.0
contains(assessment) = True negati : positi = 1.0 : 1.0
contains(malaysian) = True negati : positi = 1.0 : 1.0
contains(french""") = True negati : positi = 1.0 : 1.0
contains(definitly) = True negati : positi = 1.0 : 1.0
contains("""@tvorse) = True negati : positi = 1.0 : 1.0
contains(m&""""""") = True negati : positi = 1.0 : 1.0
contains("""@lewisstanson) = True negati : positi = 1.0 : 1.0
contains(warm""") = True negati : positi = 1.0 : 1.0
contains(@chrishealy) = True negati : positi = 1.0 : 1.0
contains(@_dznr) = True negati : positi = 1.0 : 1.0
contains(@awesomekong) = True negati : positi = 1.0 : 1.0
contains(broken) = True negati : positi = 1.0 : 1.0
contains(get!) = True negati : positi = 1.0 : 1.0
contains(some) = True negati : positi = 1.0 : 1.0
contains(friends) = True negati : positi = 1.0 : 1.0
contains(ipod""") = True negati : positi = 1.0 : 1.0
contains("""@jlsofficial) = True negati : positi = 1.0 : 1.0
contains(@dayngr) = True negati : positi = 1.0 : 1.0
contains("""headed) = True negati : positi = 1.0 : 1.0
contains(:-p) = True negati : positi = 1.0 : 1.0
None
accuracy: 1.0
Corpus: https://www.dropbox.com/s/rh2bykig7eh1zq6/Positivetweets50.txt?dl=0 and https://www.dropbox.com/s/bvy2libmen57n25/Negativetweets50.txt?dl=0
Any help would be appreciated.