Labeling Tweets from training data in Python

Question

I am trying to label the tweets into positive or negative using nltk in python. i have 3 files "train_posi_tweets.txt" contains 4000 positive tweets "train_nega_tweets.txt" which contains 8000 negative tweets and 'unlabeled_tweetss.txt' contains 51647 tweets which i need to label...one more thing tweets are in spanish

with reference to victorneo on GitHub i have this code now but it's not working can anyone help me with this???i get an error at this line "for (words, sentiment) in pos_tweets + neg_tweets : too many values to unpack exception"

# -*- coding: utf-8 -*-
"""
Created on Fri May 16 16:34:46 2014

@author: shyam
"""
import nltk
import json
from nltk.classify.naivebayes import NaiveBayesClassifier
import re


def get_words_in_tweets(tweets):
    all_words = []
    for (words, sentiment) in tweets:
      all_words.extend(words)
    return all_words


def get_word_features(wordlist):
    wordlist = nltk.FreqDist(wordlist)
    word_features = wordlist.keys()
    return word_features


def read_tweets(fname, t_type):
    tweets = []
    f = open(fname, 'r')
    for line in f.readlines():
        tweet = json.loads(line)
        text = tweet['text'].strip().encode('ascii', errors='ignore')
        text = re.sub(r"\n", " ", text) # remove newlines from text
        tweets.append(text)
    f.close()
    return tweets

def extract_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
      features['contains(%s)' % word] = (word in document_words)
    return features


def classify_tweet(tweet):
    return \
        classifier.classify(extract_features(nltk.word_tokenize(tweet)))


# read in postive and negative training tweets
pos_tweets = read_tweets('train_posi_tweets.txt', 'positive')
neg_tweets = read_tweets('train_nega_tweets.txt', 'negative')

# filter away words that are less than 3 letters to form the training data

tweets = []
for (words, sentiment) in pos_tweets + neg_tweets:
    words_filtered = [e.lower() for e in words.split() if len(e) >= 3]
    tweets.append((words_filtered, sentiment))


# extract the word features out from the training data
word_features = get_word_features(\
                    get_words_in_tweets(tweets))


# get the training set and train the Naive Bayes Classifier
training_set = nltk.classify.util.apply_features(extract_features, tweets)
classifier = NaiveBayesClassifier.train(training_set)


# read in the test tweets and check accuracy
# to add your own test tweets, add them in the respective files
test_tweets = read_tweets('unlabeled_tweetss.txt', 'unlabled')
total = accuracy = float(len(test_tweets))

for tweet in test_tweets:
    if classify_tweet(tweet[0]) != tweet[1]:
        accuracy -= 1

print('Total accuracy: %f%% (%d/20).' % (accuracy / total * 100, accuracy))

I get an error at this line "for (words, sentiment) in pos_tweets + neg_tweets : too many values to unpack exception" — Asav Patel, May 17 '14 at 14:15
in your `read_tweets` function, you are passing it a file `fname` as well as `t_type`. `t_type` doesn't appear anywhere in your function. try `return tweets, t_type` — o-90, May 18 '14 at 04:32

Labeling Tweets from training data in Python

0 Answers0