I'm doing classification analysis using NLTK's Naive Bayes classifier. I insert a tsv file containing records and labels.
But the file doesn't get trained due to an error. Here's my python code
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
dataset = pd.read_csv('tweets.txt', delimiter ='\t', quoting = 3)
dataset.isnull().any()
dataset = dataset.fillna(method='ffill')
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []
for i in range(0,16004):
tweet = re.sub('[^a-zA-Z]', ' ', dataset['tweet'][i])
tweet = tweet.lower()
tweet = tweet.split()
ps = PorterStemmer()
tweet = [ps.stem(word) for word in tweet if not word in
set(stopwords.words('english'))]
tweet = ' '.join(tweet)
corpus.append(tweet)
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 10000)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, 1].values
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20,
random_state = 0)
train_set, test_set = X_train[500:], y_train[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)
The error is:
File "C:\Users\HSR\Anaconda2\lib\site-packages\nltk\classify\naivebayes.py", line 194, in train
for featureset, label in labeled_featuresets:
ValueError: too many values to unpack