I am using LDA over a simple collection of documents. my goal is to extract topics, then use the extracted topics as features to evaluate my model.
I decided to use multinomial SVM as the evaluater. not sure its good or not?
import itertools
from gensim.models import ldamodel
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
from sklearn.naive_bayes import MultinomialNB
tokenizer = RegexpTokenizer(r'\w+')
# create English stop words list
en_stop = {'a'}
# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()
# create sample documents
doc_a = "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother."
doc_b = "My mother spends a lot of time driving my brother around to baseball practice."
doc_c = "Some health experts suggest that driving may cause increased tension and blood pressure."
doc_d = "I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better."
doc_e = "Health professionals say that brocolli is good for your health."
# compile sample documents into a list
doc_set = [doc_a, doc_b, doc_c, doc_d, doc_e]
# list for tokenized documents in loop
texts = []
# loop through document list
for i in doc_set:
# clean and tokenize document string
raw = i.lower()
tokens = tokenizer.tokenize(raw)
# remove stop words from tokens
stopped_tokens = [i for i in tokens if not i in en_stop]
# stem tokens
stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
# add tokens to list
texts.append(stemmed_tokens)
# turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.Dictionary(texts)
# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in texts]
# generate LDA model
#ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=2, id2word=dictionary, passes=20)
id2word = corpora.Dictionary(texts)
# Creates the Bag of Word corpus.
mm = [id2word.doc2bow(text) for text in texts]
# Trains the LDA models.
lda = ldamodel.LdaModel(corpus=mm, id2word=id2word, num_topics=4,
update_every=1, chunksize=10000, passes=1)
# Assigns the topics to the documents in corpus
a=[]
lda_corpus = lda[mm]
for i in range(len(doc_set)):
a.append(lda_corpus[i])
print(lda_corpus[i])
merged_list = list(itertools.chain(*lda_corpus))
print(a)
#my_list.append(my_list[i])
sv=MultinomialNB()
yvalues = [0,1,2,3]
sv.fit(a,yvalues)
predictclass = sv.predict(a)
testLables=[0,1,2,3]
from sklearn import metrics, tree
#yacc=metrics.accuracy_score(testLables,predictclass)
#print (yacc)
when I run this code it throws the error mentioned in the subject.
Also this is the output of LDA model(topic doc distribution) that I feed to SVM:
[[(0, 0.95533888404477663), (1, 0.014775921798986477), (2, 0.015161897773308793), (3, 0.014723296382928375)], [(0, 0.019079556242721694), (1, 0.017932434792585779), (2, 0.94498655991579728), (3, 0.018001449048895311)], [(0, 0.017957955483631164), (1, 0.017900184473362918), (2, 0.018133572636989413), (3, 0.9460082874060165)], [(0, 0.96554611572184923), (1, 0.011407838337200715), (2, 0.011537900721487016), (3, 0.011508145219463113)], [(0, 0.023306931039431281), (1, 0.022823706054846005), (2, 0.93072240824085961), (3, 0.023146954664863096)]]
My labels here are 0,1,2,3 .
I found a response here
but when I write down :
nsamples, nx, ny = a.shape
d2_train_dataset = a.reshape((nsamples,nx*ny))
According to my case, it does not work. actually a does not have shape method.
whole traceback error
Traceback (most recent call last):
File "/home/saria/PycharmProjects/TfidfLDA/test3.py", line 87, in <module>
sv.fit(a,yvalues)
File "/home/saria/tfwithpython3.6/lib/python3.5/site-packages/sklearn/naive_bayes.py", line 562, in fit
X, y = check_X_y(X, y, 'csr')
File "/home/saria/tfwithpython3.6/lib/python3.5/site-packages/sklearn/utils/validation.py", line 521, in check_X_y
ensure_min_features, warn_on_dtype, estimator)
File "/home/saria/tfwithpython3.6/lib/python3.5/site-packages/sklearn/utils/validation.py", line 405, in check_array
% (array.ndim, estimator_name))
ValueError: Found array with dim 3. Estimator expected <= 2.