
I am conducting topic modelling for a project of mine and have run into trouble visualizing the results. I think the procedure is right. Specifically, when I run these lines

vis = pyLDAvis.sklearn.prepare(bi_lda, bigram_vectorized, bivectorizer, mds='tsne')
pyLDAvis.show(vis)

I get this error:

AttributeError: 'CountVectorizer' object has no attribute 'get_feature_names'

I find this strange and cannot figure it out, because the preceding steps run fine and I am able to create an LDA model.

I created the model as follows:

import numpy as np
import pandas as pd
from tqdm import tqdm
import string
import matplotlib.pyplot as plt
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
import concurrent.futures
import time
import pyLDAvis.sklearn
from pylab import bone, pcolor, colorbar, plot, show, rcParams, savefig
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
import os
#print(os.listdir("../input"))

# Plotly based imports for visualization
import chart_studio.plotly as py


from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.figure_factory as ff


# spaCy based imports
import spacy
from spacy.lang.it.stop_words import STOP_WORDS
!python -m spacy download it_core_news_sm
nlp = spacy.load("it_core_news_sm")  # Italian model, used by spacy_tokenizer below

Procedure:

# Create a custom stopword list
custom_stop_words = []

# Add spaCy's built-in Italian stop words to the list
custom_stop_words.extend(STOP_WORDS)

# Punctuation characters to strip ("string" was imported above)
punctuations = string.punctuation

def spacy_tokenizer(sentence):
    # Use the Italian model to tokenize the sentence
    mytokens = nlp(sentence)

    # Lemmatize, lowercase, and strip each token ("-PRON-" is spaCy's pronoun lemma)
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]
    # Drop stop words and punctuation
    mytokens = [ word for word in mytokens if word not in custom_stop_words and word not in punctuations ]
    # Re-join the remaining tokens into a single preprocessed string
    mytokens = " ".join(mytokens)

    return mytokens

tqdm.pandas()
df["processed_description"] = df["content"].progress_apply(spacy_tokenizer)

# Creating a vectorizer
vectorizer = CountVectorizer(min_df=5, max_df=0.9, stop_words=custom_stop_words, lowercase=True, token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}')
data_vectorized = vectorizer.fit_transform(df["processed_description"])
# Latent Dirichlet Allocation Model
NUM_TOPICS = 10
lda = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, learning_method='online',verbose=True)
data_lda = lda.fit_transform(data_vectorized)
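
As a sanity check that the model itself is fine, the topics can be inspected directly with the newer scikit-learn API (a minimal sketch; top_n is an arbitrary choice):

# Print the top words per topic; get_feature_names_out() is the
# replacement for the removed get_feature_names()
feature_names = vectorizer.get_feature_names_out()
top_n = 10
for topic_idx, topic in enumerate(lda.components_):
    top_words = [feature_names[i] for i in topic.argsort()[::-1][:top_n]]
    print(f"Topic {topic_idx}: {', '.join(top_words)}")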

The problem I encounter is here:

pyLDAvis.enable_notebook()
dash = pyLDAvis.sklearn.prepare(lda, data_vectorized, vectorizer, mds='tsne')
dash

The output is always AttributeError: 'CountVectorizer' object has no attribute 'get_feature_names'. I also tried updating the library, but that did not work.
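
To see which versions are actually loaded in the notebook, a quick check looks something like this (a minimal sketch):

import sklearn
import pyLDAvis
# get_feature_names was removed in scikit-learn 1.2, so the versions matter here
print(sklearn.__version__)
print(pyLDAvis.__version__)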

However, if I plot it like this, it works:

svd_2d = TruncatedSVD(n_components=2)
data_2d = svd_2d.fit_transform(data_vectorized)
trace = go.Scattergl(
    x = data_2d[:,0],
    y = data_2d[:,1],
    mode = 'markers',
    marker = dict(
        color = '#FFBAD2',
        line = dict(width = 1)
    ),
    text = vectorizer.get_feature_names_out(),
    hovertext = vectorizer.get_feature_names_out(),
    hoverinfo = 'text' 
)
data = [trace]
iplot(data, filename='scatter-mode')

2 Answers


Fixed in the latest version, here: https://github.com/bmabey/pyLDAvis/pull/235
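
Upgrading from inside the notebook would look something like this (assuming pip; the exact command depends on your environment):

# Upgrade pyLDAvis to a release that includes the fix
!pip install --upgrade pyLDAvis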

Pietro

This error occurs when using a newer version of scikit-learn (>= 1.2), which removed CountVectorizer.get_feature_names in favor of get_feature_names_out. To fix this issue, simply replace any logic involving

import pyLDAvis.sklearn
...
pyLDAvis.sklearn.prepare

with

import pyLDAvis.lda_model
...
pyLDAvis.lda_model.prepare

This should fix the issue.
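
Applied to the code in the question, that would look roughly like this (a sketch reusing the question's variable names):

import pyLDAvis
import pyLDAvis.lda_model  # replaces pyLDAvis.sklearn in newer pyLDAvis releases

pyLDAvis.enable_notebook()
dash = pyLDAvis.lda_model.prepare(lda, data_vectorized, vectorizer, mds='tsne')
dash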

MNA