I have trained an LDA model on a corpus, and I would like to get, for each sentence, the topic it corresponds to, in order to compare what the algorithm finds with the labels I have.
I have tried the code below, but the results are quite bad: I find a great deal of topic 17 (maybe 25% of the volume, when it should be closer to 5%).
Thanks for your help
# Compare the dominant LDA topic of each document with its human label.
#
# texts_lemmatized: the lemmatized corpus. gensim's Dictionary expects an
# iterable of token lists (one list of str per document) -- TODO confirm
# that this is the shape used here.
dico = Dictionary(texts_lemmatized)
corpus_lda = [dico.doc2bow(text) for text in texts_lemmatized]
# Passing id2word lets the model report topics as words instead of integer ids.
lda_ = LdaModel(corpus_lda, id2word=dico, num_topics=18)

# theme_commentaire[i] is the human label of the i-th document.
data = []
for i, human in enumerate(theme_commentaire):
    # get_document_topics() returns (topic_id, probability) pairs for one document.
    topic_probs = lda_.get_document_topics(corpus_lda[i])
    # Rank the pairs by probability. A bare max() compares the tuples
    # lexicographically, i.e. by topic_id first, and therefore always picks
    # the highest topic id present (topic 17 here) rather than the most
    # probable topic.
    algo = max(topic_probs, key=lambda pair: pair[1])[0]
    data.append([str(algo), human])

cols = ['algo', 'human']
df_ = pd.DataFrame(data, columns=cols)
df_.head()