Thanks to Anurag Wagh's advice, I figured it out.
I used this tutorial about gensim and how to use it in many ways.
Chapter 18 does what I was asking for, but during my tests I found a better way to achieve my goal.
Chapter 11 shows how to build an LDA model and how to extract a list of the main topics from a set of documents.
Here is the code I used to build the LDA model:
# Step 0: Import packages and stopwords
from gensim.models import LdaModel, LdaMulticore
import gensim.downloader as api
from gensim.utils import simple_preprocess, lemmatize
from nltk.corpus import stopwords
from gensim import corpora
import re
import nltk
import string
import pattern
import logging
# Route log records to the console and let INFO-level messages through.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
logging.root.setLevel(level=logging.INFO)

# Load the corpus: every line of the file is one document (the trailing
# newline of each line is kept). The context manager closes the file handle
# as soon as the lines have been read instead of leaving it open.
with open('file.txt', encoding='utf-8') as corpus_file:
    docs = list(corpus_file)
import nltk
import string
import pattern
# Italian stop-words shipped with NLTK, extended with project-specific ones.
# Fill `custom_stop_words` with the domain terms that should be ignored; the
# original snippet only had a non-executable placeholder at this point.
custom_stop_words = []
it_stop_words = nltk.corpus.stopwords.words('italian') + custom_stop_words

# Snowball stemmer with rules for the Italian language
ita_stemmer = nltk.stem.snowball.ItalianStemmer()
def lemmatize_word(input_word):
    """Return the lemma of a single Italian word via ``pattern.it.parse``.

    Args:
        input_word: The word to lemmatize; it is handed to the parser as-is.

    Returns:
        The item found at ``[0][0][4]`` of the split parser output.
    """
    # A bare ``import pattern`` is not guaranteed to load the Italian
    # sub-package, so import it explicitly before touching ``pattern.it``.
    import pattern.it

    parsed = pattern.it.parse(
        input_word,
        tokenize=False,
        tag=False,
        chunk=False,
        lemmata=True,
    )
    # NOTE(review): presumably first sentence -> first token -> lemma field;
    # this depends on pattern's tag layout - confirm against its docs.
    return parsed.split()[0][0][4]
# Step 2: Prepare data (tokenize, lowercase, drop punctuation and stop-words).
# NOTE: no lemmatization or stemming is applied in this step.

# A set gives O(1) membership tests instead of scanning the list per token.
stop_word_set = set(it_stop_words)

data_processed = []
for doc in docs:
    doc_tokens = []
    for raw_token in nltk.tokenize.word_tokenize(doc):
        # Same punctuation test as before: drop tokens found in
        # string.punctuation.
        if raw_token in string.punctuation:
            continue
        lowered = raw_token.lower()
        if lowered in stop_word_set:
            continue
        # Split on apostrophes (e.g. "dell'acqua" -> "dell", "acqua") and
        # filter the pieces as well: this removes the empty strings the split
        # can produce and the stop-words that only become visible after it.
        for piece in lowered.split("'"):
            if piece and piece not in stop_word_set:
                doc_tokens.append(piece)
    data_processed.append(doc_tokens)
# Build the token <-> id dictionary from the preprocessed documents, then
# convert every document to its bag-of-words representation.
dct = corpora.Dictionary(data_processed)
corpus = [dct.doc2bow(tokens) for tokens in data_processed]

# Train the LDA model; keyword arguments are grouped by concern.
lda_model = LdaMulticore(
    # data
    corpus=corpus,
    id2word=dct,
    # model size and reproducibility
    num_topics=7,
    random_state=100,
    # priors
    alpha='asymmetric',
    eta=None,
    # training schedule
    passes=10,
    chunksize=1000,
    batch=False,
    decay=0.5,
    offset=64,
    eval_every=0,
    # per-document inference
    iterations=100,
    gamma_threshold=0.001,
    # output options
    per_word_topics=True,
)
# Persist the trained model to disk
lda_model.save('lda_model.model')
# Show the topics; -1 is passed as the number of topics to request
lda_model.print_topics(num_topics=-1)
With the model trained, I can get a list of topics for each new non-conformity and detect whether it is related to something already reported in other non-conformities.