I use gensim to build dictionary from a collection of documents. Each document is a list of tokens. this my code
def constructModel(self, docTokens):
""" Given document tokens, constructs the tf-idf and similarity models"""
#construct dictionary for the BOW (vector-space) model : Dictionary = a mapping between words and their integer ids = collection of (word_index,word_string) pairs
#print "dictionary"
self.dictionary = corpora.Dictionary(docTokens)
# prune dictionary: remove words that appear too infrequently or too frequently
print "dictionary size before filter_extremes:",self.dictionary#len(self.dictionary.values())
#self.dictionary.filter_extremes(no_below=1, no_above=0.9, keep_n=100000)
#self.dictionary.compactify()
print "dictionary size after filter_extremes:",self.dictionary
#construct the corpus bow vectors; bow vector = collection of (word_id,word_frequency) pairs
corpus_bow = [self.dictionary.doc2bow(doc) for doc in docTokens]
#construct the tf-idf model
self.model = models.TfidfModel(corpus_bow,normalize=True)
corpus_tfidf = self.model[corpus_bow] # first transform each raw bow vector in the corpus to the tfidf model's vector space
self.similarityModel = similarities.MatrixSimilarity(corpus_tfidf) # construct the term-document index
my question is how to add a new doc (tokens) to this dictionary and update it. I searched in gensim documents but I didn't find a solution