I'm using a cluster to generate word2vec models with gensim from sentences drawn from medical journals, stored in JSON files, and I'm struggling with memory usage being too large.
The task is to keep a cumulative list of all sentences up to a particular year and generate a word2vec model for that year. Then the next year's sentences are added to the cumulative list, and another model is generated and saved for that year based on all the sentences so far.
The I/O on this particular cluster is slow enough, and the data large enough (reading about two-thirds of it into memory takes roughly three days), that streaming each JSON file from disk for every year's model would have taken forever. The solution was therefore to load all 90 GB of JSON into memory in a Python list. I have permission to use up to 256 GB of memory for this, but could get more if necessary.
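For reference, the streaming alternative I ruled out would look roughly like this (just a sketch; the class name is hypothetical, and it re-reads every file from disk on each pass, which is exactly what is too slow here):

import ujson as json

class YearSentences(object):
    """Hypothetical streaming corpus: yields one tokenised sentence at a
    time, re-reading every JSON file from disk on each iteration."""
    def __init__(self, json_dir, json_files):
        self.json_dir = json_dir
        self.json_files = json_files

    def __iter__(self):
        for json_file in self.json_files:
            with open(self.json_dir + json_file, 'rb') as f:
                for sentence in json.load(f)['sentences']:
                    yield sentence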
The trouble I'm having is that I'm running out of memory anyway. I have read some other posts about the way Python implements free lists and how freed memory is not returned to the OS, and I think that may be part of the problem, but I am not sure.
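A minimal way to check whether memory actually gets returned to the OS (a sketch; I'm assuming psutil is available, but any RSS probe would do):

import gc
import psutil  # assumption: psutil is installed

proc = psutil.Process()
print('baseline RSS: %d MB' % (proc.memory_info().rss // 2 ** 20))

# Allocate a few million small objects, then free them all
data = [[str(i)] for i in range(5 * 10 ** 6)]
print('after alloc: %d MB' % (proc.memory_info().rss // 2 ** 20))

del data
gc.collect()
# If the interpreter is holding on to free lists / allocator arenas,
# RSS stays well above the baseline even though the objects are gone.
print('after free: %d MB' % (proc.memory_info().rss // 2 ** 20))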
Thinking that the free lists might be the problem, and that numpy might handle a very large number of elements more efficiently, I changed from a cumulative list of sentences to a cumulative numpy array of sentences (gensim requires that sentences be lists of words/strings). But when I ran this on a small subset of the sentences it used slightly more memory, so I am unsure of how to proceed.
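A quick sketch of what I think is happening there (illustrative toy data, not my actual corpus): because the sentences are ragged lists, numpy can only build a dtype=object array, which stores one pointer per sentence to the very same Python list objects, so the array is pure overhead on top of the lists:

import numpy as np

sentences = [['a', 'b'], ['c', 'd', 'e'], ['f']] * 100000
arr = np.array(sentences, dtype=object)

print(arr.dtype)               # object
print(arr[0] is sentences[0])  # True: same underlying list objects
print(arr.nbytes)              # just 8 bytes per pointer on a 64-bit build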
If anyone has experience with this, I would be very happy to have your help. I would also appreciate hearing about anything else that could be improved. The full code is below:
import ujson as json
import os
import sys
import logging
from gensim.models import word2vec
import numpy as np

PARAMETERS = {
    'numfeatures': 250,
    'minwordcount': 10,
    'context': 7,
    'downsampling': 0.0001,
    'workers': 32
}

logger = logging.getLogger()
handler = logging.StreamHandler()
formatter = logging.Formatter('%(asctime)s %(levelname)-8s %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)
logger.setLevel(logging.INFO)

def generate_and_save_model(cumulative_sentences, models_dir, year):
    """
    Generates and saves the word2vec model for the given year
    :param cumulative_sentences: The list of all sentences up to the current year
    :param models_dir: The directory to save the models to
    :param year: The current year of interest
    :return: Nothing, only saves the model to disk
    """
    cumulative_model = word2vec.Word2Vec(
        sentences=cumulative_sentences,
        workers=PARAMETERS['workers'],
        size=PARAMETERS['numfeatures'],
        min_count=PARAMETERS['minwordcount'],
        window=PARAMETERS['context'],
        sample=PARAMETERS['downsampling']
    )
    # Normalise the vectors in place; saves memory, but the model
    # can no longer be trained further
    cumulative_model.init_sims(replace=True)
    cumulative_model.save(models_dir + 'medline_abstract_word2vec_' + year)

def save_year_models(json_list, json_dir, models_dir, min_year, max_year):
    """
    Goes year by year through each json of sentences, saving a cumulative word2vec
    model for each year
    :param json_list: The list of json year_sentences file names
    :param json_dir: The directory holding the sentences json files
    :param models_dir: The directory to serialize the models to
    :param min_year: The minimum value of a year to generate a model for
    :param max_year: The maximum value of a year to generate a model for
    """
    cumulative_sentences = np.array([])
    for json_file in json_list:
        # The year is encoded at a fixed position in the file name
        year = json_file[16:20]
        # If this year is greater than the maximum, we're done creating models
        if int(year) > max_year:
            break
        with open(json_dir + json_file, 'rb') as current_year_file:
            # Prepend this year's sentences to the cumulative array
            cumulative_sentences = np.concatenate(
                (np.array(json.load(current_year_file)['sentences']),
                 cumulative_sentences)
            )
        logger.info('COMPLETE: ' + year + ' sentences loaded')
        logger.info('Cumulative length: ' + str(len(cumulative_sentences)) + ' sentences loaded')
        sys.stdout.flush()
        # If this year is less than our minimum, add its sentences to the list and continue
        if int(year) < min_year:
            continue
        generate_and_save_model(cumulative_sentences, models_dir, year)
        logger.info('COMPLETE: ' + year + ' model saved')
        sys.stdout.flush()

def main():
    json_dir = '/projects/chemotext/sentences_by_year/'
    models_dir = '/projects/chemotext/medline_year_models/'
    # By default, generate models for all years we have sentences for
    minimum_year = 0
    maximum_year = 9999
    # If one command line argument is used, generate the model for only that year
    if len(sys.argv) == 2:
        minimum_year = int(sys.argv[1])
        maximum_year = int(sys.argv[1])
    # If two CL arguments are used, generate all models between the two years, inclusive
    elif len(sys.argv) == 3:
        minimum_year = int(sys.argv[1])
        maximum_year = int(sys.argv[2])
    # Sort the list of files so that earlier years come first
    json_list = sorted(os.listdir(json_dir))
    save_year_models(json_list, json_dir, models_dir, minimum_year, maximum_year)


if __name__ == '__main__':
    main()