I have a fairly large list of lists representing the tokenized documents in the Sogou text classification data set. I can process the entire training set of 450,000 documents with 12 GB of RAM to spare, but when I call numpy.save() on the list of lists, memory usage appears to double and I run out of memory.
Why does this happen? Does numpy.save convert the list to an array before saving while still keeping the original list in memory, thereby doubling the usage?
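My rough mental model (an assumption on my part, not something I have verified in the NumPy source) is that np.save first turns the input into an object-dtype array, which would then sit in memory alongside the original list of lists until the write finishes, something like:

import numpy as np

docs = [["some", "tokens"], ["more", "tokens", "here"]]  # stand-in for my real gen_docs

# If np.save does roughly this internally, the object array and the
# original list of lists both occupy memory until the call returns.
arr = np.asanyarray(docs)  # on my NumPy version, ragged input becomes a 1-D dtype=object array
print arr.dtype            # object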
Is there an alternative way to save this list of lists, e.g. pickling it directly? I believe numpy.save already uses the pickle protocol, judging from the allow_pickle argument: https://docs.scipy.org/doc/numpy/reference/generated/numpy.save.html
print "Collecting Raw Documents, tokenize, and remove stop words"
df = pd.read_pickle(path + dataSetName + "Train")
frequency = defaultdict(int)
gen_docs = []
totalArts = len(df)
for artNum in range(totalArts):
if artNum % 2500 == 0:
print "Gen Docs Creation on " + str(artNum) + " of " + str(totalArts)
bodyText = df.loc[artNum,"fullContent"]
bodyText = re.sub('<[^<]+?>', '', str(bodyText))
bodyText = re.sub(pun, " ", str(bodyText))
tmpDoc = []
for w in word_tokenize(bodyText):
w = w.lower().decode("utf-8", errors="ignore")
#if w not in STOPWORDS and len(w) > 1:
if len(w) > 1:
#w = wordnet_lemmatizer.lemmatize(w)
w = re.sub(num, "number", w)
tmpDoc.append(w)
frequency[w] += 1
gen_docs.append(tmpDoc)
print len(gen_docs)
del df
print "Saving unfiltered gen"
dataSetName = path + dataSetName
np.save("%s_lemmaWords_noStop_subbedNums.npy" % dataSetName, gen_docs)