(1) My goal: I am trying to use an SVM to classify 10,000 documents (each about 400 words long) into 10 evenly distributed classes. The features explored in my work include word n-grams (n = 1–4) and character n-grams (n = 1–6).
(2) My approach: I represent each document as a vector of frequency values, one per feature, and apply TF-IDF to normalize the vectors. Parts of my code are below:
def commonVec(dicts,count1,count2):
''' put features with frequency between count1 and count2 into a common vector used for SVM training'''
global_vector = []
master = {}
for i, d in enumerate(dicts):
for k in d:
master.setdefault(k, []).append(i)
for key in master:
if (len(master[key])>=count1 and len(master[key])<=count2):
global_vector.append(key)
global_vector1 = sorted(global_vector)
return global_vector1
def featureComb(mix,count1,count2,res1):
'''combine word n-gram and character n-gram into a vector'''
if mix[0]:
common_vector1 = []
for i in mix[0]:
dicts1 = []
for res in res1: #I stored all documents into database. res1 is the document result set and res is each document.
dicts1.append(ngram.characterNgrams(res[1], i)) # characterNgrams()will return a dictionary with feature name as the key, frequency as the value.
common_vector1.extend(commonVec(dicts1, count1, count2))
else:
common_vector1 = []
if mix[1]:
common_vector2 = []
for j in mix[1]:
dicts2 = []
for res in res1:
dicts2.append(ngram.wordNgrams(res[1], j))
common_vector2.extend(commonVec(dicts2, count1, count2))
else:
common_vector2 = []
return common_vector1+common_vector2
def svmCombineVector(mix,global_combine,label,X,y,res1):
'''Construct X vector that can be used to train SVM'''
lstm = []
for res in res1:
y.append(label[res[0]]) # insert class label into y
dici1 = {}
dici2 = {}
freq_term_vector = []
for i in mix[0]:
dici1.update(ngram.characterNgrams(res[1], i))
freq_term_vector.extend(dici1[gram] if gram in dici1 else 0 for gram in global_combine)
for j in mix[1]:
dici2.update(ngram.wordNgrams(res[1], j))
freq_term_vector.extend(dici2[gram] if gram in dici2 else 0 for gram in global_combine)
lstm.append(freq_term_vector)
freq_term_matrix = np.matrix(lstm)
transformer = TfidfTransformer(norm="l2")
tfidf = transformer.fit_transform(freq_term_matrix)
X.extend(tfidf.toarray())
X = []
y = []
character = [1,2,3,4,5,6]
word = [1,2,3,4]
mix = [character,word]
global_vector_combine = featureComb(mix, 2, 5000, res1)
print len(global_vector_combine) # 542401
svmCombineVector(mix,global_vector_combine,label,X,y,res1)
clf1 = svm.LinearSVC()
clf1.fit(X, y)
(3) My problem: However, when I run this code, a MemoryError occurs:
Traceback (most recent call last):
File "svm.py", line 110, in <module>
functions.svmCombineVector(mix,global_vector_combine,label,X,y,res1)
File "/home/work/functions.py", line 201, in svmCombineVector
X.extend(tfidf.toarray())
File "/home/anaconda/lib/python2.7/site-packages/scipy/sparse/compressed.py", line 901, in toarray
return self.tocoo(copy=False).toarray(order=order, out=out)
File "/home/anaconda/lib/python2.7/site-packages/scipy/sparse/coo.py", line 269, in toarray
B = self._process_toarray_args(order, out)
File "/home/anaconda/lib/python2.7/site-packages/scipy/sparse/base.py", line 789, in _process_toarray
_args
return np.zeros(self.shape, dtype=self.dtype, order=order)
MemoryError
I have been stuck on this for a while and would really appreciate some help.
- Could anyone explain what causes this error and give me some ideas for solving it?
- Could anyone review my source code and suggest other ways to use memory more effectively?