I want to keep only the top K values in each row of a sparse matrix. Here is the code I use for that:
def _top_k(args, k):
"""
Helper function to process a single row of top_k
"""
data, row = args
data, row = zip(*sorted(zip(data, row), reverse=True)[:k])
return data, row
# https://stackoverflow.com/a/49142237/4870273
# Keep only the top k values in each row of sparse matrix m, to drop the unimportant keywords from the tf-idf matrix.
def top_k_values_of_sparse_matrix(m, k):
    """
    Keep only the top k elements of each row in a csr_matrix.

    Parameters
    ----------
    m : sparse matrix
        Input matrix; it is converted with ``m.tolil()`` so that every row
        can be processed independently.
    k : int
        Maximum number of stored entries to keep per row.

    Returns
    -------
    The filtered matrix converted back with ``tocsr()``.
    """
    ml = m.tolil()
    # NOTE(review): if ``m`` is already in LIL format, ``tolil()`` may hand
    # back the same object, in which case the rows below are modified in
    # place on the caller's matrix -- confirm if that matters for callers.
    with Pool() as p:
        ms = p.map(functools.partial(_top_k, k=k), zip(ml.data, ml.rows))
    # Write every row back in place instead of unpacking ``zip(*ms)``:
    #  * unpacking fails with ValueError when the matrix has zero rows;
    #  * ``ml.data`` / ``ml.rows`` stay arrays of lists rather than becoming
    #    tuples of tuples;
    #  * the selected entries are re-sorted by column index, because the
    #    helper returns them ordered by descending value.
    for i, (values, columns) in enumerate(ms):
        by_column = sorted(zip(columns, values))
        ml.rows[i] = [column for column, _ in by_column]
        ml.data[i] = [value for _, value in by_column]
    return ml.tocsr()
I use it in a pipeline as a custom transformer. It seems to work on a small standalone example:
# Standalone check: wrap the per-row top-k filter (k=3) in a FunctionTransformer
# that accepts sparse input.
f = FunctionTransformer(functools.partial(top_k_values_of_sparse_matrix, k=3), accept_sparse=True)
# Apply it to a sparse matrix built from the single row [1, 2, 3, 4, 5].
a = f.transform(csr_matrix([1,2,3,4,5]))
# Print the result as a dense array.
print(a.toarray())
The output is shown below; it contains only the top 3 values of the row:
[[0 0 3 4 5]]
But when I use it in my code:
# Three-step pipeline: TF-IDF vectorization -> per-row top-k filter -> SGD classifier.
text_clf = Pipeline([
# Step 'tfidf': TF-IDF features over 1- to 3-grams, using the custom preprocessor and stop words.
('tfidf', TfidfVectorizer(preprocessor=preprocess_text, max_df=0.1, min_df=1, max_features=n_features, stop_words=stop_words, ngram_range=(1,3))),
# Step 'tra1': keep only the 50 largest values in each row of the sparse matrix.
('tra1', FunctionTransformer(functools.partial(top_k_values_of_sparse_matrix, k=50), accept_sparse=True)),
# Step 'clf': SGD classifier with modified Huber loss and L2 penalty.
('clf', SGDClassifier(loss='modified_huber', penalty='l2', alpha=1e-4, random_state=random_state, max_iter=5, tol=None)),
])
# Fit the whole pipeline on the first n_samples documents and their targets.
text_clf.fit(dataset.data[:n_samples], dataset.target[:n_samples])
It throws the error:
Traceback (most recent call last): File "/usr/local/Cellar/python/3.7.6_1/Frameworks/Python.framework/Versions/3.7/lib/python3.7/multiprocessing/pool.py", line 121, in worker result = (True, func(*args, **kwds)) File "/usr/local/Cellar/python/3.7.6_1/Frameworks/Python.framework/Versions/3.7/lib/python3.7/multiprocessing/pool.py", line 44, in mapstar return list(map(*args)) File "/Users/admin/Projects/Python/xy.py", line 50, in _top_k data, row = zip(*sorted(zip(data, row), reverse=True)[:k]) ValueError: not enough values to unpack (expected 2, got 0)
pointing at this line in the _top_k function:
data, row = zip(*sorted(zip(data, row), reverse=True)[:k])
Edit: Hmm, the only explanation I can think of is that some row of the matrix is entirely zero (i.e. empty), which is strange. For now I have simply wrapped the function body in a try/except block, and for the other rows it at least works.