I'm trying to use CountVectorizer()
with Pipeline
and ColumnTransformer
. Because CountVectorizer()
produces sparse matrix, I used FunctionTransformer
to ensure the ColumnTransformer
can hstack
correctly when putting together the resulting matrix.
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from typing import Callable
# Dataset
df = pd.DataFrame([['a', 'Hi Tom', 'It is hot', 1],
['b', 'How you been Tom', 'hot coffee', 2],
['c', 'Hi you', 'I want some coffee', 3]],
columns=['col_for_ohe', 'col_for_countvectorizer_1', 'col_for_countvectorizer_2', 'num_col'])
# Use FunctionTransformer to ensure dense matrix
def tf_text(X, vectorizer_tf: Callable):
X_vect_ = vectorizer_tf.fit_transform(X)
return X_vect_.toarray()
tf_transformer = FunctionTransformer(tf_text, kw_args={'vectorizer_tf': CountVectorizer()})
# Transformation Pipelines
tf_transformer_pipe = Pipeline(
steps = [('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
('tf', tf_transformer)])
ohe_transformer_pipe = Pipeline(
steps = [('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
('ohe', OneHotEncoder(handle_unknown='ignore', sparse=False))])
transformer = ColumnTransformer(transformers=[
('cat_ohe', ohe_transformer_pipe, ['col_for_ohe']),
('cat_tf', tf_transformer_pipe, ['col_for_countvectorizer_1', 'col_for_countvectorizer_2'])
], remainder='passthrough')
transformed_df = transformer.fit_transform(df)
I get AttributeError: 'numpy.ndarray' object has no attribute 'lower.' I've seen this question and suspect CountVectorizer()
is the culprit but not sure how to solve it (previous question doesn't use ColumnTransformer
). I stumbled upon a DenseTransformer
that I wish I could use instead of FunctionTransformer
but unfortunately it is not supported in my company.