import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
data = [[1, 3, 4, 'text', 'pos'], [9, 3, 6, 'text more', 'neg']]
data = pd.DataFrame(data, columns=['Num1', 'Num2', 'Num3', 'Text field', 'Class'])
tweet_text_transformer = Pipeline(steps=[
('count_vectoriser', CountVectorizer()),
('tfidf', TfidfTransformer())
])
numeric_transformer = Pipeline(steps=[
('scaler', MinMaxScaler())
])
preprocessor = ColumnTransformer(transformers=[
# (name, transformer, column(s))
('tweet', tweet_text_transformer, ['Text field']),
('numeric', numeric_transformer, ['Num1', 'Num2', 'Num3'])
])
pipeline = Pipeline(steps=[
('preprocessor', preprocessor),
('classifier', LinearSVC())
])
X_train = data.loc[:, 'Num1':'Text field']
y_train = data['Class']
pipeline.fit(X_train, y_train)
I don't understand where this error is coming from:
ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 1 and the array at index 1 has size 2