
I am trying to use Age and Gender to predict Med, but I am new to scikit-learn's Pipeline and FeatureUnion and have run into some issues. I read through some tutorials and answers, which is how I wrote the code below, but I don't have a good grasp of how to feed the split data into the pipeline functions.

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.externals import joblib
from sklearn.metrics import confusion_matrix

# Import data into Pandas data frame
data_directory = 'C:/Users/Asus/'
file_name = 'Example.csv'

df = pd.read_csv(data_directory + file_name)
df_len = len(df)

# Get a list of all variables
print (list(df))

# Transformer that selects a subset of DataFrame columns by name
class Columns(BaseEstimator, TransformerMixin):
    def __init__(self, names=None):
        self.names = names
    def fit(self, X, y=None, **fit_params):
        # nothing to learn; selection is by fixed column names
        return self
    def transform(self, X):
        return X[self.names]
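
As a sanity check, on a toy frame (values made up here) the transformer just subsets by name:

toy = pd.DataFrame({'age': [34, 61], 'gender': ['F', 'M']})
print(Columns(names=['age']).fit_transform(toy))  # returns only the 'age' column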

numeric = [] # list of numeric column names
categorical = [] # list of categorical column names

# Creating random subsample for fast model building
def sample_n(df, n, replace=False, weight=None, seed=None):
    """Sample n rows from a DataFrame at random"""
    rs = np.random.RandomState(seed)
    locs = rs.choice(df.shape[0], size=n, replace=replace, p=weight)
    return df.take(locs, axis=0)

df = sample_n(df, n=300, seed=1123)
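
Incidentally, pandas' built-in sampler does the same job in one call:

df = df.sample(n=300, replace=False, random_state=1123)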

# Merge FG-LAI, SG-LAI and Both-LAI together into one group (MED=3)
df.loc[(df['MED']==4)|(df['MED']==5), 'MED'] = 3

# Remove No-Med (MED=1) and Both-LAI (MED=5) cases
df = df.drop(df[(df['MED']==1)|(df['MED']==5)].index)

# Separate target from training features
y = df['MED']
X = df.drop('MED', axis=1)

# Retain only the needed predictors
X = X.filter(['age', 'gender'])

# Find the numerical columns, exclude categorical columns
X_num_cols = X.columns[X.dtypes.apply(lambda c: np.issubdtype(c, np.number))]
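
For reference, pandas' select_dtypes gives the same list more directly:

X_num_cols = X.select_dtypes(include=[np.number]).columns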

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.5, 
                                                    random_state=567, 
                                                    stratify=y)

# Pipeline
pipe = Pipeline([
    ("features", FeatureUnion([
        ('numeric', make_pipeline(Columns(names=numeric),StandardScaler())),
        ('categorical', make_pipeline(Columns(names=categorical),OneHotEncoder(sparse=False)))
    ])),
    ('model', LogisticRegression())
])

# Declare hyperparameters
hyperparameters = {'logisticregression__c' : [0.01, 0.1, 1.0, 10.0],
                    'logisticregression__penalty' : ['l1', 'l2'],
                    'logisticregression__multi_class': ['ovr'],
                    'logisticregression__class_weight': ['balanced', None],
                    }

# SKlearn cross-validation with pipeline
clf = GridSearchCV(pipe, hyperparameters, cv=10)

# Fit and tune model
clf.fit(X_train, y_train)

Error:

ValueError: Invalid parameter logisticregression for estimator Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('numeric', Pipeline(memory=None,
     steps=[('columns', Columns(names=[])), ('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True))])), ('categorical', Pipeline(memory=None,
     steps=[('columns', Columns(nam...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]). Check the list of available parameters with `estimator.get_params().keys()`.

Edit:

print (pipe.get_params().keys())

gives

dict_keys(['memory', 'steps', 'features', 'LR_model', 'features__n_jobs', 'features__transformer_list', 'features__transformer_weights', 'features__numeric', 'features__categorical', 'features__numeric__memory', 'features__numeric__steps', 'features__numeric__columns', 'features__numeric__standardscaler', 'features__numeric__columns__names', 'features__numeric__standardscaler__copy', 'features__numeric__standardscaler__with_mean', 'features__numeric__standardscaler__with_std', 'features__categorical__memory', 'features__categorical__steps', 'features__categorical__columns', 'features__categorical__onehotencoder', 'features__categorical__columns__names', 'features__categorical__onehotencoder__categorical_features', 'features__categorical__onehotencoder__dtype', 'features__categorical__onehotencoder__handle_unknown', 'features__categorical__onehotencoder__n_values', 'features__categorical__onehotencoder__sparse', 'LR_model__C', 'LR_model__class_weight', 'LR_model__dual', 'LR_model__fit_intercept', 'LR_model__intercept_scaling', 'LR_model__max_iter', 'LR_model__multi_class', 'LR_model__n_jobs', 'LR_model__penalty', 'LR_model__random_state', 'LR_model__solver', 'LR_model__tol', 'LR_model__verbose', 'LR_model__warm_start'])
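
Based on those keys, the grid must be prefixed with the pipeline step name exactly as it appears (the printout shows 'LR_model' because I had renamed the step; with the pipeline as written above the prefix is 'model'), and the parameter is a capital C:

hyperparameters = {'model__C': [0.01, 0.1, 1.0, 10.0],
                   'model__penalty': ['l1', 'l2'],
                   'model__multi_class': ['ovr'],
                   'model__class_weight': ['balanced', None],
                   }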

After changing the prefix to 'model__', I get a new error:

ValueError: Found array with 0 feature(s) (shape=(109, 0)) while a minimum of 1 is required by StandardScaler.

Edit 2:

# Retain only the needed predictors
#X = X.filter(['age', 'ccis', 'num_claims', 'Prior_DIH', 'prior_ED_num'])
X_selected = X.filter(['age', 'Geo', 'ccis', 'num_claims', 'Prior_DIH', 'prior_ED_num',
    'DAD_readmit', 'Num_DAD_readmit', 'ED_readmit', 'NUmber_ED_readmit',
    'Fail_renew', 'FR_num'])

# from the selected X, further choose categorical only
X_selected_cat = X_selected.filter(['Geo', 'ccis']) # hand-selected, since some categorical vars are coded as 0/1

# Find the numerical columns, exclude categorical columns
X_num_cols = X_selected.columns[X_selected.dtypes.apply(lambda c: np.issubdtype(c, np.number))] # list of numeric column names, automated here
X_cat_cols = X_selected_cat.columns # list of categorical column names, hand-selected above

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, 
                                                    test_size=0.5, 
                                                    random_state=567, 
                                                    stratify=y)

# Pipeline
pipe = Pipeline([
    ("features", FeatureUnion([
        ('numeric', make_pipeline(Columns(names=X_num_cols),StandardScaler())),
        ('categorical', make_pipeline(Columns(names=X_cat_cols),OneHotEncoder(sparse=False)))
    ])),
    ('LR_model', LogisticRegression())
])

Error:

ValueError: could not convert string to float: 'Urban'
KubiK888
  • Have you printed "pipe.get_params().keys()" to see what keys you have available, as the message suggests? As you used the Pipeline API explicitly, "model" is the key for your logistic regression (whereas "make_pipeline" chooses it for you). So for you, the key for the parameters should be "model__c" and so on. – Marcus V. Feb 26 '18 at 22:13
  • Thanks Marcus, I have made the changes, but now encountered a new error. – KubiK888 Feb 26 '18 at 22:45
  • Please post some lines or upload the example.csv file – Vivek Kumar Feb 27 '18 at 05:00
  • It looks like you are passing the empty lists "numeric" and "categorical" instead of for instance "X_num_cols" which seems to be your list of numeric columns. – Marcus V. Feb 27 '18 at 13:57
  • Thanks Marcus, I think I am getting closer and closer. I made changes based on your suggestions, it seems the numeric variables are fine now. But now I am getting some error with the categorical variables. Do I need to add labelbinarizer before onehotencoder? If so, where should I insert the code? – KubiK888 Feb 27 '18 at 16:54

1 Answer

OneHotEncoder expects an integer input array, but you passed it strings. You could use LabelEncoder or LabelBinarizer to convert the strings to integers first; then you will be able to use OneHotEncoder.

pipe = Pipeline([
    ("features", FeatureUnion([
        ('numeric', make_pipeline(Columns(names=X_num_cols),StandardScaler())),
        ('categorical', make_pipeline(Columns(names=X_cat_cols),LabelEncoder(), OneHotEncoder(sparse=False)))
    ])),
    ('LR_model', LogisticRegression())
])
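
One caveat: LabelEncoder's fit and transform only accept a single 1-D array (it was designed for target labels), so dropping it straight into make_pipeline as above will raise a TypeError on a 2-D X. A minimal per-column wrapper that works in that slot might look like this (the class name is made up here):

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder

class MultiColumnLabelEncoder(BaseEstimator, TransformerMixin):
    """Label-encode each DataFrame column independently."""
    def fit(self, X, y=None):
        # fit one LabelEncoder per column
        self.encoders_ = {c: LabelEncoder().fit(X[c]) for c in X.columns}
        return self
    def transform(self, X):
        # stack the per-column integer codes into a 2-D array
        return np.column_stack([self.encoders_[c].transform(X[c])
                                for c in X.columns])

With that in place, the categorical branch becomes make_pipeline(Columns(names=X_cat_cols), MultiColumnLabelEncoder(), OneHotEncoder(sparse=False)).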
ebrahimi