0

How can I create a custom Pipeline in python? I tried with sklearn pipeline but seems it not running successfully. Mostly I need my pre-process as a customize pipeline with a logistics model.

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import eli5
from eli5.sklearn import PermutationImportance
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer

path = 'C:/Users/Desktop/'
df = pd.read_excel (path + "df.xlsx", sheet_name='df')

# import the BaseEstimator
from sklearn.base import BaseEstimator

# define the class OutletTypeEncoder
# custom transformer must have methods fit and transform
class OutletTypeEncoder(BaseEstimator):

    def __init__(self):
        pass

    def fit(self, documents, y=None):
        return self

    def transform(self, df):
        
        # replace NaN
        df[['pdf_tbl_pn_identifier', 'pdf_tbl_qty_identifier', 'pdf_header_present']] = df[['pdf_tbl_pn_identifier', 'pdf_tbl_qty_identifier', 'pdf_header_present']].fillna(value=-999)
        df[['pdf_tbl_cnt']] = df[['pdf_tbl_cnt']].fillna(value=0)

        # Replace gt 1 count as 0
        df['pdf_tbl_cnt'] = np.where( ( df['pdf_tbl_cnt'] == '1'), 1, 0)
        df['part_cnt'] = np.where( (df['part_cnt'] == '1'),  1, 0)

        # create numeric and categorica coulmns
        obj_df= df[['pdf_tbl_pn_identifier','pdf_tbl_qty_identifier','pdf_header_present',
                    'pdf_body_pn_identifier','pdf_body_qty_identifier','pdf_model_rel_returned','pdf_model_ent_returned']]
        num_df= df[['pdf_tbl_cnt', 'pdf_model_avg_relationship_score','pdf_model_avg_entity_score','part_cnt','matching']]

        # Labelencoding for categorica columns and then 
        obj_df=obj_df.apply(LabelEncoder().fit_transform)
        df = pd.concat([obj_df, num_df], axis=1)
        df.reset_index(inplace=True, drop=True)

        df.pdf_tbl_pn_identifier = df.pdf_tbl_pn_identifier.astype(str)
        df.pdf_tbl_qty_identifier = df.pdf_tbl_qty_identifier.astype(str)
        df.pdf_body_pn_identifier = df.pdf_body_pn_identifier.astype(str)
        df.pdf_body_qty_identifier = df.pdf_body_qty_identifier.astype(str)
        df.pdf_model_rel_returned = df.pdf_model_rel_returned.astype(str)
        df.pdf_model_ent_returned = df.pdf_model_ent_returned.astype(str)
        df.pdf_header_present = df.pdf_header_present.astype(str)
        df.matching = df.matching.astype(str)
        #df['pdf_tbl_cnt'] = df['pdf_tbl_cnt'].apply(np.int64) 
        df.pdf_tbl_cnt = df.pdf_tbl_cnt.apply(np.int64) 

        return df

feature_cols = df.drop(['matching'], axis=1)
X = feature_cols # Features
y = df.matching # Target variable

# split into train test sets
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=0)


logreg = LogisticRegression()
model_pipeline = Pipeline(steps=[('preprocess', OutletTypeEncoder()), 
                                 ('logreg', LogisticRegression())
                                 ])

# fit the pipeline with the training data
model_pipeline.fit(X_train,y_train)

I am getting error as below. Plz help me out

UnboundLocalError: local variable 'df' referenced before assignment

0 Answers0