I have written a Python class for preprocessing data before classification modeling. I built it with sklearn-style fit/transform functionality — and therein lies the rub: after fitting the class object, I need to be able to save it and load it again in a production setting.
I am working in Azure Databricks, and I have tried using pickle or joblib to save the object to a mounted Azure Gen2 data lake, but I run into a 'FileNotFoundError'. I am open to alternatives such as MLflow if that is possible or necessary.
Does anyone know how I could save this kind of object in Azure Databricks?
import joblib
import numpy as np
import pandas as pd
# Load the training data from the lake and bring it to the driver as pandas.
df_train = spark.read.parquet('BinaryClassigicationData.parquet')
df_train = df_train.toPandas()

# fit() returns the encoder itself, so fit_woe and woe_encoder are the same object.
woe_encoder = WeightOfEvidenceEncoder()
fit_woe = woe_encoder.fit(data=df_train,
                          y_var='y'
                          )

# BUG FIX: '/dbfs:/mnt/...' mixes the 'dbfs:' URI scheme with a POSIX path,
# which is why open() raised FileNotFoundError. The local-file API used by
# open()/joblib needs the FUSE mount path '/dbfs/mnt/...'; the 'dbfs:/mnt/...'
# form is only understood by dbutils/Spark APIs.
path_root = '/dbfs/mnt/deltalake/'
path_models = 'MODELS/'

with open(path_root + path_models + 'WoE_Encoder.joblib', 'wb') as file:
    # BUG FIX: the original dumped the undefined name 'imp_set';
    # persist the fitted encoder instead.
    joblib.dump(fit_woe, file)
class WeightOfEvidenceEncoder():
    '''
    WeightOfEvidenceEncoder calculates the 'weight of evidence' (WoE) of
    categorical variable classes and recodes them with those values, converting
    them to numeric variables before classification modelling. The encoder can
    also calculate the 'information value' (IV) of the categorical variable
    overall and of the classes individually.

    For more info see
    https://towardsdatascience.com/model-or-do-you-mean-weight-of-evidence-woe-and-information-value-iv-331499f6fc2
    '''

    def __init__(self):
        # dict mapping fitted variable name -> WoE/IV DataFrame; None until fit().
        self.taught = None
        # Kept for backward compatibility with earlier versions of this class.
        self.x_var = 'auto'

    def get_weight_of_evidence(self, data, dependant_var, independant_var, category):
        '''
        Compute the weight of evidence for one class of a categorical variable.

        Parameters:
            data: pandas DataFrame containing dependant_var and independant_var.
            dependant_var: name of the boolean target column.
            independant_var: name of the categorical column.
            category: the class of independant_var to score.

        Returns:
            (weight_of_evidence, perc_neg, perc_pos) — the WoE plus the share of
            negatives and positives falling in this class.
        '''
        counts = data.groupby([dependant_var, independant_var]).count().iloc[:, 0]
        # BUG FIX: the level name was hard-coded to 'Instructed', which only
        # worked for one specific dataset; use the supplied dependant_var so the
        # encoder works with any target column name (e.g. y_var='y').
        tot_neg = counts.xs(False, level=dependant_var).sum()
        tot_pos = counts.xs(True, level=dependant_var).sum()
        # A class can be absent from either outcome; count it as 0 then.
        # (Original only guarded the positive lookup and used a bare except.)
        try:
            cat_neg = counts.xs((False, category))
        except KeyError:
            cat_neg = 0
        try:
            cat_pos = counts.xs((True, category))
        except KeyError:
            cat_pos = 0
        perc_neg = cat_neg / tot_neg
        perc_pos = cat_pos / tot_pos
        # A zero share on either side makes the log-ratio undefined. The
        # original produced +/-inf here, which transform() then replaced with 0;
        # return 0 directly so fit() output is finite end to end.
        if perc_neg == 0 or perc_pos == 0:
            weight_of_evidence = 0
        else:
            weight_of_evidence = np.log(perc_neg / perc_pos)
        return weight_of_evidence, perc_neg, perc_pos

    def get_information_value(self, data, dependant_var, independant_var):
        '''
        Compute per-class WoE and information value for one categorical variable.

        Information Value | Predictive Power
        ------------------|-----------------
        <0.02             | Terrible
        0.02 - 0.1        | Weak
        0.1 - 0.3         | Medium
        0.3 - 0.5         | Strong
        >0.5              | Fishy

        Returns a DataFrame with one row per class, sorted by InformationValue
        descending, with a TotalInformationValue column repeated on every row.

        Raises:
            ValueError: if independant_var is neither object (string) nor bool.
        '''
        if data[independant_var].dtype == 'O':
            classes = list(data[independant_var].astype('category').cat.categories)
        elif data[independant_var].dtype == 'bool':
            classes = [True, False]
        else:
            raise ValueError('get_information_value: independant_var must be either string or boolean')
        rows = []
        for independant_class in classes:
            weight_of_evidence, perc_neg, perc_pos = self.get_weight_of_evidence(
                data=data,
                dependant_var=dependant_var,
                independant_var=independant_var,
                category=independant_class
            )
            rows.append([independant_class, weight_of_evidence, perc_neg, perc_pos])
        df2 = pd.DataFrame(rows,
                           columns=['Class', 'WeightOfEvidence', 'PercNegativeOverlap', 'PercPositiveOverlap']
                           )
        # IV per class = (share of negatives - share of positives) * WoE.
        df2 = df2.assign(InformationValue=(df2.PercNegativeOverlap - df2.PercPositiveOverlap) * df2.WeightOfEvidence)
        df2 = (df2.assign(TotalInformationValue=df2.InformationValue.sum())
                  .sort_values('InformationValue', ascending=False)
               )
        return df2

    def fit(self, data, y_var, x_var='auto', include_bools=False, exclude=None):
        '''
        Learn WoE tables for the categorical columns of `data`.

        Parameters:
            data: pandas DataFrame of training data.
            y_var: name of the boolean target column.
            x_var: 'auto' to detect categorical columns, or a single column name.
            include_bools: when auto-detecting, also include bool columns.
            exclude: iterable of column names to skip (default: none).

        Returns:
            self, so calls can be chained (sklearn style).
        '''
        # BUG FIX: the default used to be the mutable list [None]; use a None
        # sentinel instead (same effective behaviour: nothing excluded).
        exclude = [] if exclude is None else exclude
        if x_var == 'auto':
            wanted_dtypes = ('O', 'bool') if include_bools else ('O',)
            # BUG FIX: the original tested `~(col in exclude)` — bitwise NOT on
            # a bool is always truthy (~True == -2, ~False == -1), so `exclude`
            # was silently ignored. Use `not in`.
            independant_vars = [col for col in data.columns
                                if data[col].dtype in wanted_dtypes
                                and col != y_var
                                and col not in exclude]
        else:
            independant_vars = [x_var]
        self.taught = {var: self.get_information_value(data=data,
                                                       dependant_var=y_var,
                                                       independant_var=var)
                       for var in independant_vars}
        return self

    def transform(self, data):
        '''
        Recode each fitted categorical column with its learned WoE values.

        NOTE: mutates `data` in place and also returns it. Classes not seen
        during fit map to NaN (pandas Series.map behaviour).
        '''
        for var, var_information in self.taught.items():
            # Defensive: zero out any infinite WoE before mapping.
            mapping = var_information.replace([np.inf, -np.inf], 0).set_index('Class')
            data[var] = data[var].map(mapping['WeightOfEvidence'])
        return data

    def fit_transform(self, data, y_var, x_var='auto', include_bools=False, exclude=None):
        '''Fit on `data`, then transform it; returns the recoded DataFrame.'''
        return (self.fit(data=data,
                         y_var=y_var,
                         x_var=x_var,
                         include_bools=include_bools,
                         exclude=exclude
                         )
                .transform(data=data)
                )