
I have written a Python class for preprocessing data before classification modelling. I built it with sklearn-style fit/transform functionality, and therein lies the rub: after fitting the class object I need to be able to save it and load it in a production setting.

I am working in Azure Databricks and I have tried using pickle and joblib to save the object to a mounted ADLS Gen2 data lake, but I run into a 'FileNotFoundError'. I am open to alternatives like MLflow or something else if possible/necessary.

Does anyone know how I could save this kind of object in Azure Databricks?

import joblib
import numpy as np
import pandas as pd

df_train = spark.read.parquet('BinaryClassigicationData.parquet')
df_train = df_train.toPandas()

woe_encoder = WeightOfEvidenceEncoder()

fit_woe = woe_encoder.fit(data=df_train,
                          y_var='y'
                         )

path_root = '/dbfs:/mnt/deltalake/'
path_models = 'MODELS/'

with open(path_root + path_models + 'WoE_Encoder.joblib', 'wb') as file:
  joblib.dump(fit_woe, file)

class WeightOfEvidenceEncoder():

  '''
  WeightOfEvidenceEncoder calculates the 'weight of evidence' (WoE) of the
  classes of a categorical variable and recodes them with those values,
  converting the variable to a numeric one before classification modelling.
  For each class, WoE = ln(share of negatives in class / share of positives
  in class). The encoder can also calculate the 'information value' (IV) of
  the categorical variable overall and of the classes individually.
  For more info see https://towardsdatascience.com/model-or-do-you-mean-weight-of-evidence-woe-and-information-value-iv-331499f6fc2
  '''

  def __init__(self):
    self.taught = None
    self.x_var = 'auto'
    
  def get_weight_of_evidence(self, data, dependant_var, independant_var, category):

    # Row counts per (dependant, independant) class combination
    multi_index = data.groupby([dependant_var, independant_var]).count().iloc[:, 0]

    # Use the dependant variable's name for the level rather than a
    # hardcoded column name, so the encoder works with any target
    tot_neg = multi_index.xs(False, level=dependant_var).sum()
    tot_pos = multi_index.xs(True, level=dependant_var).sum()

    try:
      cat_neg = multi_index.xs((False, category))
    except KeyError:  # class absent among negatives
      cat_neg = 0

    try:
      cat_pos = multi_index.xs((True, category))
    except KeyError:  # class absent among positives
      cat_pos = 0

    perc_neg = cat_neg / tot_neg
    perc_pos = cat_pos / tot_pos

    # A zero ratio makes np.log return -inf/inf (with a warning) rather than
    # raising, so guard explicitly instead of relying on try/except
    if perc_neg == 0 or perc_pos == 0:
      weight_of_evidence = 0
    else:
      weight_of_evidence = np.log(perc_neg / perc_pos)

    return weight_of_evidence, perc_neg, perc_pos
  
    
  def get_information_value(self, data, dependant_var, independant_var):
    
    '''
    Information Value | Predictive Power
    ------------------|-----------------
    <0.02             | Terrible
    0.02 - 0.1        | Weak
    0.1 - 0.3         | Medium
    0.3 - 0.5         | Strong
    >0.5              | Fishy
    '''

    df = data
    
    if df[independant_var].dtype == 'O':

      classes = list(df[independant_var].astype('category').cat.categories)

    elif df[independant_var].dtype == 'bool':

      classes = [True, False]

    else:

      raise ValueError('get_information_value: independant_var must be either string or boolean')
      
    df2 = pd.DataFrame(data=None,
                       columns=['Class', 'WeightOfEvidence', 'PercNegativeOverlap', 'PercPositiveOverlap']
                       )

    for independant_class in classes:

      weight_of_evidence, perc_neg, perc_pos = self.get_weight_of_evidence(data=df,
                                                                           dependant_var=dependant_var,
                                                                           independant_var=independant_var,
                                                                           category=independant_class
                                                                           )

      df2.loc[len(df2.index)] = [independant_class, weight_of_evidence, perc_neg, perc_pos]

    df2 = df2.assign(InformationValue = (df2.PercNegativeOverlap - df2.PercPositiveOverlap)*df2.WeightOfEvidence)
    df2 = (df2.assign(TotalInformationValue = df2.InformationValue.sum())
              .sort_values('InformationValue', ascending=False)
          )            

    return df2
  
  
  def fit(self, data, y_var, x_var='auto', include_bools=False, exclude=[None]):

    df = data

    if x_var == 'auto':
      if include_bools:

        # Skip the target column and anything explicitly excluded;
        # note 'not in' rather than '~(col in ...)', since '~' is bitwise
        # NOT and is always truthy on a Python bool
        independant_vars = [col for col in df.columns
                            if ((df[col].dtype == 'O') or (df[col].dtype == 'bool'))
                            and ((col != y_var) and (col not in exclude))
                            ]
      else:

        independant_vars = [col for col in df.columns if (df[col].dtype == 'O')
                            and ((col != y_var) and (col not in exclude))
                           ]

    else:

      independant_vars = [x_var]
      
    independant_var_information = []
    
    for var in independant_vars:
      
      var_information = self.get_information_value(data=df,
                                                   dependant_var=y_var,
                                                   independant_var=var
                                                   )

      independant_var_information.append(var_information)
      
    self.taught = dict(zip(independant_vars, independant_var_information))
    
    return self
  
  def transform(self, data):

    for var in self.taught:

      # Zero out infinite WoE values (classes seen in only one outcome)
      var_data = self.taught[var].replace([np.inf, -np.inf], 0)
      var_data = var_data.set_index('Class')

      # Recode each class with its WoE; classes unseen at fit time map to NaN
      data[var] = data[var].map(var_data['WeightOfEvidence'])

    return data
  
  def fit_transform(self, data, y_var, x_var='auto', include_bools=False, exclude=[None]):
    
    df = data
    
    out = (self.fit(data=df, 
                    y_var=y_var, 
                    x_var=x_var, 
                    include_bools=include_bools, 
                    exclude=exclude
                    )
               .transform(data=df)
          )
    
    return out

1 Answer


The main problem is that you're trying to use Python's local file API, like open, with a DBFS URI; Python doesn't know anything about that filesystem. The solution is to use the local DBFS mount available at /dbfs (this works only if you're not on Databricks Community Edition). So change the code to

path_root = '/dbfs/mnt/deltalake/'
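
For completeness, here is a minimal sketch of the full save/load round trip through the mount (the file name and variable names are just illustrative):

import joblib

path_root = '/dbfs/mnt/deltalake/'   # local mount point: no ':' after /dbfs
path_models = 'MODELS/'

# Save the fitted encoder through the local mount
with open(path_root + path_models + 'WoE_Encoder.joblib', 'wb') as file:
  joblib.dump(fit_woe, file)

# Later, e.g. in the production job, load it back
# (the WeightOfEvidenceEncoder class must be importable at load time)
with open(path_root + path_models + 'WoE_Encoder.joblib', 'rb') as file:
  woe_encoder = joblib.load(file)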

Also, you may look into custom models in MLflow; they are specifically designed to have custom code applied to the data when doing inference. You can find more details in this answer.
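
For illustration, a minimal sketch of wrapping the fitted encoder in an MLflow pyfunc model (the wrapper class name and artifact path are placeholders, not part of your code):

import mlflow
import mlflow.pyfunc

class WoEEncoderModel(mlflow.pyfunc.PythonModel):
  # Carries the fitted encoder so MLflow serializes it with the model
  def __init__(self, encoder):
    self.encoder = encoder

  def predict(self, context, model_input):
    # model_input arrives as a pandas DataFrame; apply the fitted WoE mapping
    return self.encoder.transform(model_input)

with mlflow.start_run():
  mlflow.pyfunc.log_model(artifact_path='woe_encoder',
                          python_model=WoEEncoderModel(fit_woe))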

P.S. On Databricks Community Edition you can just write to the local disk and then use dbutils.fs.cp to copy the file to DBFS (see this answer for more details).
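
Roughly like this (paths are illustrative):

import joblib

# Write to the driver's local disk first...
with open('/tmp/WoE_Encoder.joblib', 'wb') as file:
  joblib.dump(fit_woe, file)

# ...then copy the file into DBFS
dbutils.fs.cp('file:/tmp/WoE_Encoder.joblib', 'dbfs:/WoE_Encoder.joblib')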

Alex Ott