I wrote multiple steps to impute a dataset, and I want to pickle/save these steps so that they can be loaded and applied automatically when analyzing a new sample.
The steps I did for imputation are:
# Fit MissForest on the raw data and fill in the missing entries. The result
# is wrapped in a DataFrame so the original column labels are re-attached
# (presumably fit_transform returns an unlabeled array -- TODO confirm).
imputer = MissForest()
imputed_data = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)
# Version of the imputed frame without the 'id' column; imputed_data itself
# still contains 'id'.
imputed_data_initial = imputed_data.drop(columns='id')
#Get unique values
def get_unique_values(col_name):
    """Return the distinct non-missing values of one column of ``data``.

    Args:
        col_name: Label of the column to look up in the module-level
            ``data`` frame.

    Returns:
        A plain Python list of the column's unique values, with missing
        entries removed before uniqueness is computed.
    """
    observed = data[col_name].dropna()
    distinct = observed.unique()
    return distinct.tolist()
#Find closest distance
def find_closest_value(target, unique_values):
    """Return the element of *unique_values* nearest to *target*.

    Distance is the squared difference ``(target - value) ** 2``. When
    several candidates are equally near, the one that appears first in
    *unique_values* is returned.

    Args:
        target: Number to snap to the nearest candidate.
        unique_values: Non-empty sequence of candidate numbers.

    Returns:
        The closest candidate, exactly as stored in *unique_values*.

    Raises:
        IndexError: If *unique_values* is empty.
    """
    # len() rather than truthiness so that NumPy arrays are accepted too.
    if len(unique_values) == 0:
        raise IndexError("unique_values must contain at least one candidate")
    # min() keeps the first of several equally small keys, which matches a
    # scan that only replaces its choice on a strictly smaller distance.
    return min(unique_values, key=lambda value: (target - value) ** 2)
#Imputation
# The column list has to exist before the loop header reads it; assigning it
# inside the loop body raises NameError on the very first iteration.
columns_name_lst = imputed_data.columns
for col_name in columns_name_lst:
    unique_values = get_unique_values(col_name)
    # Only columns with fewer than 2000 distinct observed values are snapped
    # (presumably these are the discrete/categorical ones -- TODO confirm the
    # cut-off). A column with no observed values has nothing to snap to, so
    # it is left as imputed.
    if not 0 < len(unique_values) < 2000:
        continue
    # Replace the whole column in one assignment. Writing through
    # ``imputed_data.iloc[i][col_name] = ...`` is chained indexing, which can
    # update a temporary copy of the row and leave the DataFrame unchanged.
    # ``candidates`` is bound as a default so the lambda does not depend on
    # the loop variable being rebound later.
    imputed_data[col_name] = imputed_data[col_name].map(
        lambda target, candidates=unique_values: find_closest_value(
            target, candidates
        )
    )
I want to pickle all these steps as a whole. What are some ways I can do this in Python? Thanks!