I'm new to the sk-learn pipeline and would like use my own form of discretized binning. I need to bin a column of values based on the cumulative sum of another column associated with the original column. I have a working function:
def dynamic_bin(df, column, weight, minimum):
"""
Parameters
----------
df : dataframe
column : column to be binned
weight : column that will dictate the bin
minimum : minimum weight per bin
Returns
-------
df : dataframe with new binned column
"""
bins = [-np.inf]
labels = []
hold_over = []
for i in sorted(df[column].unique()):
g = df[df[column] == i].groupby(column).agg({weight:'sum'}).reset_index()
if g[weight].values[0] < minimum:
if hold_over is None:
hold_over.append(g[weight].values[0])
elif (sum(hold_over) + g[weight].values[0]) < minimum:
hold_over.append(g[weight].values[0])
elif (sum(hold_over) + g[weight].values[0]) >= minimum:
hold_over.clear()
bins.append(g[column].values[0])
labels.append(g[column].values[0])
elif g[weight].values[0] >= minimum:
bins.append(g[column].values[0])
labels.append(g[column].values[0])
bins.pop()
bins.append(np.inf)
str_column = str(column)+str("_binned")
# print(str_column)
df[str_column] = pd.cut(df[column],
bins = bins,
labels = labels)
return df
This is how I tried to make it a class.
from sklearn.base import BaseEstimator, TransformerMixin
class dynamic_bin(BaseEstimator, TransformerMixin):
def __init__(self, weight, minimum):
self.weight = weight
self.minimum = minimum
def fit(self, X, y=None):
return self
def tranform(self, X):
"""
Parameters
----------
df : dataframe
column : column to be binned
weight : column that will dictate the bin
minimum : minimum weight per bin
Returns
-------
df : dataframe with new binned column
"""
bins = [-np.inf]
labels = []
hold_over = []
for i in sorted(df[column].unique()):
g = df[df[column] == i].groupby(column).agg({weight:'sum'}).reset_index()
if g[weight].values[0] < minimum:
if hold_over is None:
hold_over.append(g[weight].values[0])
elif (sum(hold_over) + g[weight].values[0]) < minimum:
hold_over.append(g[weight].values[0])
elif (sum(hold_over) + g[weight].values[0]) >= minimum:
hold_over.clear()
bins.append(g[column].values[0])
labels.append(g[column].values[0])
elif g[weight].values[0] >= minimum:
bins.append(g[column].values[0])
labels.append(g[column].values[0])
bins.pop()
bins.append(np.inf)
str_column = str(column)+str("_binned")
# print(str_column)
df[str_column] = pd.cut(df[column],
bins = bins,
labels = labels)
return df[str_column]
When I try to implement it the following way, i get the error underneath it:
column_trans = ColumnTransformer(
[
("binned_numeric", dynamic_bin(weight = 'Exposure', minimum = 1000),
["VehAge", "DrivAge"]),
("onehot_categorical", OneHotEncoder(),
["VehBrand", "VehPower", "VehGas", "Region", "Area"]),
("passthrough_numeric", "passthrough",
["BonusMalus"]),
("log_scaled_numeric", log_scale_transformer,
["Density"]),
],
remainder="drop",
)
X = column_trans.fit_transform(df)
TypeError: All estimators should implement fit and transform, or can be 'drop' or 'passthrough' specifiers. 'dynamic_bin(minimum=1000, weight='Exposure')' (type <class 'dynamic_bin.dynamic_bin'>) doesn't.
I read the following but I don't really follow it.
Put customized functions in Sklearn pipeline
Does anyone spot the mistake i've made?