I'm working on a credit card fraud detection problem. While constructing the model, I encoded the categorical columns using K-Fold Target Encoding (the encoding code is shown below). After training, testing (the testing dataset was encoded the same way as the training dataset), and evaluation, I had my final model. Now I have a platform where the user gives me inputs and I should return a prediction. The input contains categorical values: the user gives me a single observation, and I need to pass it to the model to get a prediction.
I need to encode the categorical variables of that single observation the same way. How can I do that?
The K-Fold Target Encoding code:
import category_encoders
import numpy as np
import pandas as pd
from sklearn import base
from sklearn.model_selection import KFold
class KFoldTargetEncoderTrain(base.BaseEstimator, base.TransformerMixin):
    """Out-of-fold target encoder for one categorical column of a labelled frame.

    For every row, the encoded value is the mean of the target computed on the
    *other* folds for that row's category, so a row never sees its own label.
    Categories that do not occur in the other folds fall back to the global
    target mean.

    Because ``transform`` reads the target column from ``X``, this transformer
    can only be applied to labelled data; it cannot encode an observation that
    has no target value.

    Parameters
    ----------
    colnames : str
        Name of the single categorical column to encode (it is concatenated
        with a string suffix, so it must be a ``str``).
    targetName : str
        Name of the target column; it must be present in the frame passed to
        ``transform``.
    n_fold : int, default 5
        Number of folds handed to ``KFold``.
    verbosity : bool, default True
        When true, print the fold splitter and the correlation between the
        encoded feature and the target.
    discardOriginal_col : bool, default False
        When true, the returned frame no longer contains the original
        categorical column ``colnames``.
    """

    def __init__(self, colnames, targetName,
                 n_fold=5, verbosity=True,
                 discardOriginal_col=False):
        self.colnames = colnames
        self.targetName = targetName
        self.n_fold = n_fold
        self.verbosity = verbosity
        self.discardOriginal_col = discardOriginal_col

    def fit(self, X, y=None):
        """Do nothing and return ``self``; all work happens in ``transform``."""
        return self

    def transform(self, X):
        """Add ``<colnames>_Kfold_Target_Enc`` to ``X`` and return the frame.

        NOTE: ``X`` is modified in place (the encoded column is added to the
        caller's frame). When ``discardOriginal_col`` is true the returned
        frame is a new object without the original categorical column, while
        the caller's ``X`` still holds it.
        """
        mean_of_target = X[self.targetName].mean()
        # Fixed seed so that repeated runs produce the same fold assignment.
        kf = KFold(n_splits=self.n_fold, shuffle=True, random_state=2019)
        if self.verbosity:
            print(kf)

        col_mean_name = self.colnames + '_' + 'Kfold_Target_Enc'
        X[col_mean_name] = np.nan

        for tr_ind, val_ind in kf.split(X):
            X_tr, X_val = X.iloc[tr_ind], X.iloc[val_ind]
            # Per-category target mean learned on the training folds only,
            # then looked up for the rows of the held-out fold.
            fold_means = X_tr.groupby(self.colnames)[self.targetName].mean()
            X.loc[X.index[val_ind], col_mean_name] = (
                X_val[self.colnames].map(fold_means)
            )

        # Categories unseen in the training folds map to NaN; fall back to the
        # global target mean. Assign the result back instead of using
        # ``inplace=True`` on the column selection, which is chained
        # assignment and does not update the frame under pandas Copy-on-Write.
        X[col_mean_name] = X[col_mean_name].fillna(mean_of_target)

        if self.verbosity:
            encoded_feature = X[col_mean_name].values
            print('Correlation between the new feature, {} and, {} is {}.'.format(
                col_mean_name, self.targetName,
                np.corrcoef(X[self.targetName].values, encoded_feature)[0][1]))

        if self.discardOriginal_col:
            # Drop the original categorical column (not the target): the flag
            # asks to discard the column that has just been encoded.
            X = X.drop(self.colnames, axis=1)
        return X
This is the input format the model expects:
# One fully pre-processed observation, laid out the way the model expects it:
# numeric features, the K-fold target-encoded columns, the one-hot encoded
# transaction category, and the date parts. Wrapped in a list so that pandas
# builds a single-row DataFrame.
sample3 = pd.DataFrame([{
    'amt': 116.46,
    'lat': 36.0557,
    'long': -96.0602,
    'city_pop': 413574.0,
    'hour': 3.0,
    'age_at_transaction': 49.0,
    'merchant_Kfold_Target_Enc': 0.001883,
    'city_Kfold_Target_Enc': 0.0,
    'state_Kfold_Target_Enc': 0.004525,
    'job_Kfold_Target_Enc': 0.0,
    'category_entertainment': 0.0,
    'category_food_dining': 0.0,
    'category_gas_transport': 0.0,
    'category_grocery_net': 0.0,
    'category_grocery_pos': 0.0,
    'category_health_fitness': 0.0,
    'category_home': 0.0,
    'category_kids_pets': 0.0,
    'category_misc_net': 0.0,
    'category_misc_pos': 1.0,
    'category_personal_care': 0.0,
    'category_shopping_net': 0.0,
    'category_shopping_pos': 0.0,
    'category_travel': 0.0,
    'year': 2020.0,
    'month': 12.0,
    'day': 21.0,
}])
And this is how I receive the inputs from the user:
# One raw observation as it arrives from the user: the transaction category is
# already one-hot encoded, but city, job, merchant and state are still plain
# strings and the transaction date is a single "YYYY-MM-DD HH" string.
sample2 = {}
sample2['age_at_transaction'] = 25
sample2['amt'] = 2500
sample2['category_entertainment'] = 0
sample2['category_food_dining'] = 0
sample2['category_gas_transport'] = 0
sample2['category_grocery_net'] = 0
sample2['category_grocery_pos'] = 0
sample2['category_health_fitness'] = 0
sample2['category_home'] = 0
sample2['category_kids_pets'] = 0
sample2['category_misc_net'] = 0
sample2['category_misc_pos'] = 0
sample2['category_personal_care'] = 0
sample2['category_shopping_net'] = 0
sample2['category_shopping_pos'] = 1
sample2['category_travel'] = 0
sample2['city'] = "San Jose"
sample2['city_pop'] = 320000
sample2['trans_date'] = "2018-12-12 12"
sample2['job'] = "Engineering"
sample2['merchant'] = "Kirlin and Sons"
sample2['state'] = "CA"
sample2['lat'] = 36.055700
sample2['long'] = -96.060200
# Wrap the dict in a list so pandas builds a single-row DataFrame.
sample2 = pd.DataFrame([sample2])