Current Situation:
I have a function that separates a binary class target variable into "1's" and "0's", it then reads all the independent variable for each. The function also determines the KDE of each of these independent variables based on the class: "1" and "0", then calculates the area of intersection:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
def intersection_area(data, bandwidth, margin,target_variable_name):
#target_variable_name is the column name of the response variable
data = data.dropna()
X = data.drop(columns = [str(target_variable_name)], axis = 1)
names = list(X.columns)
new_columns = []
for column_name in names[:-1]:
x0= data.loc[data[str(target_variable_name)] == 0,str(column_name)]
x1= data.loc[data[str(target_variable_name)] == 1,str(column_name)]
kde0 = gaussian_kde(x0, bw_method=bandwidth)
kde1 = gaussian_kde(x1, bw_method=bandwidth)
x_min = min(x0.min(), x1.min()) #find the lowest value between two minimum points
x_max = min(x0.max(), x1.max()) #finds the lowest value between two maximum points
dx = margin * (x_max - x_min) # add a margin since the kde is wider than the data
x_min -= dx
x_max += dx
x = np.linspace(x_min, x_max, 500)
kde0_x = kde0(x)
kde1_x = kde1(x)
inters_x = np.minimum(kde0_x, kde1_x)
area_inters_x = np.trapz(inters_x, x) #intersection of two kde
print(area_inters_x)
Problem: if I have n_class = 4 the function will look like:
def intersection_area(data, bandwidth, margin,target_variable_name):
#target_variable_name is the column name of the response variable
data = data.dropna()
X = data.drop(columns = [str(target_variable_name)], axis = 1)
names = list(X.columns)
new_columns = []
for column_name in names[:-1]:
x0= data.loc[data[str(target_variable_name)] == 0,str(column_name)]
x1= data.loc[data[str(target_variable_name)] == 1,str(column_name)]
x2= data.loc[data[str(target_variable_name)] == 2,str(column_name)]
x3= data.loc[data[str(target_variable_name)] == 3,str(column_name)]
kde0 = gaussian_kde(x0, bw_method=bandwidth)
kde1 = gaussian_kde(x1, bw_method=bandwidth)
kde2 = gaussian_kde(x2, bw_method=bandwidth)
kde3 = gaussian_kde(x3, bw_method=bandwidth)
x_min = min(x0.min(), x1.min(),x2.min(),x3.min())
x_max = min(x0.max(), x1.max(),x2.min(),x3.min())
dx = margin * (x_max - x_min)
x_min -= dx
x_max += dx
x = np.linspace(x_min, x_max, 500)
kde0_x = kde0(x)
kde1_x = kde1(x)
kde2_x = kde1(x)
kde3_x = kde1(x)
inters_x = np.minimum(kde0_x, kde1_x, kde2_x, kde3_x)
area_inters_x = np.trapz(inters_x, x)
print(area_inters_x)
Now what if I have an unknown dataset with n number of classes? I am trying to improve my old code so that it becomes robust to multiclass datasets, determine the KDE of an indepdendent variable given the class and calculates the intersection of the area. However I am stuck on x = data.loc[data[str(target_name)] == i,str(column_name)]
part:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
def intersection_area(data, bandwidth, margin,target_variable_name):
# Collect the names of the independent variables
data = data.dropna()
X = data.drop(columns = [str(target_variable_name)], axis = 1)
names = list(X.columns)
# determine the number of unique classes from a multi-class and save them as a list.
classes = []
for unique_class in data.target_variable_name.unique():
classes.append(unique_class)
new_columns = []
# for each unique class, run through the different independent variables
for i in classes:
for column_name in names[:-1]:
print(i) #to show the class (target variable: 0,1,...,n)
print(column_name) #to show the variable name to be analyzed
'''This is the part where I got stuck'''
x = data.loc[data[str(target_name)] == i,str(column_name)]
Simulated datasets for anyone interested in replicating the problem:
from sklearn.datasets import make_classification
#note: to create a binary class target change n_class = 2
X,y = make_classification(n_samples=50000, n_features=6,n_informative=6, n_redundant=0, n_repeated=0, n_classes=4
,n_clusters_per_class=3,class_sep=0.95,flip_y=0.2,weights=[0.7,0.2,0.1], shuffle=True,random_state=93)
dataset_x = pd.DataFrame({'var1': X[:, 0], 'var2': X[:, 1],'var3': X[:, 2], 'var4': X[:, 3]
,'var5': X[:, 4], 'var6': X[:, 5]})
dataset_y = pd.DataFrame({'target': y})
sample_dataset = pd.concat([dataset_x,dataset_y], axis=1)
print(sample_dataset)