1

Current Situation:

I have a function that splits the data by a binary target variable into "1's" and "0's" and then reads all the independent variables for each class. For every independent variable it estimates one KDE per class ("1" and "0") and then calculates the area of intersection of the two KDEs:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde

def intersection_area(data, bandwidth, margin, target_variable_name):
    """Print and return the per-feature KDE overlap for a binary (0/1) target.

    For every independent variable a Gaussian KDE is fitted separately to the
    rows whose target equals 0 and to the rows whose target equals 1.  The
    pointwise minimum of the two densities is then integrated with the
    trapezoidal rule over a 500-point grid.

    Parameters
    ----------
    data : pandas.DataFrame
        Independent variables plus the target column.  Rows containing NaN
        are dropped before anything else happens.
    bandwidth : str, scalar, callable or None
        Passed unchanged to ``gaussian_kde`` as ``bw_method``.
    margin : float
        Fraction of the grid width added on both sides of the grid, because
        a KDE extends beyond the observed data.
    target_variable_name : hashable
        Column label of the response variable.

    Returns
    -------
    dict
        Maps each feature column label to its intersection area.  Each area
        is also printed, as before.

    Raises
    ------
    KeyError
        If ``target_variable_name`` is not a column of ``data``.
    """
    data = data.dropna()
    is_class0 = data[target_variable_name] == 0
    is_class1 = data[target_variable_name] == 1

    # Every column except the target is an independent variable.  (Slicing
    # off the last name here would silently skip a real feature.)
    feature_names = [name for name in data.columns if name != target_variable_name]

    # np.trapz was renamed to np.trapezoid in NumPy 2.0; support both.
    integrate = np.trapezoid if hasattr(np, "trapezoid") else np.trapz

    areas = {}
    for column_name in feature_names:
        x0 = data.loc[is_class0, column_name]
        x1 = data.loc[is_class1, column_name]

        kde0 = gaussian_kde(x0, bw_method=bandwidth)
        kde1 = gaussian_kde(x1, bw_method=bandwidth)

        x_min = min(x0.min(), x1.min())  # lowest of the two minima
        # Lowest of the two maxima (kept as the author documented it).
        # NOTE(review): this relies on `margin` being wide enough to cover the
        # kernel tail of the class with the smaller maximum.
        x_max = min(x0.max(), x1.max())
        dx = margin * (x_max - x_min)  # widen the grid on both sides
        x = np.linspace(x_min - dx, x_max + dx, 500)

        inters_x = np.minimum(kde0(x), kde1(x))
        area_inters_x = integrate(inters_x, x)  # intersection of the two KDEs
        print(area_inters_x)
        areas[column_name] = area_inters_x
    return areas

Problem: if I have n_classes = 4, the function will look like this:

def intersection_area(data, bandwidth, margin, target_variable_name):
    """Print and return the per-feature KDE overlap for a four-class target.

    This variant is hard-wired to the class labels 0, 1, 2 and 3.  For every
    independent variable one Gaussian KDE is fitted per class, and the
    pointwise minimum of the four densities is integrated with the
    trapezoidal rule over a 500-point grid.

    Parameters
    ----------
    data : pandas.DataFrame
        Independent variables plus the target column.  Rows containing NaN
        are dropped first.
    bandwidth : str, scalar, callable or None
        Passed unchanged to ``gaussian_kde`` as ``bw_method``.
    margin : float
        Fraction of the grid width added on both sides of the grid, because
        a KDE extends beyond the observed data.
    target_variable_name : hashable
        Column label of the response variable.

    Returns
    -------
    dict
        Maps each feature column label to its intersection area.  Each area
        is also printed.

    Raises
    ------
    KeyError
        If ``target_variable_name`` is not a column of ``data``.
    """
    data = data.dropna()
    class_labels = (0, 1, 2, 3)  # this variant only handles these four classes
    target = data[target_variable_name]

    # Every column except the target is an independent variable.
    feature_names = [name for name in data.columns if name != target_variable_name]

    # np.trapz was renamed to np.trapezoid in NumPy 2.0; support both.
    integrate = np.trapezoid if hasattr(np, "trapezoid") else np.trapz

    areas = {}
    for column_name in feature_names:
        samples = [data.loc[target == label, column_name] for label in class_labels]
        kdes = [gaussian_kde(sample, bw_method=bandwidth) for sample in samples]

        x_min = min(sample.min() for sample in samples)  # lowest class minimum
        # Lowest class maximum, mirroring the binary version of this function.
        x_max = min(sample.max() for sample in samples)

        dx = margin * (x_max - x_min)  # widen the grid on both sides
        x = np.linspace(x_min - dx, x_max + dx, 500)

        # np.minimum is a binary ufunc (a third positional argument is `out`),
        # so the four evaluated densities are folded with ufunc.reduce.
        inters_x = np.minimum.reduce([kde(x) for kde in kdes])
        area_inters_x = integrate(inters_x, x)
        print(area_inters_x)
        areas[column_name] = area_inters_x
    return areas

Now what if I have an unknown dataset with n classes? I am trying to improve my old code so that it is robust to multiclass datasets: it should determine the KDE of each independent variable given the class and calculate the area of the intersection. However, I am stuck on the x = data.loc[data[str(target_name)] == i,str(column_name)] part:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde

def intersection_area(data, bandwidth, margin, target_variable_name):
    """Select, for every class and feature, the feature values of that class.

    Work-in-progress multi-class variant: it only performs the per-class
    selection step.  ``bandwidth`` and ``margin`` are accepted for signature
    compatibility but are not used yet.

    Parameters
    ----------
    data : pandas.DataFrame
        Independent variables plus the target column.  Rows containing NaN
        are dropped first.
    bandwidth : object
        Currently unused.
    margin : object
        Currently unused.
    target_variable_name : hashable
        Column label of the response variable.

    Returns
    -------
    dict
        Maps ``(class_label, column_label)`` to the pandas Series holding
        that column's values for the rows of that class.

    Raises
    ------
    KeyError
        If ``target_variable_name`` is not a column of ``data``.
    """
    data = data.dropna()
    target = data[target_variable_name]

    # Collect the names of the independent variables: every column but the target.
    feature_names = [name for name in data.columns if name != target_variable_name]

    # Unique classes of the target, in order of first appearance.
    classes = list(target.unique())

    samples = {}
    # For each unique class, run through the different independent variables.
    for i in classes:
        for column_name in feature_names:
            print(i)  # the class (target value: 0, 1, ..., n)
            print(column_name)  # the variable about to be analysed
            # Index with the column *label* held in target_variable_name;
            # attribute access would look for a column literally named
            # "target_variable_name".
            samples[(i, column_name)] = data.loc[target == i, column_name]
    return samples

Simulated datasets for anyone interested in replicating the problem:

from sklearn.datasets import make_classification
# note: to create a binary target, set n_classes=2 (and shorten `weights` to one or two entries to match)

# Simulate a six-feature, four-class classification problem (fixed seed).
X, y = make_classification(
    n_samples=50000,
    n_features=6,
    n_informative=6,
    n_redundant=0,
    n_repeated=0,
    n_classes=4,
    n_clusters_per_class=3,
    weights=[0.7, 0.2, 0.1],
    flip_y=0.2,
    class_sep=0.95,
    shuffle=True,
    random_state=93,
)

# One column per simulated feature, named var1 ... var6.
dataset_x = pd.DataFrame(
    {'var{}'.format(idx + 1): X[:, idx] for idx in range(6)}
)

# The class labels go into their own single-column frame.
dataset_y = pd.DataFrame({'target': y})

# Features and target side by side, target as the last column.
sample_dataset = pd.concat([dataset_x, dataset_y], axis=1)
print(sample_dataset)
Gerard
  • 518
  • 4
  • 19
  • 1
    Can you provide a sample of data frame for [reproducible example](https://stackoverflow.com/q/20109391/1422451)? – Parfait Aug 30 '20 at 15:05
  • @Parfait, provided a simulated sample, thanks for the help. – Gerard Aug 31 '20 at 11:21
  • Is your *class* column alluded to in second, attempted code block the same as `target_variable_name`? – Parfait Aug 31 '20 at 14:36
  • Yes, target_variable_name and class column are the same. thanks for the help. – Gerard Aug 31 '20 at 14:44
  • Hmmm...So target will have more than 0 and 1 values which you use to split each column series for KDE comparison per your older code? And so you want all possible pairwise combinations? – Parfait Aug 31 '20 at 14:47
  • You made me realize that as the number of classes increases it's no longer a pairwise combination, since the number of KDEs increases with the number of classes, such that: inters_x = np.minimum(kde0_x, kde1_x,....,kden_x). Thanks for raising this. – Gerard Sep 01 '20 at 05:43
  • Understood better now. I revamped my answer to build the x's and kde's using lists. I came to find out that `np.minimum` only supports two arrays at a time, but there are [other methods](https://stackoverflow.com/q/39277638/1422451) for the minimum of multiple arrays. – Parfait Sep 01 '20 at 14:18

1 Answer

1

Consider building lists of x's and kde's with list comprehensions, one entry per class of the target. And instead of printing out the result in each iteration, bind the results into a data frame:

def intersection_area_new(data, bandwidth, margin, target_variable_name):
    """Return the multi-class KDE intersection area of every feature.

    For each independent variable one Gaussian KDE is fitted per class of
    the target.  The pointwise minimum over all class densities is then
    integrated with the trapezoidal rule over a 500-point grid.

    Parameters
    ----------
    data : pandas.DataFrame
        Independent variables plus the target column, in any column order.
        Rows containing NaN are dropped first.
    bandwidth : str, scalar, callable or None
        Passed unchanged to ``gaussian_kde`` as ``bw_method``.
    margin : float
        Fraction of the grid width added on both sides of the grid, because
        a KDE extends beyond the observed data.
    target_variable_name : hashable
        Column label of the response variable.

    Returns
    -------
    pandas.DataFrame
        One row per feature with the columns ``target`` (the target column
        label), ``column`` (the feature label) and ``intersection`` (the
        area).

    Raises
    ------
    KeyError
        If ``target_variable_name`` is not a column of ``data``.
    """
    data = data.dropna()
    target = data[target_variable_name]

    # Unique classes of the target variable actually passed in (not a
    # hard-coded 'target' column).
    classes = target.unique()

    # Every column except the target is a feature, wherever the target sits.
    feature_names = [name for name in data.columns if name != target_variable_name]

    # np.trapz was renamed to np.trapezoid in NumPy 2.0; support both.
    integrate = np.trapezoid if hasattr(np, "trapezoid") else np.trapz

    kde_dicts = []
    for column_name in feature_names:
        # Build the per-class samples and their KDEs.
        x_s = [data.loc[target == i, column_name] for i in classes]
        kde_s = [gaussian_kde(x, bw_method=bandwidth) for x in x_s]

        x_min = min(x.min() for x in x_s)  # lowest of the class minima
        # Lowest of the class maxima (kept as documented by the author).
        # NOTE(review): this relies on `margin` being wide enough to cover the
        # kernel tail of the class with the smallest maximum.
        x_max = min(x.max() for x in x_s)

        dx = margin * (x_max - x_min)  # widen the grid on both sides
        x_array = np.linspace(x_min - dx, x_max + dx, 500)

        # Stack the evaluated densities and take the minimum across classes.
        inters_x = np.array([kde(x_array) for kde in kde_s]).min(axis=0)
        area_inters_x = integrate(inters_x, x_array)  # intersection of the KDEs

        kde_dicts.append({'target': target_variable_name,
                          'column': column_name,
                          'intersection': area_inters_x})

    # Explicit columns keep the schema stable even when there are no features.
    return pd.DataFrame(kde_dicts, columns=['target', 'column', 'intersection'])

Output

# Score every feature of the simulated data: default bandwidth, 50% grid margin.
output = intersection_area_new(
    data=sample_dataset,
    bandwidth=None,
    margin=0.5,
    target_variable_name="target",
)
print(output.head(n=10))

#    target column  intersection
# 0  target   var1      0.842256
# 1  target   var2      0.757190
# 2  target   var3      0.676021
# 3  target   var4      0.873074
# 4  target   var5      0.763626
# 5  target   var6      0.868560
Parfait
  • 104,375
  • 17
  • 94
  • 125