I tried to normalize the data by applying a Gaussian function twice, once to the positive and once to the negative values of each parameter in this dataset. The dataset also contains missing data. The problem is that I want to highlight outliers in a scatter plot using `cmap='coolwarm'`
for parameters A, B and specifically T, so that:
- outliers outside of that interval are marked with `(x)` or `(*)`, using `cmap='coolwarm'`;
- a colorbar (`cbar`) is available on the right side of the graph.
- My aim is to highlight the outliers in an elegant way before cleaning the data, and then to compare the raw and the processed data with before & after graphs, arranged as subplots on one page.
- Is it possible to highlight outliers with `from sklearn.neighbors import LocalOutlierFactor`, or by defining `vmin` and `vmax` (inspired by this answer)? Or should I flag the outliers before highlighting them via Boolean masking (for the sake of learning), or define a function to detect them? The code I used to color the outliers is as follows:
def normalize(value, min_value, max_value, min_norm, max_norm):
    """Linearly rescale *value* from one range onto another.

    Args:
        value: The number (or array-like supporting arithmetic) to rescale.
        min_value: Lower end of the source range.
        max_value: Upper end of the source range.
        min_norm: Lower end of the target range.
        max_norm: Upper end of the target range.

    Returns:
        The position of ``value`` within ``[min_value, max_value]`` mapped
        onto ``[min_norm, max_norm]``.
    """
    target_span = max_norm - min_norm
    source_span = max_value - min_value
    # Fraction of the way through the source range (0 at min, 1 at max).
    fraction = (value - min_value) / source_span
    return target_span * fraction + min_norm
def outlier_fix(data, _min, _max):
    """Clamp every element of *data* into ``[_min, _max]`` in place.

    Values above ``_max`` are replaced by ``_max`` and values below ``_min``
    are replaced by ``_min``. NaN entries compare false against both bounds
    and are therefore left unchanged.

    Args:
        data: A pandas Series (or NumPy array) that supports boolean-mask
            assignment. It is modified in place.
        _min: Lower bound.
        _max: Upper bound.

    Returns:
        The same ``data`` object, after clamping.
    """
    # Vectorised masks replace the per-element Python loop. The upper bound
    # is applied first and the lower bound second, same order as before.
    data[data > _max] = _max
    data[data < _min] = _min
    return data
def createpositiveandnegativelist(listtocreate):
    """Split an iterable of numbers into non-positive and positive values.

    Zero is placed in the first (negative) list. Values that are neither
    ``<= 0`` nor ``> 0`` (such as NaN) end up in neither list.

    Args:
        listtocreate: Iterable of numeric values.

    Returns:
        A tuple ``(negatives, positives)`` of two lists, each preserving the
        input order.
    """
    negatives, positives = [], []
    for number in listtocreate:
        if number > 0:
            positives.append(number)
        elif number <= 0:
            negatives.append(number)
    return negatives, positives
def calculatemean(listtocalculate):
    """Return the arithmetic mean of a sized collection of numbers.

    Args:
        listtocalculate: A collection supporting ``sum`` and ``len``.

    Returns:
        The sum of the elements divided by their count.

    Raises:
        ZeroDivisionError: If the collection is empty.
    """
    total = sum(listtocalculate)
    count = len(listtocalculate)
    return total / count
def plotboundedCI(s, mu, sigma, lists, confidence=0.68, bins=30):
    """Plot a histogram with a normal PDF and its confidence bounds, then filter.

    A new figure is created showing the density histogram of ``s``, the
    normal probability density for ``mu``/``sigma`` evaluated at the bin
    edges, and two vertical marker lines at the confidence-interval bounds.
    The figure is not shown here; the caller decides when to call
    ``plt.show()``.

    Args:
        s: Sample values for the histogram.
        mu: Mean of the normal distribution.
        sigma: Standard deviation of the normal distribution.
        lists: Iterable of values to filter against the interval.
        confidence: Probability mass of the central interval (default 0.68).
        bins: Number of histogram bins (default 30).

    Returns:
        A list of the values from ``lists`` that lie strictly inside the
        confidence interval.
    """
    plt.figure()

    # Only the bin edges are needed; counts and patches are discarded.
    _, bin_edges, _ = plt.hist(s, bins, density=True)
    pdf = (
        1 / (sigma * np.sqrt(2 * np.pi))
        * np.exp(-(bin_edges - mu) ** 2 / (2 * sigma ** 2))
    )
    plt.plot(bin_edges, pdf, linewidth=2, color='r')

    # Central interval of the normal distribution holding `confidence` mass.
    lower, upper = scipy.stats.norm.interval(confidence, loc=mu, scale=sigma)

    plt.title(
        f"Gaussian {confidence:.0%} Confidence Interval",
        fontsize=12, color='black', loc='left', style='italic',
    )
    # Vertical lines at the lower and upper bounds (fixed height 0..3).
    plt.plot([lower, lower], [0, 3], [upper, upper], [0, 3], marker='o')

    return [value for value in lists if lower < value < upper]
# Work on a copy of df and treat +inf / -inf entries as missing values.
df_orig = df.copy()
infinite_mask = (df_orig == np.inf) | (df_orig == -np.inf)
df_orig[infinite_mask] = np.nan
def miss_contain_cycles(data, cycle_length=480):
    """Return the indices of cycles that contain missing or infinite values.

    ``data`` is split into consecutive chunks of ``cycle_length`` samples.
    A chunk is reported when any of its entries is NaN, +inf or -inf.

    The chunk count uses true division before ``math.ceil`` so that a
    trailing partial cycle is inspected too; flooring first would silently
    skip it.

    Args:
        data: 1-D numeric sequence (NumPy array or pandas Series) that
            supports ``.shape`` and slicing.
        cycle_length: Number of samples per cycle (default 480).

    Returns:
        A list of zero-based cycle indices with at least one non-finite value.
    """
    n_cycles = math.ceil(data.shape[0] / cycle_length)
    miss_cycles = []
    for i in range(n_cycles):
        temp = data[i * cycle_length:(i + 1) * cycle_length]
        # isfinite is False for NaN, +inf and -inf alike.
        if not np.all(np.isfinite(temp)):
            miss_cycles.append(i)
    return miss_cycles
def missing_stats(data):
    """Summarise the missing and infinite entries of *data*.

    Args:
        data: Numeric array-like accepted by ``np.isnan`` and by
            ``miss_contain_cycles``.

    Returns:
        A tuple ``(n_pos_inf, n_neg_inf, n_nan, cycles)``: the counts of
        +inf, -inf and NaN entries, followed by the list returned by
        ``miss_contain_cycles(data)``.
    """
    cycles_with_gaps = miss_contain_cycles(data)
    n_nan = np.sum(np.isnan(data))
    n_neg_inf = np.sum(data == -np.inf)
    n_pos_inf = np.sum(data == np.inf)
    return n_pos_inf, n_neg_inf, n_nan, cycles_with_gaps
# Load the raw measurements (the CSV has no header row).
dft = pd.read_csv('me_300_SOF.csv', header=None)
# NOTE(review): the file is read into `dft` but everything below uses
# `df_plot` -- confirm these are meant to be the same frame.
df_plot.columns = ['A', 'B', 'T', 'S', 'C', 'Cycle']

# One scatter panel per parameter, each coloured by its own value.
plot_columns = ('A', 'B', 'T')
fig, ax = plt.subplots(nrows=len(plot_columns), ncols=1, figsize=(20, 10), squeeze=False)
for row, column in enumerate(plot_columns):
    axis = ax[row, 0]
    df_plot.plot.scatter(ax=axis, alpha=0.8, x='Cycle', y=column, colormap='coolwarm', c=column)
    # Title is derived from the plotted column so it cannot drift from the
    # data (the T panel used to be labelled 'C Vs Cycle').
    axis.set_title(f'{column} Vs Cycle', fontweight='bold', fontsize=14)
    axis.set_ylabel(column)

plt.suptitle('Exploratory Data Analysis (EDA) ', color='yellow', backgroundcolor='black', fontsize=15, fontweight='bold')
plt.subplots_adjust(top=0.9, bottom=0.07, left=0.06, right=0.96, hspace=0.4, wspace=0.2)
plt.show()
Any help would be greatly appreciated!