I am trying to fit the data using distribution fitting in python.
Problem: The probability Density Function (PDF) line on histogram is not complete as shown in the image. Is there a way to get the PDF line for all data?. I'm not sure if I am making mistake in setting the correct axis or scale. For instance 'density=True'
argument correctly or array bin_centers
for my x-axis?
I tried to fix this using existing answer, but I couldnt able to solve the problem.
Test data: Available here
Script I am using:
import pandas as pd
import numpy as np
import scipy
import scipy.stats
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn import datasets
from numpy import percentile
#%% Test dataset
y=pd.read_csv('Clean.csv',squeeze=True,na_filter=True,header=None,index_col=None)
x = np.arange(len(y))
size = len(y)
df=pd.DataFrame(data=y)
fig = plt.figure(figsize=(9, 3))
plt.hist(y)
plt.show()
# Create an index array (x) for data
#%%
y_std=round(y)
import warnings
warnings.filterwarnings("ignore")
dist_names = ['beta',
'expon',
'exponnorm',
'lognorm',
'pearson3'
]
# Set up empty lists to stroe results
chi_square = []
p_values = []
KS = []
# Set up 50 bins for chi-square test
# Observed data will be approximately evenly distrubuted aross all bins
percentile_bins = np.linspace(0,80,31)
percentile_cutoffs = np.percentile(y_std, percentile_bins)
observed_frequency, bins = (np.histogram(y_std, bins=percentile_cutoffs))
cum_observed_frequency = np.cumsum(observed_frequency)
# Loop through candidate distributions
for distribution in dist_names:
s1 = time()
# Set up distribution and get fitted distribution parameters
dist = getattr(scipy.stats, distribution)
param = dist.fit(y_std)
p = scipy.stats.kstest(y_std, distribution, args=param)[1]
p = np.around(p, 5)
p_values.append(p)
ks = scipy.stats.kstest(y_std, distribution, args=param)
ks = np.around(ks, 5)
KS.append((distribution,ks[0],ks[1]))
# Get expected counts in percentile bins
# This is based on a 'cumulative distrubution function' (cdf)
cdf_fitted = dist.cdf(percentile_cutoffs, *param[:-2], loc=param[-2],
scale=param[-1])
expected_frequency = []
for bin in range(len(percentile_bins)-1):
expected_cdf_area = cdf_fitted[bin+1] - cdf_fitted[bin]
expected_frequency.append(expected_cdf_area)
# calculate chi-squared
expected_frequency = np.array(expected_frequency) * size
cum_expected_frequency = np.cumsum(expected_frequency)
ss = sum (((cum_expected_frequency - cum_observed_frequency) ** 2) / cum_observed_frequency)
chi_square.append(ss)
print(f"chi_square {distribution} time: {time() - s1}")
# Collate results and sort by goodness of fit (best at top)
results = pd.DataFrame()
results['Distribution'] = dist_names
results['chi_square'] = chi_square
results['p_value'] = p_values
results['KS_Test'] = KS
results.sort_values(['chi_square'], inplace=True)
# Report results
print ('\nDistributions sorted by goodness of fit:')
print ('----------------------------------------------------------------------------- ')
print (results)
#%%
# Divide the observed data into 100 bins for plotting (this can be changed)
number_of_bins = 20
bin_cutoffs = np.linspace(np.percentile(y,0), np.percentile(y,99),number_of_bins)
# Create the plot
fig = plt.figure(figsize=(7, 5))
h = plt.hist(y, bins = bin_cutoffs, color='0.75')
# Get the top three distributions from the previous phase
number_distributions_to_plot = 5
dist_names = results['Distribution'].iloc[0:number_distributions_to_plot]
# Create an empty list to stroe fitted distribution parameters
parameters = []
# Loop through the distributions ot get line fit and paraemters
for dist_name in dist_names:
# Set up distribution and store distribution paraemters
dist = getattr(scipy.stats, dist_name)
param = dist.fit(y)
# Separate parts of parameters
parameters.append(param)
# Get line for each distribution (and scale to match observed data)
pdf_fitted = dist.pdf(x, *param[:-2], loc=param[-2], scale=param[-1])
scale_pdf = np.trapz (h[0], h[1][:-1]) / np.trapz (pdf_fitted, x)
pdf_fitted *= scale_pdf
# Add the line to the plot
plt.plot(pdf_fitted, label=dist_name)
# Set the plot x axis to contain 99% of the data
# This can be removed, but sometimes outlier data makes the plot less clear
plt.xlim(0,np.percentile(y,90))
# Add legend and display plot
plt.legend()
plt.show()