Hi there, I have read some of the threads here, and I am still confused.
I thought the values returned by the scipy.stats continuous random variable function, stats.rv_name.pdf(x, loc, scale, *params), should sum to 1.
I basically fitted a distribution to scatter-plot data using the code below. I do get a cumulative value of 1.0 (eventually), but my pdf_fitted does not sum to one.
I still don't understand why that is, and how I can get the arguments in the pdf output such that it can sum to one.
There is a relevant thread here: "Why does scipy.norm.pdf sometimes give PDF > 1? How to correct it?"
def py_DistEstimate(arr1, disType, reSults='params', bins=20):
    """Fit a ``scipy.stats`` continuous distribution to a 1-D sample.

    Parameters
    ----------
    arr1 : sequence of float
        Observed sample the distribution is fitted to.
    disType : str
        Attribute name of the distribution in ``scipy.stats``
        (for example ``'gamma'`` or ``'norm'``). Ignored when
        ``reSults == 'listparams'``.
    reSults : {'params', 'pdf', 'listparams'}
        Selects what is returned (see Returns).
    bins : int
        Number of evenly spaced grid points between ``min(arr1)`` and
        ``max(arr1)`` used by the ``'pdf'`` mode.

    Returns
    -------
    pandas.DataFrame or list of str
        * ``'params'``: DataFrame with columns ``names`` / ``values`` holding
          the fitted shape parameters followed by ``loc`` and ``scale``.
        * ``'pdf'``: DataFrame with columns ``x-axis`` (the grid), ``pdf`` and
          ``cdf`` (fitted distribution evaluated on the grid) and
          ``original`` (fraction of the sample that ``np.digitize`` with
          ``right=True`` assigns to each grid index).
        * ``'listparams'``: names of every ``rv_continuous`` instance exposed
          by ``scipy.stats``.

    Raises
    ------
    ValueError
        If ``reSults`` is not one of the supported modes.

    Notes
    -----
    The ``pdf`` column is a probability *density* evaluated at the grid
    points, not a per-bin probability, so its values need not sum to one.
    The ``cdf`` column is the quantity that approaches one.
    """
    # Listing the available distributions needs neither the data nor a fit.
    if reSults == 'listparams':
        return [
            name for name in dir(stats)
            if isinstance(getattr(stats, name), stats.rv_continuous)
        ]

    # Reject unknown modes before paying for the fit instead of silently
    # falling off the end of the function and returning None.
    if reSults not in ('params', 'pdf'):
        raise ValueError(
            "reSults must be 'params', 'pdf' or 'listparams', got %r" % (reSults,)
        )

    dist = getattr(stats, disType)
    param = dist.fit(arr1)
    # fit() returns (shape_1, ..., shape_k, loc, scale).
    *shape_args, loc, scale = param

    if reSults == 'params':
        # ``shapes`` is a comma-separated string of shape-parameter names, or
        # None when the distribution has no shape parameters.
        shape_names = (
            [s.strip() for s in dist.shapes.split(',')] if dist.shapes else []
        )
        return pd.DataFrame(
            {'names': shape_names + ['loc', 'scale'], 'values': param}
        )

    # reSults == 'pdf'
    x = np.linspace(min(arr1), max(arr1), bins)
    pdf_fitted = dist.pdf(x, *shape_args, loc=loc, scale=scale)
    cdf_fitted = dist.cdf(x, *shape_args, loc=loc, scale=scale)

    # With right=True a sample v gets index i when x[i-1] < v <= x[i], so
    # index 0 only collects samples equal to the minimum.
    digitizeV = np.digitize(arr1, x, right=True)
    # minlength keeps one count per grid point even if the highest indices
    # happen to be empty.
    bin_counV = np.bincount(digitizeV, minlength=len(x))
    bin_probV = bin_counV / len(arr1)
    return pd.DataFrame(
        {
            'x-axis': x,
            'pdf': pdf_fitted,
            'original': bin_probV,
            'cdf': cdf_fitted,
        }
    )